Mercurial > repos > ulfschaefer > data_manager_phemost
changeset 0:25d4d9f313a0 draft default tip
Uploaded
author | ulfschaefer |
---|---|
date | Wed, 13 Jul 2016 05:50:48 -0400 |
parents | |
children | |
files | data_manager/fetch_mlst_data.py data_manager/fetch_mlst_data.xml data_manager_conf.xml tool-data/mlst_data.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 6 files changed, 490 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/fetch_mlst_data.py Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,293 @@ +#!/usr/bin/env python + +''' +Download MLST datasets from this site: http://pubmlst.org/data/ by +parsing an xml file (http://pubmlst.org/data/dbases.xml). + +Data is downloaded for a species determined by the user: +- profiles (maps STs to allele numbers) +- numbered sequences for each locus in the scheme + +In addition, the alleles are concatenated together for use with SRST2. + +A log file is also generated in the working directory, detailing the +time, date and location of all files downloaded, as well as the <retrieved> +tag which tells us when the XML entry was last updated. + +If the species name input by the user matches multiple <species> in the +xml file, the script simply reports the possible matches so the user can +try again. +''' + +""" +- Remove empty line at the end of profiles.txt file. +- Ensure the allele names at the profiles.txt file don't contain "_". + +""" +from argparse import ArgumentParser +import xml.dom.minidom as xml +import urllib2 as url +import re +import os +import sys +import glob +import csv +import shutil +from urlparse import urlparse +import time +import subprocess +from json import dumps +from json import loads + +# -------------------------------------------------------------------------------------------------- + +def parse_args(): + parser = ArgumentParser(description='Download MLST datasets by species' + 'from pubmlst.org.') + + parser.add_argument('--repository_url', + metavar = 'URL', + default = 'http://pubmlst.org/data/dbases.xml', + help = 'URL for MLST repository XML index') + + parser.add_argument('--species', + metavar = 'NAME', + required = True, + help = 'The name of the species that you want to download (e.g. 
"Escherichia coli")') + + parser.add_argument('--outfile', + metavar = 'FILE', + required = True, + help = 'The name of the Json file to write that galaxy stuff to.') + + parser.add_argument('--reference', + metavar = 'ACCESSION', + required = True, + help = 'NCBI accession number of the reference genome to use for flanking regions.') + + return parser.parse_args() + +# -------------------------------------------------------------------------------------------------- + +def main(): + + """ + <species> + Achromobacter spp. + <mlst> + <database> + <url>http://pubmlst.org/achromobacter</url> + <retrieved>2015-08-11</retrieved> + <profiles> + <count>272</count> + <url>http://pubmlst.org/data/profiles/achromobacter.txt</url> + </profiles> + <loci> + <locus> + nusA + <url> + http://pubmlst.org/data/alleles/achromobacter/nusA.tfa + </url> + </locus> + <locus> + rpoB + <url> + http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa + </url> + </locus> + """ + + args = parse_args() + docFile = url.urlopen(args.repository_url) # url address #args.repository_url =http://pubmlst.org/data/dbases.xml + + doc = xml.parse(docFile) + root = doc.childNodes[0] + found_species = [] + + if args.species == "Escherichia coli": + args.species = "Escherichia coli#1" + elif args.species == "Acinetobacter baumannii": + args.species = "Acinetobacter baumannii#1" + elif args.species == "Pasteurella multocida": + args.species = "Pasteurella multocida#1" + else: + pass + + for species_node in root.getElementsByTagName('species'): + info = getSpeciesInfo(species_node, args.species) + if info != None: + found_species.append(info) + + if len(found_species) == 0: + sys.stderr.write("No species matched your query.\n") + exit(1) + + if len(found_species) > 1: + sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species))) + for info in found_species: + sys.stderr.write(info.name + '\n') + exit(2) + + # output information for the single matching species 
+ assert len(found_species) == 1 + species_info = found_species[0] + species_name_underscores = species_info.name.replace(' ', '_') + timestamp = time.strftime("%Y%m%d%H%M%S") + + params = loads(open(args.outfile).read()) + folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp) + + if not os.path.isdir(folder): + os.makedirs(folder) + + profile_doc = url.urlopen(species_info.profiles_url) + with open(os.path.join(folder, 'profiles.txt'), 'w') as f: + sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt'))) + for line in profile_doc.readlines(): + cols = line.split("\t") + f.write("%s\n" % ('\t'.join(cols[0:8]))) + profile_doc.close() + + for locus in species_info.loci: + locus_path = urlparse(locus.url).path + locus_filename = locus_path.split('/')[-1] + locus_filename = locus_filename.replace("_.tfa", ".fas") + locus_filename = locus_filename.replace("tfa", "fas") + locus_doc = url.urlopen(locus.url) + with open(os.path.join(folder, locus_filename), 'w') as locus_file: + locus_fasta_content = locus_doc.read() + locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-") + sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename))) + locus_file.write(locus_fasta_content) + locus_doc.close() + + get_reference(folder, args.reference) + + + # do Galaxy stuff + data_manager_dict = {} + data_manager_dict['data_tables'] = {} + name = "%s-%s" % (species_info.name, timestamp) + data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores, + dbkey=species_name_underscores, + name=name, + time_stamp=timestamp, + file_path=folder)] + #save info to json file + with open(args.outfile, 'wb') as fjson: + fjson.write(dumps(data_manager_dict)) + +# end of main -------------------------------------------------------------------------------------- + +def get_reference(folder, acc): + + # We're getting this file from Japan! 
+ # It seems to work pretty well until they take down or change their website + # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643 + refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc) + remote_ref = url.urlopen(refurl) + ref_filename = os.path.join(folder, 'reference.seq') + with open(ref_filename, 'wb') as fRef: + fRef.write(remote_ref.read()) + remote_ref.close() + + cmd = "makeblastdb -in %s -dbtype nucl -out %s" \ + % (ref_filename, ref_filename.replace("reference.seq", "reference")) + p = subprocess.Popen(cmd, + shell=True, + stdin=None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, close_fds=True) + p.wait() + + return + +# -------------------------------------------------------------------------------------------------- + +# test if a node is an Element and that it has a specific tag name +def testElementTag(node, name): + return node.nodeType == node.ELEMENT_NODE and node.localName == name + +# -------------------------------------------------------------------------------------------------- + +# Get the text from an element node with a text node child +def getText(element): + result = '' + for node in element.childNodes: + if node.nodeType == node.TEXT_NODE: + result += node.data + return normaliseText(result) + +# -------------------------------------------------------------------------------------------------- + +# remove unwanted whitespace including linebreaks etc. 
+def normaliseText(str): + return ' '.join(str.split()) + +# -------------------------------------------------------------------------------------------------- + +# A collection of interesting information about a taxa +class SpeciesInfo(object): + def __init__(self): + self.name = None # String name of species + self.database_url = None # URL as string + self.retrieved = None # date as string + self.profiles_url = None # URL as string + self.profiles_count = None # positive integer + self.loci = [] # list of loci + + def __str__(self): + s = "Name: %s\n" % self.name + s += "Database URL: %s\n" % self.database_url + s += "Retrieved: %s\n" % self.retrieved + s += "Profiles URL: %s\n" % self.profiles_url + s += "Profiles count: %s\n" % self.profiles_count + s += "Loci: %s\n" % (','.join([str(x) for x in self.loci])) + return s + +# -------------------------------------------------------------------------------------------------- + +class LocusInfo(object): + def __init__(self): + self.url = None + self.name = None + def __str__(self): + return "Locus: name:%s,url:%s" % (self.name, self.url) + +# -------------------------------------------------------------------------------------------------- + +# retrieve the interesting information for a given sample element +def getSpeciesInfo(species_node, species): + this_name = getText(species_node) + print this_name + if this_name.startswith(species): + info = SpeciesInfo() + info.name = this_name + for mlst_node in species_node.getElementsByTagName('mlst'): + for database_node in mlst_node.getElementsByTagName('database'): + for database_child_node in database_node.childNodes: + if testElementTag(database_child_node, 'url'): + info.database_url = getText(database_child_node) + elif testElementTag(database_child_node, 'retrieved'): + info.retrieved = getText(database_child_node) + elif testElementTag(database_child_node, 'profiles'): + for profile_count in database_child_node.getElementsByTagName('count'): + info.profiles_count 
= getText(profile_count) + for profile_url in database_child_node.getElementsByTagName('url'): + info.profiles_url = getText(profile_url) + elif testElementTag(database_child_node, 'loci'): + for locus_node in database_child_node.getElementsByTagName('locus'): + locus_info = LocusInfo() + locus_info.name = getText(locus_node) + for locus_url in locus_node.getElementsByTagName('url'): + locus_info.url = getText(locus_url) + info.loci.append(locus_info) + + return info + else: + return None + +# -------------------------------------------------------------------------------------------------- + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/fetch_mlst_data.xml Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,157 @@ +<tool id="fetch_mlst_data" name="Fetch MLST Data" version="0.0.1" tool_type="manage_data"> + <description>fetching</description> + <requirements> + <requirement type="package" version="2.7.10">python</requirement> + </requirements> + <command interpreter="python">fetch_mlst_data.py --outfile "${out_file}" --species "$species" --reference $reference</command> + <inputs> + <param name="species" type="select" label="Select the organism"> + <option value="Achromobacter spp.">Achromobacter spp.</option> + <option value="Acinetobacter baumannii">Acinetobacter baumannii</option> + <option value="Aeromonas spp.">Aeromonas spp.</option> + <option value="Anaplasma phagocytophilum">Anaplasma phagocytophilum</option> + <option value="Arcobacter spp.">Arcobacter spp.</option> + <option value="Aspergillus fumigatus">Aspergillus fumigatus</option> + <option value="Bacillus cereus">Bacillus cereus</option> + <option value="Bacillus licheniformis">Bacillus licheniformis</option> + <option value="Bacillus subtilis">Bacillus subtilis</option> + <option value="Bartonella henselae">Bartonella henselae</option> + <option value="Bordetella spp.">Bordetella spp.</option> + <option value="Borrelia spp.">Borrelia spp.</option> + <option value="Brachyspira hampsonii">Brachyspira hampsonii</option> + <option value="Brachyspira hyodysenteriae">Brachyspira hyodysenteriae</option> + <option value="Brachyspira intermedia">Brachyspira intermedia</option> + <option value="Brachyspira pilosicoli">Brachyspira pilosicoli</option> + <option value="Brachyspira spp.">Brachyspira spp.</option> + <option value="Burkholderia cepacia complex">Burkholderia cepacia complex</option> + <option value="Burkholderia pseudomallei">Burkholderia pseudomallei</option> + <option value="Campylobacter concisus/curvus">Campylobacter concisus/curvus</option> + <option value="Campylobacter 
fetus">Campylobacter fetus</option> + <option value="Campylobacter helveticus">Campylobacter helveticus</option> + <option value="Campylobacter hyointestinalis">Campylobacter hyointestinalis</option> + <option value="Campylobacter insulaenigrae">Campylobacter insulaenigrae</option> + <option value="Campylobacter jejuni">Campylobacter jejuni</option> + <option value="Campylobacter lanienae">Campylobacter lanienae</option> + <option value="Campylobacter lari">Campylobacter lari</option> + <option value="Campylobacter sputorum">Campylobacter sputorum</option> + <option value="Campylobacter upsaliensis">Campylobacter upsaliensis</option> + <option value="Candida albicans">Candida albicans</option> + <option value="Candida glabrata">Candida glabrata</option> + <option value="Candida krusei">Candida krusei</option> + <option value="Candida tropicalis">Candida tropicalis</option> + <option value="Carnobacterium maltaromaticum">Carnobacterium maltaromaticum</option> + <option value="Chlamydiales spp.">Chlamydiales spp.</option> + <option value="Citrobacter freundii">Citrobacter freundii</option> + <option value="Clonorchis sinensis">Clonorchis sinensis</option> + <option value="Clostridium botulinum">Clostridium botulinum</option> + <option value="Clostridium difficile">Clostridium difficile</option> + <option value="Clostridium septicum">Clostridium septicum</option> + <option value="Corynebacterium diphtheriae">Corynebacterium diphtheriae</option> + <option value="Cronobacter spp.">Cronobacter spp.</option> + <option value="Enterobacter cloacae">Enterobacter cloacae</option> + <option value="Enterococcus faecalis">Enterococcus faecalis</option> + <option value="Enterococcus faecium">Enterococcus faecium</option> + <option value="Escherichia coli">Escherichia coli</option> + <option value="Flavobacterium psychrophilum">Flavobacterium psychrophilum</option> + <option value="Haemophilus influenzae">Haemophilus influenzae</option> + <option value="Haemophilus 
parasuis">Haemophilus parasuis</option> + <option value="Helicobacter cinaedi">Helicobacter cinaedi</option> + <option value="Helicobacter pylori">Helicobacter pylori</option> + <option value="Helicobacter suis"> Helicobacter suis</option> + <option value="Kingella kingae">Kingella kingae</option> + <option value="Klebsiella oxytoca">Klebsiella oxytoca</option> + <option value="Klebsiella pneumoniae">Klebsiella pneumoniae</option> + <option value="Kudoa septempunctata">Kudoa septempunctata</option> + <option value="Lactobacillus salivarius">Lactobacillus salivarius</option> + <option value="Leptospira spp.">Leptospira spp.</option> + <option value="Listeria monocytogenes">Listeria monocytogenes</option> + <option value="Mannheimia haemolytica">Mannheimia haemolytica</option> + <option value="Melissococcus plutonius">Melissococcus plutonius</option> + <option value="Moraxella catarrhalis">Moraxella catarrhalis</option> + <option value="Mycobacterium abscessus">Mycobacterium abscessus</option> + <option value="Mycobacterium massiliense">Mycobacterium massiliense</option> + <option value="Mycoplasma agalactiae">Mycoplasma agalactiae</option> + <option value="Mycoplasma bovis">Mycoplasma bovis</option> + <option value="Mycoplasma hyorhinis">Mycoplasma hyorhinis</option> + <option value="Neisseria spp.">Neisseria spp.</option> + <option value="Orientia tsutsugamushi">Orientia tsutsugamushi</option> + <option value="Ornithobacterium rhinotracheale">Ornithobacterium rhinotracheale</option> + <option value="Paenibacillus larvae">Paenibacillus larvae</option> + <option value="Pasteurella multocida">Pasteurella multocida</option> + <option value="Pediococcus pentosaceus">Pediococcus pentosaceus</option> + <option value="Porphyromonas gingivalis">Porphyromonas gingivalis</option> + <option value="Propionibacterium acnes">Propionibacterium acnes</option> + <option value="Pseudomonas aeruginosa">Pseudomonas aeruginosa</option> + <option value="Pseudomonas 
fluorescens">Pseudomonas fluorescens</option> + <option value="Riemerella anatipestifer">Riemerella anatipestifer</option> + <option value="Salmonella enterica">Salmonella enterica</option> + <option value="Sinorhizobium spp.">Sinorhizobium spp.</option> + <option value="Staphylococcus aureus">Staphylococcus aureus</option> + <option value="Staphylococcus epidermidis">Staphylococcus epidermidis</option> + <option value="Staphylococcus haemolyticus">Staphylococcus haemolyticus</option> + <option value="Staphylococcus lugdunensis">Staphylococcus lugdunensis</option> + <option value="Staphylococcus pseudintermedius">Staphylococcus pseudintermedius</option> + <option value="Stapylococcus hominis">Stapylococcus hominis</option> + <option value="Stenotrophomonas maltophilia">Stenotrophomonas maltophilia</option> + <option value="Streptococcus agalactiae">Streptococcus agalactiae</option> + <option value="Streptococcus canis">Streptococcus canis</option> + <option value="Streptococcus dysgalactiae equisimilis">Streptococcus dysgalactiae equisimilis</option> + <option value="Streptococcus gallolyticus">Streptococcus gallolyticus</option> + <option value="Streptococcus oralis">Streptococcus oralis</option> + <option value="Streptococcus pneumoniae">Streptococcus pneumoniae</option> + <option value="Streptococcus pyogenes">Streptococcus pyogenes</option> + <option value="Streptococcus suis">Streptococcus suis</option> + <option value="Streptococcus thermophilus">Streptococcus thermophilus</option> + <option value="Streptococcus uberis">Streptococcus uberis</option> + <option value="Streptococcus zooepidemicus">Streptococcus zooepidemicus</option> + <option value="Streptomyces spp">Streptomyces spp</option> + <option value="Taylorella spp.">Taylorella spp.</option> + <option value="Tenacibaculum spp.">Tenacibaculum spp.</option> + <option value="Trichomonas vaginalis">Trichomonas vaginalis</option> + <option value="Vibrio cholerae">Vibrio cholerae</option> + <option 
value="Vibrio parahaemolyticus">Vibrio parahaemolyticus</option> + <option value="Vibrio spp.">Vibrio spp.</option> + <option value="Vibrio tapetis"> Vibrio tapetis</option> + <option value="Vibrio vulnificus">Vibrio vulnificus</option> + <option value="Wolbachia">Wolbachia</option> + <option value="Xylella fastidiosa">Xylella fastidiosa</option> + <option value="Yersinia pseudotuberculosis">Yersinia pseudotuberculosis</option> + <option value="Yersinia ruckeri">Yersinia ruckeri</option> + <option value="Yersinia spp.">Yersinia spp.</option> + </param> + + <param name="reference" type="text" value="NC_xxxxxx" label="Please provide an NCBI accession number for the reference genome to use." help="A reference genome is used for extracting the flanking regions of the typing genes. Please see help below." /> + + </inputs> + <outputs> + <data name="out_file" format="data_manager_json"/> + </outputs> + + <help> +**What it does** + +Fetches MLST loci and profiles for a given organism from pubmlst.org and populates the "mlst_data" data table. + +A reference must be provided each time. Use an NCBI accession number (e.g. NC_003210) so it can be automatically downloaded. + +Here are some valid accession numbers for the species we use most frequently: + + +======================== ===================== +Organism recommended accession +======================== ===================== +Campylobacter jejuni AL111168 +Escherichia coli CP002797 +Listeria monocytogenes NC_003210 +Salmonella enterica NC_003197 +Staphylococcus aureus NC_002952 +Streptococcus pneumoniae NC_017769 +Streptococcus pyogenes AE014074 +======================== ===================== + +------ + +If your organism of choice is not on this list you need to look up the correct accession number for a valid reference genome on the NCBI website (http://www.ncbi.nlm.nih.gov/nuccore/). + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/fetch_mlst_data.xml" id="fetch_mlst_data" version="1.0.0"> + <data_table name="mlst_data"> + <output> + <column name="value" /> + <column name="dbkey" /> + <column name="name" /> + <column name="time_stamp" /> + <column name="file_path" output_ref="out_file" > + <move type="directory"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mlst_data</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mlst_data/${dbkey}/${time_stamp}</value_translation> <!-- Store this value in the final Data Table --> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mlst_data.loc.sample Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,7 @@ +#This file contains entries that point to available MLST data +#It has 5 columns separated by a tab character +#value species name +#dbkey species name +#name Species name + time stamp to be shown in table, e.g. Klebsiella pneumoniae-20160707142859 +#time_stamp when the data was downloaded +#file_path absolute path to where allele fastas, profiles.txt and reference are stored
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> +<tables> + <table name="mlst_data" comment_char="#"> + <columns>value, dbkey, name, time_stamp, file_path</columns> + <file path="tool-data/mlst_data.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Jul 13 05:50:48 2016 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="python" version="2.7.10"> + <repository changeset_revision="0339c4a9b87b" name="package_python_2_7_10" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>