changeset 0:25d4d9f313a0 draft default tip

Uploaded
author ulfschaefer
date Wed, 13 Jul 2016 05:50:48 -0400
parents
children
files data_manager/fetch_mlst_data.py data_manager/fetch_mlst_data.xml data_manager_conf.xml tool-data/mlst_data.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 6 files changed, 490 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_mlst_data.py	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+
+'''
+Download MLST datasets from this site: http://pubmlst.org/data/ by
+parsing an xml file (http://pubmlst.org/data/dbases.xml).
+
+Data is downloaded for a species determined by the user:
+- profiles (maps STs to allele numbers)
+- numbered sequences for each locus in the scheme
+
+In addition, the alleles are concatenated together for use with SRST2.
+
+A log file is also generated in the working directory, detailing the
+time, date and location of all files downloaded, as well as the <retrieved>
+tag which tells us when the XML entry was last updated.
+
+If the species name input by the user matches multiple <species> in the
+xml file, the script simply reports the possible matches so the user can
+try again.
+'''
+
+"""
+- Remove empty line at the end of profiles.txt file.
+- Ensure the allele names at the profiles.txt file don't contain "_".
+
+"""
+from argparse import ArgumentParser
+import xml.dom.minidom as xml
+import urllib2 as url
+import re
+import os
+import sys
+import glob
+import csv
+import shutil
+from urlparse import urlparse
+import time
+import subprocess
+from json import dumps
+from json import loads
+
+# --------------------------------------------------------------------------------------------------
+
+def parse_args():
+    parser = ArgumentParser(description='Download MLST datasets by species'
+                                        'from pubmlst.org.')
+
+    parser.add_argument('--repository_url',
+                        metavar = 'URL',
+                        default = 'http://pubmlst.org/data/dbases.xml',
+                        help = 'URL for MLST repository XML index')
+
+    parser.add_argument('--species',
+                        metavar = 'NAME',
+                        required = True,
+                        help = 'The name of the species that you want to download (e.g. "Escherichia coli")')
+
+    parser.add_argument('--outfile',
+                        metavar = 'FILE',
+                        required = True,
+                        help = 'The name of the Json file to write that galaxy stuff to.')
+
+    parser.add_argument('--reference',
+                        metavar = 'ACCESSION',
+                        required = True,
+                        help = 'NCBI accession number of the reference genome to use for flanking regions.')
+
+    return parser.parse_args()
+
+# --------------------------------------------------------------------------------------------------
+
+def main():
+
+    """
+    <species>
+    Achromobacter spp.
+    <mlst>
+    <database>
+    <url>http://pubmlst.org/achromobacter</url>
+    <retrieved>2015-08-11</retrieved>
+    <profiles>
+    <count>272</count>
+    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
+    </profiles>
+    <loci>
+    <locus>
+    nusA
+    <url>
+    http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
+    </url>
+    </locus>
+    <locus>
+    rpoB
+    <url>
+    http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
+    </url>
+    </locus>
+    """
+
+    args = parse_args()
+    docFile = url.urlopen(args.repository_url) # url address  #args.repository_url =http://pubmlst.org/data/dbases.xml
+
+    doc = xml.parse(docFile)
+    root = doc.childNodes[0]
+    found_species = []
+
+    if args.species == "Escherichia coli":
+        args.species = "Escherichia coli#1"
+    elif args.species == "Acinetobacter baumannii":
+        args.species = "Acinetobacter baumannii#1"
+    elif args.species == "Pasteurella multocida":
+        args.species = "Pasteurella multocida#1"
+    else:
+        pass
+
+    for species_node in root.getElementsByTagName('species'):
+        info = getSpeciesInfo(species_node, args.species)
+        if info != None:
+            found_species.append(info)
+
+    if len(found_species) == 0:
+        sys.stderr.write("No species matched your query.\n")
+        exit(1)
+
+    if len(found_species) > 1:
+        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
+        for info in found_species:
+            sys.stderr.write(info.name + '\n')
+        exit(2)
+
+    # output information for the single matching species
+    assert len(found_species) == 1
+    species_info = found_species[0]
+    species_name_underscores = species_info.name.replace(' ', '_')
+    timestamp = time.strftime("%Y%m%d%H%M%S")
+
+    params = loads(open(args.outfile).read())
+    folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)
+
+    if not os.path.isdir(folder):
+        os.makedirs(folder)
+
+    profile_doc = url.urlopen(species_info.profiles_url)
+    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
+        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
+        for line in profile_doc.readlines():
+            cols = line.split("\t")
+            f.write("%s\n" % ('\t'.join(cols[0:8])))
+    profile_doc.close()
+
+    for locus in species_info.loci:
+        locus_path = urlparse(locus.url).path
+        locus_filename = locus_path.split('/')[-1]
+        locus_filename = locus_filename.replace("_.tfa", ".fas")
+        locus_filename = locus_filename.replace("tfa", "fas")
+        locus_doc = url.urlopen(locus.url)
+        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
+            locus_fasta_content = locus_doc.read()
+            locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-")
+            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
+            locus_file.write(locus_fasta_content)
+        locus_doc.close()
+
+    get_reference(folder, args.reference)
+
+
+    # do Galaxy stuff
+    data_manager_dict = {}
+    data_manager_dict['data_tables'] = {}
+    name = "%s-%s" % (species_info.name, timestamp)
+    data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
+                                                          dbkey=species_name_underscores,
+                                                          name=name,
+                                                          time_stamp=timestamp,
+                                                          file_path=folder)]
+    #save info to json file
+    with open(args.outfile, 'wb') as fjson:
+        fjson.write(dumps(data_manager_dict))
+
+# end of main --------------------------------------------------------------------------------------
+
+def get_reference(folder, acc):
+
+    # We're getting this file from Japan!
+    # It seems to work pretty well until they take down or change their website
+    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
+    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
+    remote_ref = url.urlopen(refurl)
+    ref_filename = os.path.join(folder, 'reference.seq')
+    with open(ref_filename, 'wb') as fRef:
+        fRef.write(remote_ref.read())
+    remote_ref.close()
+
+    cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
+          % (ref_filename, ref_filename.replace("reference.seq", "reference"))
+    p = subprocess.Popen(cmd,
+                         shell=True,
+                         stdin=None,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE, close_fds=True)
+    p.wait()
+
+    return
+
+# --------------------------------------------------------------------------------------------------
+
+# test if a node is an Element and that it has a specific tag name
+def testElementTag(node, name):
+    return node.nodeType == node.ELEMENT_NODE and node.localName == name
+
+# --------------------------------------------------------------------------------------------------
+
+# Get the text from an element node with a text node child
+def getText(element):
+    result = ''
+    for node in element.childNodes:
+        if node.nodeType == node.TEXT_NODE:
+            result += node.data
+    return normaliseText(result)
+
+# --------------------------------------------------------------------------------------------------
+
+# remove unwanted whitespace including linebreaks etc.
+def normaliseText(str):
+    return ' '.join(str.split())
+
+# --------------------------------------------------------------------------------------------------
+
+# A collection of interesting information about a taxon
+class SpeciesInfo(object):
+    def __init__(self):
+        self.name = None # String name of species
+        self.database_url = None # URL as string
+        self.retrieved = None # date as string
+        self.profiles_url = None # URL as string
+        self.profiles_count = None # positive integer
+        self.loci = [] # list of loci
+
+    def __str__(self):
+        s = "Name: %s\n" % self.name
+        s += "Database URL: %s\n" % self.database_url
+        s += "Retrieved: %s\n" % self.retrieved
+        s += "Profiles URL: %s\n" % self.profiles_url
+        s += "Profiles count: %s\n" % self.profiles_count
+        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
+        return s
+
+# --------------------------------------------------------------------------------------------------
+
+class LocusInfo(object):
+    def __init__(self):
+        self.url = None
+        self.name = None
+    def __str__(self):
+        return "Locus: name:%s,url:%s" % (self.name, self.url)
+
+# --------------------------------------------------------------------------------------------------
+
+# retrieve the interesting information for a given <species> element
+def getSpeciesInfo(species_node, species):
+    this_name = getText(species_node)
+    print this_name
+    if this_name.startswith(species):
+        info = SpeciesInfo()
+        info.name = this_name
+        for mlst_node in species_node.getElementsByTagName('mlst'):
+            for database_node in mlst_node.getElementsByTagName('database'):
+                for database_child_node in database_node.childNodes:
+                    if testElementTag(database_child_node, 'url'):
+                        info.database_url = getText(database_child_node)
+                    elif testElementTag(database_child_node, 'retrieved'):
+                        info.retrieved = getText(database_child_node)
+                    elif testElementTag(database_child_node, 'profiles'):
+                        for profile_count in database_child_node.getElementsByTagName('count'):
+                            info.profiles_count = getText(profile_count)
+                        for profile_url in database_child_node.getElementsByTagName('url'):
+                            info.profiles_url = getText(profile_url)
+                    elif testElementTag(database_child_node, 'loci'):
+                        for locus_node in database_child_node.getElementsByTagName('locus'):
+                            locus_info = LocusInfo()
+                            locus_info.name = getText(locus_node)
+                            for locus_url in locus_node.getElementsByTagName('url'):
+                                locus_info.url = getText(locus_url)
+                            info.loci.append(locus_info)
+
+        return info
+    else:
+        return None
+
+# --------------------------------------------------------------------------------------------------
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_mlst_data.xml	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,157 @@
+<tool id="fetch_mlst_data" name="Fetch MLST Data" version="0.0.1" tool_type="manage_data">
+    <description>fetching</description>
+    <requirements>
+         <requirement type="package" version="2.7.10">python</requirement>
+    </requirements>
+    <command interpreter="python">fetch_mlst_data.py --outfile "${out_file}" --species "$species" --reference $reference</command>
+    <inputs>
+      <param name="species" type="select" label="Select the organism">
+        <option value="Achromobacter spp.">Achromobacter spp.</option>
+        <option value="Acinetobacter baumannii">Acinetobacter baumannii</option>
+        <option value="Aeromonas spp.">Aeromonas spp.</option>
+        <option value="Anaplasma phagocytophilum">Anaplasma phagocytophilum</option>
+        <option value="Arcobacter spp.">Arcobacter spp.</option>
+        <option value="Aspergillus fumigatus">Aspergillus fumigatus</option>
+        <option value="Bacillus cereus">Bacillus cereus</option>
+        <option value="Bacillus licheniformis">Bacillus licheniformis</option>
+        <option value="Bacillus subtilis">Bacillus subtilis</option>
+        <option value="Bartonella henselae">Bartonella henselae</option>
+        <option value="Bordetella spp.">Bordetella spp.</option>
+        <option value="Borrelia spp.">Borrelia spp.</option>
+        <option value="Brachyspira hampsonii">Brachyspira hampsonii</option>
+        <option value="Brachyspira hyodysenteriae">Brachyspira hyodysenteriae</option>
+        <option value="Brachyspira intermedia">Brachyspira intermedia</option>
+        <option value="Brachyspira pilosicoli">Brachyspira pilosicoli</option>
+        <option value="Brachyspira spp.">Brachyspira spp.</option>
+        <option value="Burkholderia cepacia complex">Burkholderia cepacia complex</option>
+        <option value="Burkholderia pseudomallei">Burkholderia pseudomallei</option>
+        <option value="Campylobacter concisus/curvus">Campylobacter concisus/curvus</option>
+        <option value="Campylobacter fetus">Campylobacter fetus</option>
+        <option value="Campylobacter helveticus">Campylobacter helveticus</option>
+        <option value="Campylobacter hyointestinalis">Campylobacter hyointestinalis</option>
+        <option value="Campylobacter insulaenigrae">Campylobacter insulaenigrae</option>
+        <option value="Campylobacter jejuni">Campylobacter jejuni</option>
+        <option value="Campylobacter lanienae">Campylobacter lanienae</option>
+        <option value="Campylobacter lari">Campylobacter lari</option>
+        <option value="Campylobacter sputorum">Campylobacter sputorum</option>
+        <option value="Campylobacter upsaliensis">Campylobacter upsaliensis</option>
+        <option value="Candida albicans">Candida albicans</option>
+        <option value="Candida glabrata">Candida glabrata</option>
+        <option value="Candida krusei">Candida krusei</option>
+        <option value="Candida tropicalis">Candida tropicalis</option>
+        <option value="Carnobacterium maltaromaticum">Carnobacterium maltaromaticum</option>
+        <option value="Chlamydiales spp.">Chlamydiales spp.</option>
+        <option value="Citrobacter freundii">Citrobacter freundii</option>
+        <option value="Clonorchis sinensis">Clonorchis sinensis</option>
+        <option value="Clostridium botulinum">Clostridium botulinum</option>
+        <option value="Clostridium difficile">Clostridium difficile</option>
+        <option value="Clostridium septicum">Clostridium septicum</option>
+        <option value="Corynebacterium diphtheriae">Corynebacterium diphtheriae</option>
+        <option value="Cronobacter spp.">Cronobacter spp.</option>
+        <option value="Enterobacter cloacae">Enterobacter cloacae</option>
+        <option value="Enterococcus faecalis">Enterococcus faecalis</option>
+        <option value="Enterococcus faecium">Enterococcus faecium</option>
+        <option value="Escherichia coli">Escherichia coli</option>
+        <option value="Flavobacterium psychrophilum">Flavobacterium psychrophilum</option>
+        <option value="Haemophilus influenzae">Haemophilus influenzae</option>
+        <option value="Haemophilus parasuis">Haemophilus parasuis</option>
+        <option value="Helicobacter cinaedi">Helicobacter cinaedi</option>
+        <option value="Helicobacter pylori">Helicobacter pylori</option>
+        <option value="Helicobacter suis">Helicobacter suis</option>
+        <option value="Kingella kingae">Kingella kingae</option>
+        <option value="Klebsiella oxytoca">Klebsiella oxytoca</option>
+        <option value="Klebsiella pneumoniae">Klebsiella pneumoniae</option>
+        <option value="Kudoa septempunctata">Kudoa septempunctata</option>
+        <option value="Lactobacillus salivarius">Lactobacillus salivarius</option>
+        <option value="Leptospira spp.">Leptospira spp.</option>
+        <option value="Listeria monocytogenes">Listeria monocytogenes</option>
+        <option value="Mannheimia haemolytica">Mannheimia haemolytica</option>
+        <option value="Melissococcus plutonius">Melissococcus plutonius</option>
+        <option value="Moraxella catarrhalis">Moraxella catarrhalis</option>
+        <option value="Mycobacterium abscessus">Mycobacterium abscessus</option>
+        <option value="Mycobacterium massiliense">Mycobacterium massiliense</option>
+        <option value="Mycoplasma agalactiae">Mycoplasma agalactiae</option>
+        <option value="Mycoplasma bovis">Mycoplasma bovis</option>
+        <option value="Mycoplasma hyorhinis">Mycoplasma hyorhinis</option>
+        <option value="Neisseria spp.">Neisseria spp.</option>
+        <option value="Orientia tsutsugamushi">Orientia tsutsugamushi</option>
+        <option value="Ornithobacterium rhinotracheale">Ornithobacterium rhinotracheale</option>
+        <option value="Paenibacillus larvae">Paenibacillus larvae</option>
+        <option value="Pasteurella multocida">Pasteurella multocida</option>
+        <option value="Pediococcus pentosaceus">Pediococcus pentosaceus</option>
+        <option value="Porphyromonas gingivalis">Porphyromonas gingivalis</option>
+        <option value="Propionibacterium acnes">Propionibacterium acnes</option>
+        <option value="Pseudomonas aeruginosa">Pseudomonas aeruginosa</option>
+        <option value="Pseudomonas fluorescens">Pseudomonas fluorescens</option>
+        <option value="Riemerella anatipestifer">Riemerella anatipestifer</option>
+        <option value="Salmonella enterica">Salmonella enterica</option>
+        <option value="Sinorhizobium spp.">Sinorhizobium spp.</option>
+        <option value="Staphylococcus aureus">Staphylococcus aureus</option>
+        <option value="Staphylococcus epidermidis">Staphylococcus epidermidis</option>
+        <option value="Staphylococcus haemolyticus">Staphylococcus haemolyticus</option>
+        <option value="Staphylococcus lugdunensis">Staphylococcus lugdunensis</option>
+        <option value="Staphylococcus pseudintermedius">Staphylococcus pseudintermedius</option>
+        <option value="Stapylococcus hominis">Stapylococcus hominis</option>
+        <option value="Stenotrophomonas maltophilia">Stenotrophomonas maltophilia</option>
+        <option value="Streptococcus agalactiae">Streptococcus agalactiae</option>
+        <option value="Streptococcus canis">Streptococcus canis</option>
+        <option value="Streptococcus dysgalactiae equisimilis">Streptococcus dysgalactiae equisimilis</option>
+        <option value="Streptococcus gallolyticus">Streptococcus gallolyticus</option>
+        <option value="Streptococcus oralis">Streptococcus oralis</option>
+        <option value="Streptococcus pneumoniae">Streptococcus pneumoniae</option>
+        <option value="Streptococcus pyogenes">Streptococcus pyogenes</option>
+        <option value="Streptococcus suis">Streptococcus suis</option>
+        <option value="Streptococcus thermophilus">Streptococcus thermophilus</option>
+        <option value="Streptococcus uberis">Streptococcus uberis</option>
+        <option value="Streptococcus zooepidemicus">Streptococcus zooepidemicus</option>
+        <option value="Streptomyces spp">Streptomyces spp</option>
+        <option value="Taylorella spp.">Taylorella spp.</option>
+        <option value="Tenacibaculum spp.">Tenacibaculum spp.</option>
+        <option value="Trichomonas vaginalis">Trichomonas vaginalis</option>
+        <option value="Vibrio cholerae">Vibrio cholerae</option>
+        <option value="Vibrio parahaemolyticus">Vibrio parahaemolyticus</option>
+        <option value="Vibrio spp.">Vibrio spp.</option>
+        <option value="Vibrio tapetis">Vibrio tapetis</option>
+        <option value="Vibrio vulnificus">Vibrio vulnificus</option>
+        <option value="Wolbachia">Wolbachia</option>
+        <option value="Xylella fastidiosa">Xylella fastidiosa</option>
+        <option value="Yersinia pseudotuberculosis">Yersinia pseudotuberculosis</option>
+        <option value="Yersinia ruckeri">Yersinia ruckeri</option>
+        <option value="Yersinia spp.">Yersinia spp.</option>
+	  </param>
+
+    <param name="reference" type="text" value="NC_xxxxxx" label="Please provide an NCBI accession number for the reference genome to use." help="A reference genome is used for extracting the flanking regions of the typing genes. Please see help below." />
+
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+
+    <help>
+**What it does**
+
+Fetches MLST loci and profiles for a given organism from pubmlst.org and populates the "mlst_data" data table.
+
+A reference must be provided each time. Use an NCBI accession number (e.g. NC_003210) so it can be automatically downloaded.
+
+Here are some valid accession numbers for the species we use most frequently:
+
+
+========================  =====================
+Organism                  recommended accession
+========================  =====================
+Campylobacter jejuni      AL111168
+Escherichia coli          CP002797
+Listeria monocytogenes    NC_003210
+Salmonella enterica       NC_003197
+Staphylococcus aureus     NC_002952
+Streptococcus pneumoniae  NC_017769
+Streptococcus pyogenes    AE014074
+========================  =====================
+
+------
+
+If your organism of choice is not on this list you need to look up the correct accession number for a valid reference genome on the NCBI website (http://www.ncbi.nlm.nih.gov/nuccore/).
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/fetch_mlst_data.xml" id="fetch_mlst_data" version="1.0.0">
+        <data_table name="mlst_data">
+            <output>
+                <column name="value" />
+                <column name="dbkey" />
+                <column name="name" />
+                <column name="time_stamp" />
+                <column name="file_path" output_ref="out_file" >
+                    <move type="directory">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mlst_data</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mlst_data/${dbkey}/${time_stamp}</value_translation> <!-- Store this value in the final Data Table -->
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/mlst_data.loc.sample	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,7 @@
+#This file contains entries that point to available MLST data
+#It has 5 columns separated by a tab character
+#value	species name
+#dbkey	species name
+#name	Species name + time stamp to be shown in table, eg. Klebsiella pneumoniae-20160707142859
+#time_stamp	when the data was downloaded
+#file_path	absolute path to where the allele fastas, profiles.txt and reference are stored
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+<tables>
+    <table name="mlst_data" comment_char="#">
+        <columns>value, dbkey, name, time_stamp, file_path</columns>
+        <file path="tool-data/mlst_data.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Jul 13 05:50:48 2016 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+	<package name="python" version="2.7.10">
+        <repository changeset_revision="0339c4a9b87b" name="package_python_2_7_10" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>