view data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip

Uploaded
author ulfschaefer
date Wed, 13 Jul 2016 05:50:48 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

'''
Download MLST datasets from this site: http://pubmlst.org/data/ by
parsing an xml file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the <retrieved>
tag which tells us when the XML entry was last updated.

If the species name input by the user matches multiple <species> in the
xml file, the script simply reports the possible matches so the user can
try again.
'''

"""
- Remove empty line at the end of profiles.txt file.
- Ensure the allele names at the profiles.txt file don't contain "_".

"""
from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import re
import os
import sys
import glob
import csv
import shutil
from urlparse import urlparse
import time
import subprocess
from json import dumps
from json import loads

# --------------------------------------------------------------------------------------------------

def parse_args():
    """Parse and return the command-line arguments.

    Returns an argparse.Namespace with the attributes
    repository_url, species, outfile and reference.
    """
    # NOTE: a space was missing between the two description fragments
    # ("...by speciesfrom pubmlst.org."); fixed here.
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar = 'URL',
                        default = 'http://pubmlst.org/data/dbases.xml',
                        help = 'URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar = 'NAME',
                        required = True,
                        help = 'The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar = 'FILE',
                        required = True,
                        help = 'The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar = 'ACCESSION',
                        required = True,
                        help = 'NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------

def main():
    """Download the MLST profile table, per-locus allele FASTAs and a
    reference genome for one species, then write the Galaxy data-manager
    JSON description to args.outfile.

    Expected shape of one <species> entry in the pubmlst XML index:

    <species>
    Achromobacter spp.
    <mlst>
    <database>
    <url>http://pubmlst.org/achromobacter</url>
    <retrieved>2015-08-11</retrieved>
    <profiles>
    <count>272</count>
    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
    </profiles>
    <loci>
    <locus>
    nusA
    <url>
    http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
    </url>
    </locus>
    <locus>
    rpoB
    <url>
    http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
    </url>
    </locus>
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

    # pubmlst splits these species across several schemes; default to #1
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"

    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        sys.exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        sys.exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    # Galaxy passes its parameters in via the (pre-existing) outfile
    params = loads(open(args.outfile).read())
    folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    # Keep only the ST column plus the 7 allele columns, and skip blank
    # lines.  Stripping the newline before splitting also prevents the
    # doubled newlines the old code produced for rows with <= 8 columns
    # (last column still carried "\n" and f.write appended another one).
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            line = line.rstrip("\r\n")
            if not line:
                continue
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        # normalise the extension from .tfa to .fas
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            # allele names must not contain "_" (see module note above)
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)

    # do Galaxy stuff: describe the new data table entry
    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    name = "%s-%s" % (species_info.name, timestamp)
    data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                          dbkey=species_name_underscores,
                                                          name=name,
                                                          time_stamp=timestamp,
                                                          file_path=folder)]
    #save info to json file
    with open(args.outfile, 'wb') as fjson:
        fjson.write(dumps(data_manager_dict))

# end of main --------------------------------------------------------------------------------------

def get_reference(folder, acc):
    """Download the reference genome *acc* from TogoWS into
    folder/reference.seq and build a BLAST nucleotide database from it.

    folder -- existing output directory
    acc    -- NCBI nucleotide accession number
    """
    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    try:
        with open(ref_filename, 'wb') as fRef:
            fRef.write(remote_ref.read())
    finally:
        remote_ref.close()

    # Pass an argument list with shell=False so the path reaches
    # makeblastdb verbatim (no shell quoting issues), and use
    # communicate() instead of wait(): wait() with PIPEd stdout/stderr
    # can deadlock once the tool fills a pipe buffer.
    cmd = ["makeblastdb",
           "-in", ref_filename,
           "-dbtype", "nucl",
           "-out", ref_filename.replace("reference.seq", "reference")]
    p = subprocess.Popen(cmd,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, close_fds=True)
    p.communicate()

    return

# --------------------------------------------------------------------------------------------------

# test if a node is an Element and that it has a specific tag name
def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------

# Get the text from an element node with a text node child
def getText(element):
    """Concatenate the immediate text-node children of *element* and
    return the result with whitespace normalised."""
    chunks = [child.data
              for child in element.childNodes
              if child.nodeType == child.TEXT_NODE]
    return normaliseText(''.join(chunks))

# --------------------------------------------------------------------------------------------------

# remove unwanted whitespace including linebreaks etc.
def normaliseText(text):
    """Collapse every run of whitespace (spaces, tabs, linebreaks) in
    *text* to a single space and strip leading/trailing whitespace.

    The parameter was renamed from ``str`` so it no longer shadows the
    builtin; every call site in this file passes it positionally.
    """
    return ' '.join(text.split())

# --------------------------------------------------------------------------------------------------

# A collection of interesting information about a taxa
class SpeciesInfo(object):
    """Container for the pubmlst metadata of a single MLST scheme."""

    def __init__(self):
        self.name = None            # species name as shown in the XML index
        self.database_url = None    # URL of the species' pubmlst database
        self.retrieved = None       # date string: when the entry was last updated
        self.profiles_url = None    # URL of the ST -> allele-number profile table
        self.profiles_count = None  # number of profiles (positive integer)
        self.loci = []              # one entry per locus in the scheme

    def __str__(self):
        loci_csv = ','.join([str(locus) for locus in self.loci])
        parts = ["Name: %s" % self.name,
                 "Database URL: %s" % self.database_url,
                 "Retrieved: %s" % self.retrieved,
                 "Profiles URL: %s" % self.profiles_url,
                 "Profiles count: %s" % self.profiles_count,
                 "Loci: %s" % loci_csv]
        return '\n'.join(parts) + '\n'

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    """Name and download URL of a single MLST locus."""

    def __init__(self):
        self.url = None   # URL of the locus allele file
        self.name = None  # locus name, e.g. "nusA"

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------

# retrieve the interesting information for a given sample element
def getSpeciesInfo(species_node, species):
    """Build a SpeciesInfo from a <species> DOM node.

    Returns a populated SpeciesInfo when the node's text name starts
    with *species*, otherwise None.  Reads the nested
    mlst/database/{url,retrieved,profiles,loci} elements.
    """
    this_name = getText(species_node)
    # Progress output for every species examined.  Written via
    # sys.stdout.write (same output as the old `print` statement) for
    # consistency with the rest of this file and Python 3 syntax.
    sys.stdout.write(this_name + "\n")
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()