comparison data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip

Uploaded
author ulfschaefer
date Wed, 13 Jul 2016 05:50:48 -0400
#!/usr/bin/env python

'''
Download MLST datasets from this site: http://pubmlst.org/data/ by
parsing an xml file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the <retrieved>
tag which tells us when the XML entry was last updated.

If the species name input by the user matches multiple <species> in the
xml file, the script simply reports the possible matches so the user can
try again.
'''
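# Example invocation (file names are illustrative; when run as a Galaxy data
# manager the --outfile JSON is supplied by Galaxy):
#   python fetch_mlst_data.py --species "Escherichia coli" \
#          --outfile mlst.json --reference <NCBI accession>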

# Notes:
# - Remove the empty line at the end of the profiles.txt file.
# - Ensure the allele names in the profiles.txt file don't contain "_".
from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import re
import os
import sys
import glob
import csv
import shutil
from urlparse import urlparse
import time
import subprocess
from json import dumps
from json import loads

# --------------------------------------------------------------------------------------------------

def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='Galaxy data manager JSON file; read for the output location and overwritten with the new data table entry.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------

def main():

    """
    <species>
        Achromobacter spp.
        <mlst>
            <database>
                <url>http://pubmlst.org/achromobacter</url>
                <retrieved>2015-08-11</retrieved>
                <profiles>
                    <count>272</count>
                    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
                </profiles>
                <loci>
                    <locus>
                        nusA
                        <url>
                            http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
                        </url>
                    </locus>
                    <locus>
                        rpoB
                        <url>
                            http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
                        </url>
                    </locus>
    """

    args = parse_args()
    docFile = url.urlopen(args.repository_url)  # default: http://pubmlst.org/data/dbases.xml

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

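    # pubmlst lists more than one MLST scheme for these species (e.g.
    # "Escherichia coli#1" and "Escherichia coli#2"); default to scheme #1
    # when the user gives the bare species name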
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"
    else:
        pass

    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        sys.exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        sys.exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

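    # args.outfile is the JSON parameter file Galaxy hands to the data manager;
    # it tells us where (extra_files_path) to put the downloaded files, and is
    # overwritten further down with the new data table entry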
    params = loads(open(args.outfile).read())
    folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            if not line.strip():
                continue  # don't copy blank lines into profiles.txt
            # keep only the ST column plus the first seven locus columns
            cols = line.rstrip("\r\n").split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            # underscores are not wanted in the allele names (see the note at
            # the top of the file), so swap them for dashes and tidy up doubles
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)


    # do Galaxy stuff
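    # the dict below becomes one row of the 'mlst_data' data table; the keys
    # (value, dbkey, name, time_stamp, file_path) are expected to match the
    # column names declared in the data manager / tool data table config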
    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    name = "%s-%s" % (species_info.name, timestamp)
    data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                          dbkey=species_name_underscores,
                                                          name=name,
                                                          time_stamp=timestamp,
                                                          file_path=folder)]
    # save info to json file
    with open(args.outfile, 'wb') as fjson:
        fjson.write(dumps(data_manager_dict))
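    # the resulting file looks roughly like this (values are illustrative):
    # {"data_tables": {"mlst_data": [{"value": "Escherichia_coli#1",
    #     "dbkey": "Escherichia_coli#1", "name": "Escherichia coli#1-20160713093000",
    #     "time_stamp": "20160713093000", "file_path": "/path/to/folder"}]}}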

# end of main --------------------------------------------------------------------------------------

def get_reference(folder, acc):

    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    with open(ref_filename, 'wb') as fRef:
        fRef.write(remote_ref.read())
    remote_ref.close()

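    # index the reference for BLAST; makeblastdb drops its index files
    # (reference.n*) next to reference.seq in the same folder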
194 cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
195 % (ref_filename, ref_filename.replace("reference.seq", "reference"))
196 p = subprocess.Popen(cmd,
197 shell=True,
198 stdin=None,
199 stdout=subprocess.PIPE,
200 stderr=subprocess.PIPE, close_fds=True)
201 p.wait()
202
203 return
204
# --------------------------------------------------------------------------------------------------

# test whether a node is an Element with a specific tag name
def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------

# get the text from an element node with a text node child
def getText(element):
    result = ''
    for node in element.childNodes:
        if node.nodeType == node.TEXT_NODE:
            result += node.data
    return normaliseText(result)

# --------------------------------------------------------------------------------------------------

# remove unwanted whitespace, including linebreaks etc.
def normaliseText(text):
    return ' '.join(text.split())

# --------------------------------------------------------------------------------------------------

# A collection of interesting information about a taxon
class SpeciesInfo(object):
    def __init__(self):
        self.name = None  # string name of species
        self.database_url = None  # URL as string
        self.retrieved = None  # date as string
        self.profiles_url = None  # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []  # list of loci

    def __str__(self):
        s = "Name: %s\n" % self.name
        s += "Database URL: %s\n" % self.database_url
        s += "Retrieved: %s\n" % self.retrieved
        s += "Profiles URL: %s\n" % self.profiles_url
        s += "Profiles count: %s\n" % self.profiles_count
        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
        return s

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    def __init__(self):
        self.url = None
        self.name = None

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------

# retrieve the interesting information for a given species element
def getSpeciesInfo(species_node, species):
    this_name = getText(species_node)
    print this_name
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()