Mercurial > repos > ulfschaefer > data_manager_phemost
comparison data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip
Uploaded
author | ulfschaefer |
---|---|
date | Wed, 13 Jul 2016 05:50:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:25d4d9f313a0 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ''' | |
4 Download MLST datasets from this site: http://pubmlst.org/data/ by | |
5 parsing an xml file (http://pubmlst.org/data/dbases.xml). | |
6 | |
7 Data is downloaded for a species determined by the user: | |
8 - profiles (maps STs to allele numbers) | |
9 - numbered sequences for each locus in the scheme | |
10 | |
11 In addition, the alleles are concatenated together for use with SRST2. | |
12 | |
13 A log file is also generated in the working directory, detailing the | |
14 time, date and location of all files downloaded, as well as the <retrieved> | |
15 tag which tells us when the XML entry was last updated. | |
16 | |
17 If the species name input by the user matches multiple <species> in the | |
18 xml file, the script simply reports the possible matches so the user can | |
19 try again. | |
20 ''' | |
21 | |
22 """ | |
23 - Remove empty line at the end of profiles.txt file. | |
24 - Ensure the allele names at the profiles.txt file don't contain "_". | |
25 | |
26 """ | |
27 from argparse import ArgumentParser | |
28 import xml.dom.minidom as xml | |
29 import urllib2 as url | |
30 import re | |
31 import os | |
32 import sys | |
33 import glob | |
34 import csv | |
35 import shutil | |
36 from urlparse import urlparse | |
37 import time | |
38 import subprocess | |
39 from json import dumps | |
40 from json import loads | |
41 | |
42 # -------------------------------------------------------------------------------------------------- | |
43 | |
def parse_args():
    """Parse and return the command line arguments.

    Returns:
        argparse.Namespace with attributes ``repository_url``, ``species``,
        ``outfile`` and ``reference``.
    """
    # NOTE: the trailing space before the closing quote is required because
    # the two adjacent literals are concatenated implicitly.
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()
69 | |
70 # -------------------------------------------------------------------------------------------------- | |
71 | |
def main():
    """Download the MLST dataset for one species and register it with Galaxy.

    Steps: resolve the species against the pubmlst XML index, download the
    ST profiles table and the per-locus allele FASTA files into a
    timestamped folder, fetch a reference genome, and write the Galaxy
    data-manager JSON back to ``--outfile``.

    Example of the XML structure parsed from the repository index:

    <species>
    Achromobacter spp.
        <mlst>
            <database>
                <url>http://pubmlst.org/achromobacter</url>
                <retrieved>2015-08-11</retrieved>
                <profiles>
                    <count>272</count>
                    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
                </profiles>
                <loci>
                    <locus>
                    nusA
                        <url>
                        http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
                        </url>
                    </locus>
                    ...
    """

    args = parse_args()
    species = _canonical_species_name(args.species)

    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)
    doc = xml.parse(docFile)
    root = doc.childNodes[0]

    found_species = []
    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, species)
        if info is not None:
            found_species.append(info)

    if not found_species:
        sys.stderr.write("No species matched your query.\n")
        sys.exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        sys.exit(2)

    # exactly one matching species from here on
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    # Galaxy hands us its parameters through the JSON file we later overwrite.
    params = loads(open(args.outfile).read())
    folder = os.path.join(params['output_data'][0]['extra_files_path'],
                          species_name_underscores,
                          timestamp)
    if not os.path.isdir(folder):
        os.makedirs(folder)

    _download_profiles(species_info.profiles_url, folder)
    _download_loci(species_info.loci, folder)
    get_reference(folder, args.reference)
    _write_galaxy_json(args.outfile, species_info.name,
                       species_name_underscores, timestamp, folder)


def _canonical_species_name(species):
    """pubmlst hosts multiple schemes for some species; default to scheme #1."""
    if species in ("Escherichia coli",
                   "Acinetobacter baumannii",
                   "Pasteurella multocida"):
        return species + "#1"
    return species


def _download_profiles(profiles_url, folder):
    """Write profiles.txt (maps STs to allele numbers) into *folder*.

    Keeps at most the first 8 columns per row. Blank lines are skipped and
    trailing newlines stripped before splitting, so the file ends cleanly
    (fixes the "empty line at the end of profiles.txt" issue noted in the
    file header: the original kept the newline inside the last column of
    short rows, producing doubled/blank lines).
    """
    profile_path = os.path.join(folder, 'profiles.txt')
    profile_doc = url.urlopen(profiles_url)
    with open(profile_path, 'w') as f:
        sys.stdout.write("Writing to %s\n" % (profile_path))
        for line in profile_doc.readlines():
            line = line.rstrip("\r\n")
            if not line:
                continue
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()


def _download_loci(loci, folder):
    """Download each locus allele FASTA into *folder* as <locus>.fas.

    Underscores in the FASTA content are replaced with dashes (SRST2
    requires allele names without "_", per the file header note).
    """
    for locus in loci:
        locus_filename = urlparse(locus.url).path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()


def _write_galaxy_json(outfile, species_name, species_name_underscores,
                       timestamp, folder):
    """Overwrite *outfile* with the Galaxy data-table registration JSON."""
    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    name = "%s-%s" % (species_name, timestamp)
    data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                          dbkey=species_name_underscores,
                                                          name=name,
                                                          time_stamp=timestamp,
                                                          file_path=folder)]
    with open(outfile, 'wb') as fjson:
        fjson.write(dumps(data_manager_dict))

# end of main --------------------------------------------------------------------------------------
181 | |
def get_reference(folder, acc):
    """Download the reference genome FASTA for accession *acc* and build a
    BLAST nucleotide database from it.

    Writes ``reference.seq`` into *folder* and runs ``makeblastdb`` to
    produce the ``reference`` database files alongside it.
    """
    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    try:
        with open(ref_filename, 'wb') as fRef:
            fRef.write(remote_ref.read())
    finally:
        # close the URL handle even if the local write fails
        remote_ref.close()

    # List-form argv with shell=False: folder/accession can contain no
    # shell metacharacters this way (the original interpolated them into a
    # shell=True command string).
    cmd = ["makeblastdb",
           "-in", ref_filename,
           "-dbtype", "nucl",
           "-out", os.path.join(folder, "reference")]
    p = subprocess.Popen(cmd,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() drains both pipes; wait() with PIPEs can deadlock when
    # the child fills a pipe buffer.
    p.communicate()

    return
204 | |
205 # -------------------------------------------------------------------------------------------------- | |
206 | |
207 # test if a node is an Element and that it has a specific tag name | |
208 def testElementTag(node, name): | |
209 return node.nodeType == node.ELEMENT_NODE and node.localName == name | |
210 | |
211 # -------------------------------------------------------------------------------------------------- | |
212 | |
213 # Get the text from an element node with a text node child | |
def getText(element):
    """Concatenate the direct text-node children of *element* and return
    the whitespace-normalised result."""
    pieces = [child.data
              for child in element.childNodes
              if child.nodeType == child.TEXT_NODE]
    return normaliseText(''.join(pieces))
220 | |
221 # -------------------------------------------------------------------------------------------------- | |
222 | |
223 # remove unwanted whitespace including linebreaks etc. | |
def normaliseText(text):
    """Collapse every run of whitespace (spaces, tabs, linebreaks) in *text*
    to a single space and strip leading/trailing whitespace.

    The parameter was renamed from ``str`` to avoid shadowing the builtin;
    every caller in this file passes it positionally, so the change is safe.
    """
    return ' '.join(text.split())
226 | |
227 # -------------------------------------------------------------------------------------------------- | |
228 | |
229 # A collection of interesting information about a taxa | |
class SpeciesInfo(object):
    """Collects the interesting MLST metadata for one species entry of the
    pubmlst XML index: database/profile URLs, retrieval date, profile count
    and the list of loci in the scheme."""

    def __init__(self):
        self.name = None            # species name as a string
        self.database_url = None    # database URL as a string
        self.retrieved = None       # retrieval date as a string
        self.profiles_url = None    # profiles table URL as a string
        self.profiles_count = None  # positive integer (as parsed text)
        self.loci = []              # list of LocusInfo objects

    def __str__(self):
        lines = ["Name: %s" % self.name,
                 "Database URL: %s" % self.database_url,
                 "Retrieved: %s" % self.retrieved,
                 "Profiles URL: %s" % self.profiles_url,
                 "Profiles count: %s" % self.profiles_count,
                 "Loci: %s" % ','.join(str(locus) for locus in self.loci)]
        return '\n'.join(lines) + '\n'
247 | |
248 # -------------------------------------------------------------------------------------------------- | |
249 | |
class LocusInfo(object):
    """Name and download URL of a single locus in an MLST scheme."""

    def __init__(self):
        self.name = None  # locus name, e.g. "nusA"
        self.url = None   # URL of the allele FASTA file

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)
256 | |
257 # -------------------------------------------------------------------------------------------------- | |
258 | |
259 # retrieve the interesting information for a given sample element | |
def getSpeciesInfo(species_node, species):
    """Extract the MLST metadata from one <species> DOM element.

    Returns a populated SpeciesInfo when the element's text name starts
    with *species*, otherwise None. (Prefix match: "Escherichia coli#1"
    is matched by the query "Escherichia coli#1" after canonicalisation.)

    Note: a leftover Python-2 debug statement that printed every species
    name in the index to stdout was removed — this script writes its
    progress to stdout and is driven by Galaxy, so the noise was harmful.
    """
    this_name = getText(species_node)
    # guard clause: not our species, nothing to extract
    if not this_name.startswith(species):
        return None

    info = SpeciesInfo()
    info.name = this_name
    for mlst_node in species_node.getElementsByTagName('mlst'):
        for database_node in mlst_node.getElementsByTagName('database'):
            for database_child_node in database_node.childNodes:
                if testElementTag(database_child_node, 'url'):
                    info.database_url = getText(database_child_node)
                elif testElementTag(database_child_node, 'retrieved'):
                    info.retrieved = getText(database_child_node)
                elif testElementTag(database_child_node, 'profiles'):
                    for profile_count in database_child_node.getElementsByTagName('count'):
                        info.profiles_count = getText(profile_count)
                    for profile_url in database_child_node.getElementsByTagName('url'):
                        info.profiles_url = getText(profile_url)
                elif testElementTag(database_child_node, 'loci'):
                    for locus_node in database_child_node.getElementsByTagName('locus'):
                        locus_info = LocusInfo()
                        locus_info.name = getText(locus_node)
                        for locus_url in locus_node.getElementsByTagName('url'):
                            locus_info.url = getText(locus_url)
                        info.loci.append(locus_info)

    return info
289 | |
290 # -------------------------------------------------------------------------------------------------- | |
291 | |
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()