Mercurial > repos > iuc > data_manager_qiime_database_downloader
comparison data_manager/data_manager_qiime_download.py @ 0:f8608fddfb23 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_qiime_database_downloader commit 4934eb34300b5fa54d62d8b67e5b6e989e963ac9
author | iuc |
---|---|
date | Mon, 15 May 2017 11:08:43 -0400 |
parents | |
children | 9e86c09a6cae |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f8608fddfb23 |
---|---|
1 #!/usr/bin/env python | |
2 # Data manager for reference data for the QIIME Galaxy tools | |
3 | |
import argparse
import ftplib
import json
import os
import shutil
import tarfile
import zipfile

import requests
12 | |
13 | |
# Transfer protocol used to fetch each supported reference database.
protocol = {
    "unite": "http",
    "greengenes": "ftp",
    "silva": "http",
    "img": "ftp",
}
# HTTP base URL (version + extension appended) or FTP host name.
baseUrl = {
    "unite": "http://unite.ut.ee/sh_files/sh_qiime_release_",
    "greengenes": "greengenes.microbio.me",
    "silva": "http://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_",
    "img": "ftp.microbio.me",
}
# Remote directory prefix on the FTP servers (version is appended).
ftp_dir = {
    "greengenes": "/greengenes_release/gg_",
    "img": "",
}
# FTP file names are assembled as <prefix><version><suffix>.<extension>.
ftp_file_prefix = {
    "greengenes": "gg_",
    "img": "",
}
ftp_file_suffix = {
    "greengenes": "_otus",
    "img": "",
}
# Archive extension per database; silva's varies with the release.
extension = {
    "unite": "zip",
    "greengenes": "tar.gz",
    "silva": {
        "104_release": "tgz",
        "108_release": "tgz",
        "108_release_curated": "tgz",
        "111_release": "tgz",
        "119_consensus_majority_taxonomy": "zip",
        "119_release": "zip",
        "119_release_aligned_rep_files": "tar.gz",
        "123_release": "zip",
        "128_release": "tgz",
    },
    "img": "tgz",
}
# QIIME file types: each maps to a target subdirectory and a data table.
filetypes = ["rep_set", "rep_set_aligned", "taxonomy", "trees"]
54 | |
55 | |
56 # Utility functions for interacting with Galaxy JSON | |
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool.

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.

    """
    # Use a context manager so the file handle is closed promptly
    # (the original left it dangling until garbage collection).
    with open(jsonfile) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
75 | |
76 | |
77 # Utility functions for creating data table dictionaries | |
78 # | |
79 # Example usage: | |
80 # >>> d = create_data_tables_dict() | |
81 # >>> add_data_table(d,'my_data') | |
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
84 # >>> print str(json.dumps(d)) | |
def create_data_tables_dict():
    """Return a dictionary for storing data table information.

    The result can be used with 'add_data_table' and
    'add_data_table_entry' to collect data table entries, and is
    converted to JSON before being sent back to the data manager.

    """
    return {'data_tables': {}}
97 | |
98 | |
def add_data_table(d, table):
    """Register the data table named *table* in *d*.

    Installs an empty entry list as a placeholder; any existing
    entries under the same name are discarded.

    """
    d['data_tables'][table] = []
106 | |
107 | |
def add_data_table_entry(d, table, entry):
    """Append *entry* to the data table *table* in *d*.

    *entry* should be a dictionary keyed by the data table's
    column names.

    Raises an Exception if the named data table doesn't exist.

    """
    try:
        entries = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    entries.append(entry)
123 | |
124 | |
def get_ftp_file(ftp, filename):
    """Download *filename* from the connected FTP server.

    The file is written under the same name in the current working
    directory. Failures are reported but deliberately not raised
    (best-effort, matching the original behaviour) — however the
    error is now printed instead of a bare "Error", and the output
    file handle is always closed.

    Arguments:
        ftp: a logged-in ftplib.FTP instance
        filename: remote (and local) file name

    """
    try:
        with open(filename, 'wb') as out:
            ftp.retrbinary("RETR " + filename, out.write)
    except (OSError, EOFError, ftplib.Error) as err:
        # ftplib.all_errors minus the bare except: keyboard interrupts
        # and programming errors now propagate instead of being hidden.
        print("Error: %s" % err)
132 | |
133 | |
def download_archive(db, version, ext):
    """Download the archive for *db*/*version* into the working directory.

    Uses HTTP (streamed via requests) or anonymous FTP depending on the
    database's configured protocol.

    Returns the local file name of the downloaded archive.

    """
    filepath = "%s_%s.%s" % (db, version, ext)
    if protocol[db] == "http":
        url = "%s%s.%s" % (baseUrl[db], version, ext)
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filepath, "wb") as out:
            for chunk in response.iter_content(chunk_size=128):
                out.write(chunk)
    elif protocol[db] == "ftp":
        ftp = ftplib.FTP(baseUrl[db])
        ftp.login("anonymous", "ftplib-example-1")
        # greengenes 13_8 is published inside the 13_5 release directory,
        # but its file names still carry the 13_8 version string.
        if db == "greengenes" and version == "13_8":
            remote_dir = "%s%s" % (ftp_dir[db], "13_5")
        else:
            remote_dir = "%s%s" % (ftp_dir[db], version)
        ftp.cwd(remote_dir)
        filepath = "%s%s%s.%s" % (
            ftp_file_prefix[db],
            version,
            ftp_file_suffix[db],
            ext)
        get_ftp_file(ftp, filepath)
        ftp.quit()
    return filepath
161 | |
162 | |
def find_archive_content_path(archive_content_path):
    """Locate the real content root of an extracted archive.

    If the directory holds exactly one visible entry (ignoring names
    that start with "." or "_", e.g. macOS metadata), return the path
    of that entry; otherwise return the directory itself.

    """
    visible = [
        entry for entry in os.listdir(archive_content_path)
        if not entry.startswith((".", "_"))
    ]
    if len(visible) == 1:
        return os.path.join(archive_content_path, visible[0])
    return archive_content_path
176 | |
177 | |
def extract_archive(filepath, ext, db):
    """Extract *filepath* into ./tmp and return the content root path.

    Supported extensions: "tar.gz"/"tgz" and "zip"; any other value
    leaves "tmp" untouched and returns it as-is.

    Arguments:
        filepath: path of the downloaded archive
        ext: archive extension
        db: database name (unused; kept for interface compatibility)

    Returns the path holding the archive's content (resolved through
    find_archive_content_path to skip a single top-level wrapper dir).

    """
    archive_content_path = "tmp"
    # Context managers ensure the archive handles are closed even if
    # extraction raises (the original leaked them on error).
    if ext in ("tar.gz", "tgz"):
        with tarfile.open(filepath) as tar:
            tar.extractall(path=archive_content_path)
        archive_content_path = find_archive_content_path(archive_content_path)
    elif ext == "zip":
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(archive_content_path)
        archive_content_path = find_archive_content_path(archive_content_path)
    return archive_content_path
193 | |
194 | |
def move_unite_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir):
    """Move UNITE reference files into place and register them.

    UNITE archives are flat: files whose name contains "refs" go to the
    rep_set table/directory, files containing "taxonomy" go to the
    taxonomy table/directory; anything else is ignored.

    Arguments:
        archive_content_path: directory holding the extracted archive
        filename_prefix: prefix for the target file names
        name_prefix: prefix for the display names in the data table
        data_tables: dictionary of data table entries to update
        target_dir: base directory receiving the files

    """
    for content in os.listdir(archive_content_path):
        content_filepath = os.path.join(archive_content_path, content)
        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
        content_filename_prefix = "%s_%s" % (filename_prefix, content)
        # "x in s" replaces the non-idiomatic s.find(x) != -1 checks.
        if "refs" in content:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "rep_set"),
                "rep_set")
        elif "taxonomy" in content:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "taxonomy"),
                "taxonomy")
220 | |
221 | |
def move_file(input_filepath, filename, name, data_tables, target_dir, filetype):
    """Move one file into *target_dir* and register it in its data table.

    Uses shutil.move rather than os.rename so the move also works when
    the extraction directory and the Galaxy data directory live on
    different filesystems (os.rename raises OSError/EXDEV there).

    Arguments:
        input_filepath: current path of the file
        filename: target file name (also used as the entry's dbkey)
        name: display name for the data table entry
        data_tables: dictionary of data table entries to update
        target_dir: destination directory (must exist)
        filetype: table suffix, e.g. "rep_set" -> table "qiime_rep_set"

    """
    output_filepath = os.path.join(target_dir, filename)
    shutil.move(input_filepath, output_filepath)
    add_data_table_entry(
        data_tables,
        "qiime_%s" % (filetype),
        dict(
            dbkey=filename,
            value="1.0",
            name=name,
            path=output_filepath))
235 | |
236 | |
def move_dir_content(input_path, filename_prefix, name_prefix, data_tables, target_dir, filetype):
    """Recursively move every visible file under *input_path*.

    Hidden entries (leading ".") are skipped. Each file is moved into
    *target_dir* via move_file with prefixes accumulated along the
    directory path.

    """
    for entry in os.listdir(input_path):
        # Skip hidden entries such as .DS_Store.
        if entry.startswith("."):
            continue
        entry_path = os.path.join(input_path, entry)
        entry_name_prefix = "%s - %s" % (name_prefix, entry.split(".")[0])
        entry_filename_prefix = "%s_%s" % (filename_prefix, entry)
        # Recurse into subdirectories; move plain files directly.
        handler = move_dir_content if os.path.isdir(entry_path) else move_file
        handler(
            entry_path,
            entry_filename_prefix,
            entry_name_prefix,
            data_tables,
            target_dir,
            filetype)
262 | |
263 | |
def move_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir, db, version):
    """Move each per-filetype directory of a greengenes/silva archive.

    Iterates the global 'filetypes' list and moves the matching
    subdirectory of the extracted archive into the corresponding
    subdirectory of *target_dir*, registering every file on the way.

    """
    for filetype in filetypes:
        # greengenes 12_10 ships no aligned representative set.
        if (filetype == "rep_set_aligned"
                and db == "greengenes"
                and version == "12_10"):
            continue
        move_dir_content(
            os.path.join(archive_content_path, filetype),
            filename_prefix,
            name_prefix,
            data_tables,
            os.path.join(target_dir, filetype),
            filetype)
284 | |
285 | |
def download_db(data_tables, db, version, target_dir):
    """Download a QIIME reference database and register its files.

    Downloads the archive for *db*/*version*, extracts it, and moves
    its files into *target_dir*, adding one entry per file to the
    relevant qiime_* data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
        data_tables: a dictionary containing the data table info
        db: name of the database ("unite", "greengenes", "silva", "img")
        version: version of the database
        target_dir: directory to put copy or link to the data file

    """
    ext = extension[db]
    if db == "silva":
        # silva's archive format differs between releases
        ext = ext[version]

    print("Download archive")
    filepath = download_archive(db, version, ext)

    print("Extract archive %s" % filepath)
    archive_content_path = extract_archive(filepath, ext, db)

    print("Moving file from %s" % archive_content_path)
    filename_prefix = "%s_%s" % (db, version)
    name_prefix = "%s (%s)" % (db, version)
    if db == "greengenes" or db == "silva":
        move_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir,
            db,
            version)
    elif db == "unite":
        move_unite_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir)
    # NOTE(review): "img" has protocol/URL entries above but no branch
    # here — img archives are downloaded and extracted yet never
    # registered in a data table; confirm whether this is intentional.
333 | |
334 | |
if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(
        description='Download QIIME reference database')
    parser.add_argument('--database', help="Database name")
    parser.add_argument('--version', help="Database version")
    parser.add_argument('--jsonfile', help="Output JSON file")
    args = parser.parse_args()

    jsonfile = args.jsonfile

    # Read the input JSON written by Galaxy: tool parameters plus the
    # extra_files_path directory we must populate
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory with one subdirectory per file type
    # (os.mkdir raises if target_dir already exists)
    print("Making %s" % target_dir)
    os.mkdir(target_dir)
    os.mkdir(os.path.join(target_dir, "rep_set"))
    os.mkdir(os.path.join(target_dir, "rep_set_aligned"))
    os.mkdir(os.path.join(target_dir, "taxonomy"))
    os.mkdir(os.path.join(target_dir, "trees"))

    # Set up data tables dictionary, one table per QIIME file type
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "qiime_rep_set")
    add_data_table(data_tables, "qiime_rep_set_aligned")
    add_data_table(data_tables, "qiime_taxonomy")
    add_data_table(data_tables, "qiime_trees")

    # Fetch data from specified data sources
    download_db(
        data_tables,
        args.database,
        args.version,
        target_dir)

    # Write the collected data table entries back over the input JSON
    # file so the data manager framework can pick them up
    print("Outputting JSON")
    print(str(json.dumps(data_tables)))
    with open(jsonfile, 'w') as out:
        json.dump(data_tables, out)
    print("Done.")