data_manager/data_manager_qiime_download.py @ 0:f8608fddfb23 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_qiime_database_downloader commit 4934eb34300b5fa54d62d8b67e5b6e989e963ac9
author iuc
date Mon, 15 May 2017 11:08:43 -0400
parents
children 9e86c09a6cae
#!/usr/bin/env python
# Data manager for reference data for the QIIME Galaxy tools

import argparse
import ftplib
import json
import os
import tarfile
import zipfile

import requests


protocol = {
    "unite": "http",
    "greengenes": "ftp",
    "silva": "http",
    "img": "ftp"
}
baseUrl = {
    "unite": "http://unite.ut.ee/sh_files/sh_qiime_release_",
    "greengenes": "greengenes.microbio.me",
    "silva": "http://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_",
    "img": "ftp.microbio.me"
}
ftp_dir = {
    "greengenes": "/greengenes_release/gg_",
    "img": ""
}
ftp_file_prefix = {
    "greengenes": "gg_",
    "img": ""
}
ftp_file_suffix = {
    "greengenes": "_otus",
    "img": ""
}
extension = {
    "unite": "zip",
    "greengenes": "tar.gz",
    "silva": {
        "104_release": "tgz",
        "108_release": "tgz",
        "108_release_curated": "tgz",
        "111_release": "tgz",
        "119_consensus_majority_taxonomy": "zip",
        "119_release": "zip",
        "119_release_aligned_rep_files": "tar.gz",
        "123_release": "zip",
        "128_release": "tgz"},
    "img": "tgz"
}
filetypes = ["rep_set", "rep_set_aligned", "taxonomy", "trees"]


# Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    with open(jsonfile) as fd:
        params = json.load(fd)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
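# Illustrative sketch of the incoming JSON (keys as per the docstring above;
# the concrete values are hypothetical):
# {
#     "param_dict": {"database": "greengenes", "version": "13_8", ...},
#     "output_data": [{"extra_files_path": "/galaxy/jobs/000/42/extra_files"}]
# }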


# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d, 'my_data')
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='mm9', value='mouse'))
# >>> print(json.dumps(d))
def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d


def add_data_table(d, table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
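# After the calls in the usage example above, 'd' would serialize to:
# {"data_tables": {"my_data": [{"dbkey": "hg19", "value": "human"},
#                              {"dbkey": "mm9", "value": "mouse"}]}}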


def get_ftp_file(ftp, filename):
    """Retrieve 'filename' over the open FTP connection into a local file
    of the same name
    """
    try:
        with open(filename, 'wb') as fd:
            ftp.retrbinary("RETR " + filename, fd.write)
    except ftplib.all_errors as e:
        print("Error retrieving %s: %s" % (filename, e))


def download_archive(db, version, ext):
    """Download the archive for version 'version' of database 'db'

    Returns the path to the downloaded file.

    """
    filepath = "%s_%s.%s" % (db, version, ext)
    if protocol[db] == "http":
        url = "%s%s.%s" % (baseUrl[db], version, ext)
        r = requests.get(url, stream=True)
        r.raise_for_status()
        with open(filepath, "wb") as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
    elif protocol[db] == "ftp":
        ftp = ftplib.FTP(baseUrl[db])
        ftp.login("anonymous", "ftplib-example-1")
        if db == "greengenes" and version == "13_8":
            # the 13_8 release is hosted in the gg_13_5 directory
            # on the greengenes FTP server
            ftp.cwd("%s%s" % (ftp_dir[db], "13_5"))
        else:
            ftp.cwd("%s%s" % (ftp_dir[db], version))
        filepath = "%s%s%s.%s" % (
            ftp_file_prefix[db],
            version,
            ftp_file_suffix[db],
            ext)
        get_ftp_file(ftp, filepath)
        ftp.quit()
    return filepath
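# For example (values derived from the lookup tables above):
# - download_archive("silva", "128_release", "tgz") fetches
#   http://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz
# - download_archive("greengenes", "13_8", "tar.gz") logs in to
#   greengenes.microbio.me, changes to /greengenes_release/gg_13_5
#   and retrieves gg_13_8_otus.tar.gz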


def find_archive_content_path(archive_content_path):
    """Return the path to the real archive content

    If the archive unpacked into a single (non-hidden) file or
    directory, descend into it; otherwise return the path unchanged.

    """
    content = os.listdir(archive_content_path)
    archive_content = []
    for x in content:
        if not x.startswith(".") and not x.startswith("_"):
            archive_content.append(x)
    if len(archive_content) == 1:
        archive_content_path = os.path.join(
            archive_content_path,
            archive_content[0])
    return archive_content_path


def extract_archive(filepath, ext, db):
    """Extract an archive into a 'tmp' directory and return the path
    to its content
    """
    archive_content_path = "tmp"
    if ext == "tar.gz" or ext == "tgz":
        tar = tarfile.open(filepath)
        tar.extractall(path=archive_content_path)
        tar.close()
        archive_content_path = find_archive_content_path(archive_content_path)
    elif ext == "zip":
        zip_ref = zipfile.ZipFile(filepath, 'r')
        zip_ref.extractall(archive_content_path)
        zip_ref.close()
        archive_content_path = find_archive_content_path(archive_content_path)
    return archive_content_path
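# These archives typically unpack to a single top-level directory, so the
# returned path would be e.g. "tmp/gg_13_8_otus" (the directory name here is
# assumed from the archive naming scheme, not guaranteed by the code).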


def move_unite_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir):
    """Dispatch the files of a UNITE archive

    Files with 'refs' in their name are moved to the 'rep_set'
    subdirectory, files with 'taxonomy' in their name to the
    'taxonomy' subdirectory.

    """
    archive_content = os.listdir(archive_content_path)
    for content in archive_content:
        content_filepath = os.path.join(archive_content_path, content)
        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
        content_filename_prefix = "%s_%s" % (filename_prefix, content)
        if content.find("refs") != -1:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "rep_set"),
                "rep_set")
        elif content.find("taxonomy") != -1:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "taxonomy"),
                "taxonomy")


def move_file(input_filepath, filename, name, data_tables, target_dir, filetype):
    """Move a file to 'target_dir' and register it in the
    'qiime_<filetype>' data table
    """
    output_filepath = os.path.join(target_dir, filename)
    os.rename(input_filepath, output_filepath)
    add_data_table_entry(
        data_tables,
        "qiime_%s" % (filetype),
        dict(
            dbkey=filename,
            value="1.0",
            name=name,
            path=output_filepath))
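# A single entry produced by move_file looks like this (the filename
# "97_otus.fasta" is hypothetical; prefixes follow download_db below):
# {"dbkey": "greengenes_13_8_97_otus.fasta", "value": "1.0",
#  "name": "greengenes (13_8) - 97_otus",
#  "path": "<target_dir>/rep_set/greengenes_13_8_97_otus.fasta"}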


def move_dir_content(input_path, filename_prefix, name_prefix, data_tables, target_dir, filetype):
    """Recursively move the content of 'input_path' into 'target_dir',
    registering each file in the matching data table
    """
    for content in os.listdir(input_path):
        if content.startswith("."):
            continue
        content_path = os.path.join(input_path, content)
        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
        content_filename_prefix = "%s_%s" % (filename_prefix, content)
        if os.path.isdir(content_path):
            move_dir_content(
                content_path,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                target_dir,
                filetype)
        else:
            move_file(
                content_path,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                target_dir,
                filetype)


def move_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir, db, version):
    """Move the files of each QIIME file type into the corresponding
    subdirectory of 'target_dir'
    """
    for filetype in filetypes:
        if filetype == "rep_set_aligned":
            # the greengenes 12_10 release ships no aligned rep set
            if db == "greengenes" and version == "12_10":
                continue
        filetype_target_dir = os.path.join(
            target_dir,
            filetype)
        filetype_path = os.path.join(
            archive_content_path,
            filetype)
        move_dir_content(
            filetype_path,
            filename_prefix,
            name_prefix,
            data_tables,
            filetype_target_dir,
            filetype)
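# move_files assumes the extracted archive mirrors the QIIME layout, with
# one subdirectory per file type:
#   <archive>/rep_set/  <archive>/rep_set_aligned/
#   <archive>/taxonomy/  <archive>/trees/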


def download_db(data_tables, db, version, target_dir):
    """Download a QIIME reference database

    Creates references to the downloaded file(s) on the Galaxy
    server in the appropriate data tables (determined from the
    file type).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      db: name of the database
      version: version of the database
      target_dir: directory to put the copy of (or link to) the data files

    """
    ext = extension[db]
    if db == "silva":
        ext = ext[version]

    print("Download archive")
    filepath = download_archive(db, version, ext)

    print("Extract archive %s" % filepath)
    archive_content_path = extract_archive(filepath, ext, db)

    print("Moving files from %s" % archive_content_path)
    filename_prefix = "%s_%s" % (db, version)
    name_prefix = "%s (%s)" % (db, version)
    if db == "greengenes" or db == "silva":
        move_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir,
            db,
            version)
    elif db == "unite":
        move_unite_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir)


if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(
        description='Download QIIME reference database')
    parser.add_argument('--database', help="Database name")
    parser.add_argument('--version', help="Database version")
    parser.add_argument('--jsonfile', help="Output JSON file")
    args = parser.parse_args()

    jsonfile = args.jsonfile

    # Read the input JSON
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory
    print("Making %s" % target_dir)
    os.mkdir(target_dir)
    os.mkdir(os.path.join(target_dir, "rep_set"))
    os.mkdir(os.path.join(target_dir, "rep_set_aligned"))
    os.mkdir(os.path.join(target_dir, "taxonomy"))
    os.mkdir(os.path.join(target_dir, "trees"))

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "qiime_rep_set")
    add_data_table(data_tables, "qiime_rep_set_aligned")
    add_data_table(data_tables, "qiime_taxonomy")
    add_data_table(data_tables, "qiime_trees")

    # Fetch data from specified data sources
    download_db(
        data_tables,
        args.database,
        args.version,
        target_dir)

    # Write output JSON
    print("Outputting JSON")
    print(json.dumps(data_tables))
    with open(jsonfile, 'w') as out:
        json.dump(data_tables, out)
    print("Done.")