data_manager/fetch_mothur_reference_data.py @ 0:ab7a7e798c34 (draft)

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit c1b936b54b7133106b3181df1e104986613a5bea

author:   iuc
date:     Mon, 06 Nov 2017 06:21:50 -0500
parents:
children: aec831b54a5b
#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
import json
import optparse
import os
import shutil
import sys
import tarfile
import tempfile
import urllib2
import zipfile

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.', '__MACOSX/', '__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = {".map": "map",
                     ".fasta": "aligndb",
                     ".align": "aligndb",
                     ".pat": "lookup",
                     ".tax": "taxonomy"}

# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip", ]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip", ]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip", ]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v16": {
        "16S rRNA RDP training set 16":
            ["https://mothur.org/w/images/d/dc/Trainset16_022016.rdp.tgz", ],
        "16S rRNA PDS training set 16":
            ["https://mothur.org/w/images/c/c3/Trainset16_022016.pds.tgz", ],
    },
    "RDP_v14": {
        "16S rRNA RDP training set 14":
            ["https://mothur.org/w/images/6/6c/Trainset14_032015.rdp.tgz", ],
        "16S rRNA PDS training set 14":
            ["https://mothur.org/w/images/8/88/Trainset14_032015.pds.tgz", ],
    },
    "RDP_v10": {
        "16S rRNA RDP training set 10":
            ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz", ],
        "16S rRNA PDS training set 10":
            ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz", ],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
            ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip", ],
        "16S rRNA PDS training set 9":
            ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip", ],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
            ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip", ],
        "16S rRNA PDS training set 7":
            ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip", ],
        "8S rRNA Fungi training set 7":
            ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip", ],
    },
    "RDP_v6": {
        "RDP training set 6":
            ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip", ],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_128": {
        "SILVA release 128":
            ["https://mothur.org/w/images/b/b4/Silva.nr_v128.tgz",
             "https://mothur.org/w/images/a/a4/Silva.seed_v128.tgz", ],
    },
    "silva_release_123": {
        "SILVA release 123":
            ["https://mothur.org/w/images/b/be/Silva.nr_v123.tgz",
             "https://mothur.org/w/images/1/15/Silva.seed_v123.tgz", ],
    },
    "silva_release_119": {
        "SILVA release 119":
            ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
             "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz", ],
    },
    "silva_release_102": {
        "SILVA release 102":
            ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
             "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
             "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip", ],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
            ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip", ],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
            ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
             "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz", ],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
            ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
             "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz", ],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
            ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
             "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz", ],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
            ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip", ],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
            ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip", ],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
            ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip", ],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
            ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
             "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
             "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter", ],
        "SILVA-compatible":
            ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter", ]
    },
}

# Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    params = json.loads(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

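# For reference, the input JSON read above roughly has this shape (a minimal
# sketch; the values are hypothetical, only 'param_dict' and
# 'output_data'[0]['extra_files_path'] are used by this script):
#
# {
#     "param_dict": {"data_source": "mothur_website", "...": "..."},
#     "output_data": [{"extra_files_path": "/path/to/extra_files"}]
# }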

# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='mm9', value='mouse'))
# >>> print str(json.dumps(d))
def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d


def add_data_table(d, table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)


# Utility functions for downloading and unpacking archive files
def download_file(url, target=None, wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    print "Saving to %s" % target
    open(target, 'wb').write(urllib2.urlopen(url).read())
    return target

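# Usage sketch (the working directory is hypothetical; the URL is one of the
# entries in MOTHUR_REFERENCE_DATA above):
#   download_file("http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip", wd="/tmp/work")
# saves the archive as "/tmp/work/LookUp_GS20.zip" and returns that path.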

def unpack_zip_archive(filen, wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
        print "%s: not ZIP formatted file" % filen
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd, name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target, 'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list


def unpack_tar_archive(filen, wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if not tarfile.is_tarfile(filen):
        print "%s: not TAR file" % filen
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name, wd)
        if wd:
            target = os.path.join(wd, name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list


def unpack_archive(filen, wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen, wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen, wd=wd)
    else:
        return [filen]

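# Usage sketch (file name hypothetical): unpack_archive("refs.zip", wd="/tmp/work")
# dispatches on the extension, so .zip and .tgz archives are unpacked and the
# extracted paths returned, while any other file comes back as a one-item list.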

def fetch_files(urls, wd=None, files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url, wd=wd)
        files.extend(unpack_archive(filen, wd=wd))
    return files

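# Usage sketch: pass one of the URL lists defined in MOTHUR_REFERENCE_DATA,
# e.g. fetch_files(MOTHUR_REFERENCE_DATA["lookup_gs20"]["GS20"], wd="/tmp/work")
# (working directory hypothetical) downloads the archive(s) and returns the
# paths of all files extracted from them.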

# Utility functions specific to the Mothur reference data
def identify_type(filen):
    """Return the data table name based on the file name

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        print "WARNING: unknown file type for " + filen + ", skipping"
        return None


def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    # type_ = identify_type(filen)
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.', '_'):
        name = name.replace(delim, ' ')
    return name

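# Worked examples (file names made up, but matching the archives above):
#   identify_type("silva.gold.align")           -> "aligndb"
#   identify_type("trainset16_022016.pds.tax")  -> "taxonomy"
#   get_name("trainset16_022016.pds.tax")       -> "trainset16 022016 pds"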

def fetch_from_mothur_website(data_tables, target_dir, datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    copy them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary
    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
                print "%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir, ref_data_file)
                    print "Moving %s to %s" % (f, f1)
                    os.rename(f, f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

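# Usage sketch (target directory hypothetical; the dataset names must be keys
# of MOTHUR_REFERENCE_DATA):
#   fetch_from_mothur_website(data_tables, "/galaxy/tool-data/mothur",
#                             ["lookup_gs20", "RDP_v16"])
# downloads both datasets, moves the recognised files into the target directory
# and records each one in the matching 'mothur_*' data table.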

def files_from_filesystem_paths(paths):
    """Return list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    from under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path, f), )))
        else:
            print "Not a file or directory, ignored"
    return files


def import_from_server(data_tables, target_dir, paths, description, link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory in which to put the copy of or link to the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir, ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f, target_file)
        else:
            shutil.copyfile(f, target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))

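# Usage sketch (paths and description hypothetical):
#   import_from_server(data_tables, "/galaxy/tool-data/mothur",
#                      ["/data/refs/custom.align"], "site-specific alignment",
#                      link_to_data=True)
# symlinks the file into the target directory and adds it to 'mothur_aligndb'.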


if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source', action='store', dest='data_source')
    parser.add_option('--datasets', action='store', dest='datasets', default='')
    parser.add_option('--paths', action='store', dest='paths', default=[])
    parser.add_option('--description', action='store', dest='description', default='')
    parser.add_option('--link', action='store_true', dest='link_to_data')
    options, args = parser.parse_args()
    print "options: %s" % options
    print "args : %s" % args

    # Check for JSON file
    if len(args) != 1:
        sys.stderr.write("Need to supply JSON file name")
        sys.exit(1)

    jsonfile = args[0]

    # Read the input JSON
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, 'mothur_lookup')
    add_data_table(data_tables, 'mothur_aligndb')
    add_data_table(data_tables, 'mothur_map')
    add_data_table(data_tables, 'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables, target_dir, datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
        import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data)
    # Write output JSON
    print "Outputting JSON"
    print str(json.dumps(data_tables))
    open(jsonfile, 'wb').write(json.dumps(data_tables))
    print "Done."
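# Example invocations (a sketch; the JSON file is normally generated by Galaxy
# and the dataset names must be keys of MOTHUR_REFERENCE_DATA):
#   python fetch_mothur_reference_data.py --source mothur_website \
#       --datasets lookup_titanium,RDP_v16 galaxy_input.json
#   python fetch_mothur_reference_data.py --source filesystem_paths \
#       --paths /data/refs --description "local refs" --link galaxy_input.json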