comparison data_manager/fetch_mothur_reference_data.py @ 0:ab7a7e798c34 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit c1b936b54b7133106b3181df1e104986613a5bea
author iuc
date Mon, 06 Nov 2017 06:21:50 -0500
parents
children aec831b54a5b
#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
import json
import optparse
import os
import shutil
import sys
import tarfile
import tempfile
import urllib2
import zipfile

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.', '__MACOSX/', '__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = {".map": "map",
                     ".fasta": "aligndb",
                     ".align": "aligndb",
                     ".pat": "lookup",
                     ".tax": "taxonomy"}

# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip", ]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip", ]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip", ]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v16": {
        "16S rRNA RDP training set 16":
        ["https://mothur.org/w/images/d/dc/Trainset16_022016.rdp.tgz", ],
        "16S rRNA PDS training set 16":
        ["https://mothur.org/w/images/c/c3/Trainset16_022016.pds.tgz", ],
    },
    "RDP_v14": {
        "16S rRNA RDP training set 14":
        ["https://mothur.org/w/images/6/6c/Trainset14_032015.rdp.tgz", ],
        "16S rRNA PDS training set 14":
        ["https://mothur.org/w/images/8/88/Trainset14_032015.pds.tgz", ],
    },
    "RDP_v10": {
        "16S rRNA RDP training set 10":
        ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz", ],
        "16S rRNA PDS training set 10":
        ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz", ],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
        ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip", ],
        "16S rRNA PDS training set 9":
        ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip", ],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
        ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip", ],
        "16S rRNA PDS training set 7":
        ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip", ],
69 "8S rRNA Fungi training set 7":
70 ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip", ],
71 },
72 "RDP_v6": {
73 "RDP training set 6":
74 ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip", ],
75 },
76 # Silva reference files
77 # http://www.mothur.org/wiki/Silva_reference_files
78 "silva_release_128": {
79 "SILVA release 128":
80 ["https://mothur.org/w/images/b/b4/Silva.nr_v128.tgz",
81 "https://mothur.org/w/images/a/a4/Silva.seed_v128.tgz", ],
82 },
83 "silva_release_123": {
84 "SILVA release 123":
85 ["https://mothur.org/w/images/b/be/Silva.nr_v123.tgz",
86 "https://mothur.org/w/images/1/15/Silva.seed_v123.tgz", ],
87 },
88 "silva_release_119": {
89 "SILVA release 119":
90 ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
91 "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz", ],
92 },
93 "silva_release_102": {
94 "SILVA release 102":
95 ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
96 "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
97 "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip", ],
98 },
99 "silva_gold_bacteria": {
100 "SILVA gold":
101 ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip", ],
102 },
103 # Greengenes
104 # http://www.mothur.org/wiki/Greengenes-formatted_databases
105 "greengenes_August2013": {
106 "Greengenes August 2013":
107 ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
108 "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz", ],
109 },
110 "greengenes_May2013": {
111 "Greengenes May 2013":
112 ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
113 "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz", ],
114 },
115 "greengenes_old": {
116 "Greengenes pre-May 2013":
117 ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
118 "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz", ],
119 },
120 "greengenes_gold_alignment": {
121 "Greengenes gold alignment":
122 ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip", ],
123 },
124 # Secondary structure maps
125 # http://www.mothur.org/wiki/Secondary_structure_map
126 "secondary_structure_maps_silva": {
127 "SILVA":
128 ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip", ],
129 },
130 "secondary_structure_maps_greengenes": {
131 "Greengenes":
132 ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip", ],
133 },
134 # Lane masks: not used here?
135 "lane_masks": {
136 "Greengenes-compatible":
137 ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
138 "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
139 "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter", ],
140 "SILVA-compatible":
141 ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter", ]
142 },
143 }
144
145
# Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    params = json.loads(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

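# For reference, the input JSON handed to this script by Galaxy contains (at
# least) the two fields read above; a minimal illustrative example, with
# made-up values, would look like:
#
# {
#     "param_dict": {"data_source": "mothur_website", "datasets": "RDP_v16"},
#     "output_data": [{"extra_files_path": "/path/to/extra_files"}]
# }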

# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print str(json.dumps(d))
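# which, for the example entries above, prints JSON along the lines of:
# {"data_tables": {"my_data": [{"dbkey": "hg19", "value": "human"},
#                              {"dbkey": "mm9", "value": "mouse"}]}}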
def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d


def add_data_table(d, table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)


# Utility functions for downloading and unpacking archive files
def download_file(url, target=None, wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    print "Saving to %s" % target
    open(target, 'wb').write(urllib2.urlopen(url).read())
    return target


def unpack_zip_archive(filen, wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
257 print "%s: not ZIP formatted file"
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd, name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target, 'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list


def unpack_tar_archive(filen, wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if not tarfile.is_tarfile(filen):
308 print "%s: not TAR file"
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name, wd)
        if wd:
            target = os.path.join(wd, name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list


def unpack_archive(filen, wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen, wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen, wd=wd)
    else:
        return [filen]

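# Note that anything without a ".zip" or ".tgz" extension (for example the
# plain ".filter" lane mask files listed above) falls through the final
# branch of unpack_archive and is returned unchanged.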

def fetch_files(urls, wd=None, files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url, wd=wd)
        files.extend(unpack_archive(filen, wd=wd))
    return files

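# Example usage (illustrative): download and unpack one of the reference
# archives defined above into the current directory and collect the
# extracted file names:
# >>> fetch_files(MOTHUR_REFERENCE_DATA["silva_gold_bacteria"]["SILVA gold"])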

# Utility functions specific to the Mothur reference data
def identify_type(filen):
    """Return the data table name based on the file name

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        print "WARNING: unknown file type for " + filen + ", skipping"
        return None

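# For example (illustrative file name):
# >>> identify_type("silva.gold.bacteria.fasta")
# 'aligndb'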

def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    # type_ = identify_type(filen)
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.', '_'):
        name = name.replace(delim, ' ')
    return name

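# For example (illustrative file name):
# >>> get_name("/tmp/Trainset16_022016.pds.tax")
# 'Trainset16 022016 pds'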

def fetch_from_mothur_website(data_tables, target_dir, datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    move them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary
    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
                print "%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir, ref_data_file)
                    print "Moving %s to %s" % (f, f1)
                    os.rename(f, f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

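# As an illustrative sketch (actual file names depend on the archive
# contents): fetching the "RDP_v16" dataset downloads the RDP/PDS training
# set tarballs, and each extracted ".tax" file becomes an entry in the
# "mothur_taxonomy" table, with the file name as the value and
# "<name without extension> (16S rRNA PDS training set 16)" as the
# display name.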

def files_from_filesystem_paths(paths):
    """Return a list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path, f), )))
        else:
            print "Not a file or directory, ignored"
    return files


def import_from_server(data_tables, target_dir, paths, description, link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy of (or link to) the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir, ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f, target_file)
        else:
            shutil.copyfile(f, target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))

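# Example usage (illustrative paths and description):
# >>> import_from_server(data_tables, '/galaxy/tool-data/mothur',
# ...                    ['/data/refs/silva.bacteria.fasta'],
# ...                    'SILVA bacteria', link_to_data=True)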

if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source', action='store', dest='data_source')
    parser.add_option('--datasets', action='store', dest='datasets', default='')
    parser.add_option('--paths', action='store', dest='paths', default='')
    parser.add_option('--description', action='store', dest='description', default='')
    parser.add_option('--link', action='store_true', dest='link_to_data')
    options, args = parser.parse_args()
    print "options: %s" % options
    print "args   : %s" % args

    # Check for JSON file
    if len(args) != 1:
        sys.stderr.write("Need to supply JSON file name\n")
        sys.exit(1)

    jsonfile = args[0]

    # Read the input JSON
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, 'mothur_lookup')
    add_data_table(data_tables, 'mothur_aligndb')
    add_data_table(data_tables, 'mothur_map')
    add_data_table(data_tables, 'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables, target_dir, datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
        import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data)
    # Write output JSON
    print "Outputting JSON"
    print str(json.dumps(data_tables))
    open(jsonfile, 'wb').write(json.dumps(data_tables))
    print "Done."
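
# Example invocation (illustrative; in practice Galaxy generates the JSON
# parameter file and runs the script itself):
#
#   python fetch_mothur_reference_data.py --source mothur_website \
#       --datasets lookup_titanium,RDP_v16 params.json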