comparison data_manager/fetch_mothur_reference_data.py @ 4:0db22932bc39 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
author | iuc
---|---
date | Fri, 25 Jun 2021 09:37:05 +0000
parents | 9d09724f2bf1
children | (none)
3:9d09724f2bf1 | 4:0db22932bc39 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
2 # | 2 # |
3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools | 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools |
4 import io | |
4 import json | 5 import json |
5 import optparse | 6 import optparse |
6 import os | 7 import os |
7 import shutil | 8 import shutil |
8 import sys | 9 import sys |
9 import tarfile | 10 import tarfile |
10 import tempfile | 11 import tempfile |
11 import urllib2 | 12 import urllib.error |
13 import urllib.parse | |
14 import urllib.request | |
12 import zipfile | 15 import zipfile |
13 from functools import reduce | 16 from functools import reduce |
14 | 17 |
15 # When extracting files from archives, skip names that | 18 # When extracting files from archives, skip names that |
16 # start with the following strings | 19 # start with the following strings |
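The reduce-based prefix test in `unpack_zip_archive` and `unpack_tar_archive` below checks each member name against these prefixes. An equivalent and arguably clearer spelling uses `any()`, which short-circuits on the first match; a minimal sketch of the equivalence (the `IGNORE_PATHS` value shown is illustrative, since the real tuple sits outside these hunks):

```python
# Illustrative prefixes; the script's actual IGNORE_PATHS is defined
# near this comment but is not shown in the diff.
IGNORE_PATHS = ("__MACOSX/", "._")

def is_ignored(name):
    # Same result as: reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False)
    return any(name.startswith(prefix) for prefix in IGNORE_PATHS)

assert is_ignored("__MACOSX/silva.nr_v138_1.align")
assert not is_ignored("silva.nr_v138_1.align")
```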
36 "lookup_gs20": { | 39 "lookup_gs20": { |
37 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] | 40 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] |
38 }, | 41 }, |
39 # RDP reference files | 42 # RDP reference files |
40 # http://www.mothur.org/wiki/RDP_reference_files | 43 # http://www.mothur.org/wiki/RDP_reference_files |
44 "RDP_v18": { | |
45 "16S rRNA RDP training set 18": | |
46 [ | |
47 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ], | |
48 "16S rRNA PDS training set 18": | |
49 [ | |
50 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ], | |
51 }, | |
41 "RDP_v16": { | 52 "RDP_v16": { |
42 "16S rRNA RDP training set 16": | 53 "16S rRNA RDP training set 16": |
43 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], | 54 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], |
44 "16S rRNA PDS training set 16": | 55 "16S rRNA PDS training set 16": |
45 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], | 56 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], |
74 "RDP training set 6": | 85 "RDP training set 6": |
75 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], | 86 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], |
76 }, | 87 }, |
77 # Silva reference files | 88 # Silva reference files |
78 # http://www.mothur.org/wiki/Silva_reference_files | 89 # http://www.mothur.org/wiki/Silva_reference_files |
90 "silva_release_138.1": { | |
91 "SILVA release 138.1": | |
92 [ | |
93 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz", | |
94 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ], | |
95 }, | |
79 "silva_release_128": { | 96 "silva_release_128": { |
80 "SILVA release 128": | 97 "SILVA release 128": |
81 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", | 98 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", |
82 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], | 99 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], |
83 }, | 100 }, |
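The hunks above reveal the shape of `MOTHUR_REFERENCE_DATA`: dataset identifier → display name → list of download URLs, with the new revision adding the RDP v18 and SILVA 138.1 entries. A minimal sketch of iterating that structure, using only data visible in the hunks:

```python
# Shape taken from the hunks above: dataset id -> display name -> URL list.
MOTHUR_REFERENCE_DATA = {
    "silva_release_138.1": {
        "SILVA release 138.1": [
            "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
            "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz",
        ],
    },
}

for dataset, entries in MOTHUR_REFERENCE_DATA.items():
    for display_name, urls in entries.items():
        for url in urls:
            print(f"{dataset}: {display_name} <- {url}")
```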
228 system. | 245 system. |
229 | 246 |
230 Returns the name that the file is saved with. | 247 Returns the name that the file is saved with. |
231 | 248 |
232 """ | 249 """ |
233 print("Downloading %s" % url) | 250 print(f"Downloading {url}") |
234 if not target: | 251 if not target: |
235 target = os.path.basename(url) | 252 target = os.path.basename(url) |
236 if wd: | 253 if wd: |
237 target = os.path.join(wd, target) | 254 target = os.path.join(wd, target) |
238 print("Saving to %s" % target) | 255 print(f"Saving to {target}") |
239 with open(target, 'wb') as fh: | 256 with open(target, 'wb') as fh: |
240 fh.write(urllib2.urlopen(url).read()) | 257 url_h = urllib.request.urlopen(url) |
258 while True: | |
259 buffer = url_h.read(io.DEFAULT_BUFFER_SIZE) | |
260 if buffer == b"": | |
261 break | |
262 fh.write(buffer) | |
241 return target | 263 return target |
242 | 264 |
243 | 265 |
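The rewritten download loop streams the response in `io.DEFAULT_BUFFER_SIZE` chunks instead of slurping the whole body with a single `.read()`, which keeps memory use flat for the multi-hundred-megabyte SILVA archives. `shutil.copyfileobj` performs the same chunked copy in one call, and the `with` statement also closes the response, which the commit's version leaves open; a sketch of that alternative (equivalent behaviour, not what the commit uses):

```python
import shutil
import urllib.request

def fetch_url(url, target):
    # Stream the response to disk in fixed-size chunks; the payload is
    # never held in memory in one piece, and the response gets closed.
    with urllib.request.urlopen(url) as response, open(target, "wb") as fh:
        shutil.copyfileobj(response, fh)
    return target
```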
244 def unpack_zip_archive(filen, wd=None): | 266 def unpack_zip_archive(filen, wd=None): |
245 """Extract files from a ZIP archive | 267 """Extract files from a ZIP archive |
255 Once all the files are extracted the ZIP archive | 277 Once all the files are extracted the ZIP archive |
256 file is deleted from the file system. | 278 file is deleted from the file system. |
257 | 279 |
258 """ | 280 """ |
259 if not zipfile.is_zipfile(filen): | 281 if not zipfile.is_zipfile(filen): |
260 print("%s: not ZIP formatted file") | 282 print(f"{filen}: not ZIP formatted file") |
261 return [filen] | 283 return [filen] |
262 file_list = [] | 284 file_list = [] |
263 with zipfile.ZipFile(filen) as z: | 285 with zipfile.ZipFile(filen) as z: |
264 for name in z.namelist(): | 286 for name in z.namelist(): |
265 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 287 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
266 print("Ignoring %s" % name) | 288 print(f"Ignoring {name}") |
267 continue | 289 continue |
268 if wd: | 290 if wd: |
269 target = os.path.join(wd, name) | 291 target = os.path.join(wd, name) |
270 else: | 292 else: |
271 target = name | 293 target = name |
272 if name.endswith('/'): | 294 if name.endswith('/'): |
273 # Make directory | 295 # Make directory |
274 print("Creating dir %s" % target) | 296 print(f"Creating dir {target}") |
275 try: | 297 try: |
276 os.makedirs(target) | 298 os.makedirs(target) |
277 except OSError: | 299 except OSError: |
278 pass | 300 pass |
279 else: | 301 else: |
280 # Extract file | 302 # Extract file |
281 print("Extracting %s" % name) | 303 print("Extracting {target}") |
282 try: | 304 try: |
283 os.makedirs(os.path.dirname(target)) | 305 os.makedirs(os.path.dirname(target)) |
284 except OSError: | 306 except OSError: |
285 pass | 307 pass |
286 with open(target, 'wb') as fh: | 308 with open(target, 'wb') as fh: |
287 fh.write(z.read(name)) | 309 fh.write(z.read(name)) |
288 file_list.append(target) | 310 file_list.append(target) |
289 print("Removing %s" % filen) | 311 print(f"Removing {filen}") |
290 os.remove(filen) | 312 os.remove(filen) |
291 return file_list | 313 return file_list |
292 | 314 |
293 | 315 |
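The manual loop above exists so the script can skip `IGNORE_PATHS` members and record exactly which files it wrote. For comparison, `ZipFile.extract` handles the intermediate-directory bookkeeping itself; a hedged sketch of a filtered variant built on it (a hypothetical helper, not the commit's code):

```python
import zipfile

def unpack_zip_filtered(filen, wd, ignore=("__MACOSX/",)):
    # ZipFile.extract() creates parent directories on its own, so the
    # explicit os.makedirs() calls in the loop above become unnecessary.
    extracted = []
    with zipfile.ZipFile(filen) as z:
        for name in z.namelist():
            if any(name.startswith(prefix) for prefix in ignore):
                continue
            extracted.append(z.extract(name, path=wd))
    return extracted
```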
294 def unpack_tar_archive(filen, wd=None): | 316 def unpack_tar_archive(filen, wd=None): |
307 file is deleted from the file system. | 329 file is deleted from the file system. |
308 | 330 |
309 """ | 331 """ |
310 file_list = [] | 332 file_list = [] |
311 if not tarfile.is_tarfile(filen): | 333 if not tarfile.is_tarfile(filen): |
312 print("%s: not TAR file") | 334 print(f"{filen}: not TAR file") |
313 return [filen] | 335 return [filen] |
314 with tarfile.open(filen) as t: | 336 with tarfile.open(filen) as t: |
315 for name in t.getnames(): | 337 for name in t.getnames(): |
316 # Check for unwanted files | 338 # Check for unwanted files |
317 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 339 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
318 print("Ignoring %s" % name) | 340 print(f"Ignoring {name}") |
319 continue | 341 continue |
320 # Extract file | 342 # Extract file |
321 print("Extracting %s" % name) | 343 print(f"Extracting {name}") |
322 t.extract(name, wd) | 344 t.extract(name, wd) |
323 if wd: | 345 if wd: |
324 target = os.path.join(wd, name) | 346 target = os.path.join(wd, name) |
325 else: | 347 else: |
326 target = name | 348 target = name |
327 file_list.append(target) | 349 file_list.append(target) |
328 print("Removing %s" % filen) | 350 print(f"Removing {filen}") |
329 os.remove(filen) | 351 os.remove(filen) |
330 return file_list | 352 return file_list |
331 | 353 |
332 | 354 |
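One caveat with this helper: `tarfile.extract` trusts member paths, so a crafted archive with `../` components or absolute names can write outside `wd` (the long-standing CVE-2007-4559 issue). That is tolerable here because the URLs are pinned to mothur's own S3 bucket, but on Python 3.12+ the `filter` argument closes the hole; a sketch assuming that Python version:

```python
import tarfile

def safe_unpack_tar(filen, wd):
    # filter="data" (Python 3.12+) rejects absolute paths, "../"
    # traversal, and special members such as device files.
    with tarfile.open(filen) as t:
        for name in t.getnames():
            t.extract(name, path=wd, filter="data")
```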
333 def unpack_archive(filen, wd=None): | 355 def unpack_archive(filen, wd=None): |
341 'wd' specifies the working directory to extract | 363 'wd' specifies the working directory to extract |
342 the files to, otherwise they are extracted to the | 364 the files to, otherwise they are extracted to the |
343 current working directory. | 365 current working directory. |
344 | 366 |
345 """ | 367 """ |
346 print("Unpack %s" % filen) | 368 print(f"Unpack {filen}") |
347 ext = os.path.splitext(filen)[1] | 369 ext = os.path.splitext(filen)[1] |
348 print("Extension: %s" % ext) | 370 print(f"Extension: {ext}") |
349 if ext == ".zip": | 371 if ext == ".zip": |
350 return unpack_zip_archive(filen, wd=wd) | 372 return unpack_zip_archive(filen, wd=wd) |
351 elif ext == ".tgz": | 373 elif ext == ".tgz": |
352 return unpack_tar_archive(filen, wd=wd) | 374 return unpack_tar_archive(filen, wd=wd) |
353 else: | 375 else: |
384 """ | 406 """ |
385 ext = os.path.splitext(filen)[1] | 407 ext = os.path.splitext(filen)[1] |
386 try: | 408 try: |
387 return MOTHUR_FILE_TYPES[ext] | 409 return MOTHUR_FILE_TYPES[ext] |
388 except KeyError: | 410 except KeyError: |
389 print("WARNING: unknown file type for " + filen + ", skipping") | 411 print(f"WARNING: unknown file type for {filen}, skipping") |
390 return None | 412 return None |
391 | 413 |
392 | 414 |
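`identify_type` resolves a file extension through the `MOTHUR_FILE_TYPES` dictionary, which is defined earlier in the script and not shown in this diff. The entries below are illustrative assumptions about its shape, not the script's actual table:

```python
# Hypothetical entries for illustration only; the real MOTHUR_FILE_TYPES
# mapping lives outside the hunks shown here.
MOTHUR_FILE_TYPES = {
    ".fasta": "aligndb",
    ".tax": "taxonomy",
    ".pat": "lookup",
}

print(MOTHUR_FILE_TYPES.get(".tax"))  # -> "taxonomy"
print(MOTHUR_FILE_TYPES.get(".xyz"))  # -> None, i.e. skipped with a warning
```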
393 def get_name(filen): | 415 def get_name(filen): |
394 """Generate a descriptive name based on the file name | 416 """Generate a descriptive name based on the file name |
417 datasets: a list of dataset names corresponding to keys in | 439 datasets: a list of dataset names corresponding to keys in |
418 the MOTHUR_REFERENCE_DATA dictionary | 440 the MOTHUR_REFERENCE_DATA dictionary |
419 """ | 441 """ |
420 # Make working dir | 442 # Make working dir |
421 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) | 443 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) |
422 print("Working dir %s" % wd) | 444 print(f"Working dir {wd}") |
423 # Iterate over all requested reference data URLs | 445 # Iterate over all requested reference data URLs |
424 for dataset in datasets: | 446 for dataset in datasets: |
425 print("Handling dataset '%s'" % dataset) | 447 print(f"Handling dataset '{dataset}'") |
426 for name in MOTHUR_REFERENCE_DATA[dataset]: | 448 for name in MOTHUR_REFERENCE_DATA[dataset]: |
427 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): | 449 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): |
428 type_ = identify_type(f) | 450 type_ = identify_type(f) |
429 entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name) | 451 name_from_file = os.path.splitext(os.path.basename(f))[0] |
430 print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))) | 452 entry_name = f"{name_from_file} ({name})" |
453 print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}") | |
431 if type_ is not None: | 454 if type_ is not None: |
432 # Move to target dir | 455 # Move to target dir |
433 ref_data_file = os.path.basename(f) | 456 ref_data_file = os.path.basename(f) |
434 f1 = os.path.join(target_dir, ref_data_file) | 457 f1 = os.path.join(target_dir, ref_data_file) |
435 print("Moving %s to %s" % (f, f1)) | 458 print(f"Moving {f} to {f1}") |
436 os.rename(f, f1) | 459 shutil.move(f, f1) |
437 # Add entry to data table | 460 # Add entry to data table |
438 table_name = "mothur_%s" % type_ | 461 table_name = f"mothur_{type_}" |
439 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 462 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
440 # Remove working dir | 463 # Remove working dir |
441 print("Removing %s" % wd) | 464 print(f"Removing {wd}") |
442 shutil.rmtree(wd) | 465 shutil.rmtree(wd) |
443 | 466 |
444 | 467 |
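The switch from `os.rename` to `shutil.move` in the hunk above is more than cosmetic: `os.rename` raises `OSError` (EXDEV, "invalid cross-device link") when the temporary working directory and `target_dir` live on different filesystems, whereas `shutil.move` falls back to copy-and-delete. A minimal demonstration of the failure mode being avoided (paths are hypothetical):

```python
import os
import shutil

src = "/tmp/work.mothur/silva.nr_v138_1.align"   # hypothetical paths on
dst = "/galaxy/tool-data/silva.nr_v138_1.align"  # different filesystems
try:
    os.rename(src, dst)    # fails with EXDEV across mount points
except OSError:
    shutil.move(src, dst)  # copies, then removes the source
```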
445 def files_from_filesystem_paths(paths): | 468 def files_from_filesystem_paths(paths): |
446 """Return list of file paths from arbitrary input paths | 469 """Return list of file paths from arbitrary input paths |
452 """ | 475 """ |
453 # Collect files to add | 476 # Collect files to add |
454 files = [] | 477 files = [] |
455 for path in paths: | 478 for path in paths: |
456 path = os.path.abspath(path) | 479 path = os.path.abspath(path) |
457 print("Examining '%s'..." % path) | 480 print(f"Examining '{path}'...") |
458 if os.path.isfile(path): | 481 if os.path.isfile(path): |
459 # Store full path for file | 482 # Store full path for file |
460 files.append(path) | 483 files.append(path) |
461 elif os.path.isdir(path): | 484 elif os.path.isdir(path): |
462 # Descend into directory and collect the files | 485 # Descend into directory and collect the files |
491 files = files_from_filesystem_paths(paths) | 514 files = files_from_filesystem_paths(paths) |
492 # Handle each file individually | 515 # Handle each file individually |
493 for f in files: | 516 for f in files: |
494 type_ = identify_type(f) | 517 type_ = identify_type(f) |
495 if type_ is None: | 518 if type_ is None: |
496 print("%s: unrecognised type, skipped" % f) | 519 print(f"{f}: unrecognised type, skipped") |
497 continue | 520 continue |
498 ref_data_file = os.path.basename(f) | 521 ref_data_file = os.path.basename(f) |
499 target_file = os.path.join(target_dir, ref_data_file) | 522 target_file = os.path.join(target_dir, ref_data_file) |
500 entry_name = "%s" % os.path.splitext(ref_data_file)[0] | 523 entry_name = os.path.splitext(ref_data_file)[0] |
501 if description: | 524 if description: |
502 entry_name += " (%s)" % description | 525 entry_name += f" ({description})" |
503 print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)) | 526 print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}") |
504 # Link to or copy the data | 527 # Link to or copy the data |
505 if link_to_data: | 528 if link_to_data: |
506 os.symlink(f, target_file) | 529 os.symlink(f, target_file) |
507 else: | 530 else: |
508 shutil.copyfile(f, target_file) | 531 shutil.copyfile(f, target_file) |
509 # Add entry to data table | 532 # Add entry to data table |
510 table_name = "mothur_%s" % type_ | 533 table_name = f"mothur_{type_}" |
511 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 534 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
512 | 535 |
513 | 536 |
514 if __name__ == "__main__": | 537 if __name__ == "__main__": |
515 print("Starting...") | 538 print("Starting...") |
520 parser.add_option('--datasets', action='store', dest='datasets', default='') | 543 parser.add_option('--datasets', action='store', dest='datasets', default='') |
521 parser.add_option('--paths', action='store', dest='paths', default=[]) | 544 parser.add_option('--paths', action='store', dest='paths', default=[]) |
522 parser.add_option('--description', action='store', dest='description', default='') | 545 parser.add_option('--description', action='store', dest='description', default='') |
523 parser.add_option('--link', action='store_true', dest='link_to_data') | 546 parser.add_option('--link', action='store_true', dest='link_to_data') |
524 options, args = parser.parse_args() | 547 options, args = parser.parse_args() |
525 print("options: %s" % options) | 548 print(f"options: {options}") |
526 print("args : %s" % args) | 549 print(f"args : {args}") |
527 | 550 |
528 # Check for JSON file | 551 # Check for JSON file |
529 if len(args) != 1: | 552 if len(args) != 1: |
530 sys.stderr.write("Need to supply JSON file name") | 553 sys.stderr.write("Need to supply JSON file name") |
531 sys.exit(1) | 554 sys.exit(1) |
534 | 557 |
535 # Read the input JSON | 558 # Read the input JSON |
536 params, target_dir = read_input_json(jsonfile) | 559 params, target_dir = read_input_json(jsonfile) |
537 | 560 |
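`read_input_json` is defined earlier in the file, outside these hunks. For orientation, Galaxy data managers conventionally receive a JSON file carrying the tool parameters plus the directory the reference files must land in; a rough sketch of such a reader (an assumption based on that convention, not a verbatim copy of this script's helper):

```python
import json

def read_input_json(jsonfile):
    # Conventional Galaxy data-manager input: "param_dict" holds the
    # tool parameters, and the first output dataset names the directory
    # (extra_files_path) where the reference data should be written.
    with open(jsonfile) as fh:
        params = json.load(fh)
    return params["param_dict"], params["output_data"][0]["extra_files_path"]
```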
538 # Make the target directory | 561 # Make the target directory |
539 print("Making %s" % target_dir) | 562 print(f"Making {target_dir}") |
540 os.mkdir(target_dir) | 563 os.mkdir(target_dir) |
541 | 564 |
542 # Set up data tables dictionary | 565 # Set up data tables dictionary |
543 data_tables = create_data_tables_dict() | 566 data_tables = create_data_tables_dict() |
544 add_data_table(data_tables, 'mothur_lookup') | 567 add_data_table(data_tables, 'mothur_lookup') |
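`create_data_tables_dict` and `add_data_table` are likewise defined outside these hunks; together they build the structure the data manager finally serialises back to Galaxy. A sketch of the conventional output shape, using a table name visible in this diff (the concrete row is illustrative):

```python
import json

# Conventional data-manager output: one list of row dicts per data table.
data_tables = {"data_tables": {}}
data_tables["data_tables"]["mothur_lookup"] = [
    {"name": "GS20 (lookup)", "value": "lookup_gs20.pat"},  # illustrative row
]
print(json.dumps(data_tables, indent=2))
```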