comparison data_manager/fetch_mothur_reference_data.py @ 4:0db22932bc39 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
author iuc
date Fri, 25 Jun 2021 09:37:05 +0000
parents 9d09724f2bf1
children
comparing revisions 3:9d09724f2bf1 and 4:0db22932bc39
@@ -1,16 +1,19 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
+import io
 import json
 import optparse
 import os
 import shutil
 import sys
 import tarfile
 import tempfile
-import urllib2
+import urllib.error
+import urllib.parse
+import urllib.request
 import zipfile
 from functools import reduce

 # When extracting files from archives, skip names that
 # start with the following strings
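
For context on the import change: Python 3 split urllib2 across urllib.request (opening URLs), urllib.error (the exception classes) and urllib.parse (URL handling). A minimal sketch of the renamed entry point that fetch_url below relies on, using one of the reference-data links from this file:

    import urllib.request

    # urllib2.urlopen(url) becomes urllib.request.urlopen(url); the response
    # object still exposes read(), as used in fetch_url below.
    with urllib.request.urlopen("https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip") as response:
        first_chunk = response.read(1024)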
@@ -36,10 +39,18 @@
     "lookup_gs20": {
         "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ]
     },
     # RDP reference files
     # http://www.mothur.org/wiki/RDP_reference_files
+    "RDP_v18": {
+        "16S rRNA RDP training set 18":
+        [
+            "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ],
+        "16S rRNA PDS training set 18":
+        [
+            "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ],
+    },
     "RDP_v16": {
         "16S rRNA RDP training set 16":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ],
         "16S rRNA PDS training set 16":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ],
@@ -74,10 +85,16 @@
         "RDP training set 6":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ],
     },
     # Silva reference files
     # http://www.mothur.org/wiki/Silva_reference_files
+    "silva_release_138.1": {
+        "SILVA release 138.1":
+        [
+            "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
+            "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ],
+    },
     "silva_release_128": {
         "SILVA release 128":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz",
          "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ],
     },
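
Each top-level key of MOTHUR_REFERENCE_DATA names a dataset selectable via the --datasets option, mapping display names to lists of archive URLs. A minimal sketch of how the table is traversed, using a stand-in subset of the dictionary above:

    MOTHUR_REFERENCE_DATA = {
        "RDP_v18": {
            "16S rRNA RDP training set 18":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ],
        },
    }
    for dataset, entries in MOTHUR_REFERENCE_DATA.items():
        for display_name, urls in entries.items():
            for url in urls:
                print(f"{dataset}: {display_name} -> {url}")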
@@ -228,18 +245,23 @@
     system.

     Returns the name that the file is saved with.

     """
-    print("Downloading %s" % url)
+    print(f"Downloading {url}")
     if not target:
         target = os.path.basename(url)
     if wd:
         target = os.path.join(wd, target)
-    print("Saving to %s" % target)
+    print(f"Saving to {target}")
     with open(target, 'wb') as fh:
-        fh.write(urllib2.urlopen(url).read())
+        url_h = urllib.request.urlopen(url)
+        while True:
+            buffer = url_h.read(io.DEFAULT_BUFFER_SIZE)
+            if buffer == b"":
+                break
+            fh.write(buffer)
     return target


 def unpack_zip_archive(filen, wd=None):
     """Extract files from a ZIP archive
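
The rewritten download loop reads in io.DEFAULT_BUFFER_SIZE chunks, so large reference archives are never held in memory whole, unlike the old one-shot read(). An equivalent idiom using the standard-library helper shutil.copyfileobj, shown here as an alternative sketch rather than what the diff actually uses:

    import shutil
    import urllib.request

    def fetch_url_via_copyfileobj(url, target):
        # copyfileobj also streams in fixed-size chunks, so large archives
        # are never held fully in memory.
        with urllib.request.urlopen(url) as response, open(target, "wb") as fh:
            shutil.copyfileobj(response, fh)
        return target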
@@ -255,40 +277,40 @@
     Once all the files are extracted the ZIP archive
     file is deleted from the file system.

     """
     if not zipfile.is_zipfile(filen):
-        print("%s: not ZIP formatted file")
+        print(f"{filen}: not ZIP formatted file")
         return [filen]
     file_list = []
     with zipfile.ZipFile(filen) as z:
         for name in z.namelist():
             if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-                print("Ignoring %s" % name)
+                print(f"Ignoring {name}")
                 continue
             if wd:
                 target = os.path.join(wd, name)
             else:
                 target = name
             if name.endswith('/'):
                 # Make directory
-                print("Creating dir %s" % target)
+                print(f"Creating dir {target}")
                 try:
                     os.makedirs(target)
                 except OSError:
                     pass
             else:
                 # Extract file
-                print("Extracting %s" % name)
+                print(f"Extracting {target}")
                 try:
                     os.makedirs(os.path.dirname(target))
                 except OSError:
                     pass
                 with open(target, 'wb') as fh:
                     fh.write(z.read(name))
                 file_list.append(target)
-    print("Removing %s" % filen)
+    print(f"Removing {filen}")
     os.remove(filen)
     return file_list


 def unpack_tar_archive(filen, wd=None):
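
The reduce() call above tests whether an archive member name starts with any of the IGNORE_PATHS prefixes. An equivalent, arguably clearer predicate using any(); IGNORE_PATHS itself is defined near the top of the file and elided from this diff, so the value below is a hypothetical stand-in:

    IGNORE_PATHS = ("__MACOSX/", ".")  # hypothetical stand-in

    def is_ignored(name):
        # Same predicate as reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False)
        return any(name.startswith(prefix) for prefix in IGNORE_PATHS)

    assert is_ignored("__MACOSX/._silva.nr_v138_1.align")
    assert not is_ignored("trainset18_062020.rdp.fasta")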
@@ -307,27 +329,27 @@
     file is deleted from the file system.

     """
     file_list = []
     if not tarfile.is_tarfile(filen):
-        print("%s: not TAR file")
+        print(f"{filen}: not TAR file")
         return [filen]
     with tarfile.open(filen) as t:
         for name in t.getnames():
             # Check for unwanted files
             if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-                print("Ignoring %s" % name)
+                print(f"Ignoring {name}")
                 continue
             # Extract file
-            print("Extracting %s" % name)
+            print(f"Extracting {name}")
             t.extract(name, wd)
             if wd:
                 target = os.path.join(wd, name)
             else:
                 target = name
             file_list.append(target)
-    print("Removing %s" % filen)
+    print(f"Removing {filen}")
     os.remove(filen)
     return file_list


 def unpack_archive(filen, wd=None):
@@ -341,13 +363,13 @@
     'wd' specifies the working directory to extract
     the files to, otherwise they are extracted to the
     current working directory.

     """
-    print("Unpack %s" % filen)
+    print(f"Unpack {filen}")
     ext = os.path.splitext(filen)[1]
-    print("Extension: %s" % ext)
+    print(f"Extension: {ext}")
     if ext == ".zip":
         return unpack_zip_archive(filen, wd=wd)
     elif ext == ".tgz":
         return unpack_tar_archive(filen, wd=wd)
     else:
@@ -384,11 +406,11 @@
     """
     ext = os.path.splitext(filen)[1]
     try:
         return MOTHUR_FILE_TYPES[ext]
     except KeyError:
-        print("WARNING: unknown file type for " + filen + ", skipping")
+        print(f"WARNING: unknown file type for {filen}, skipping")
         return None


 def get_name(filen):
     """Generate a descriptive name based on the file name
@@ -417,30 +439,31 @@
     datasets: a list of dataset names corresponding to keys in
     the MOTHUR_REFERENCE_DATA dictionary
     """
     # Make working dir
     wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
-    print("Working dir %s" % wd)
+    print(f"Working dir {wd}")
     # Iterate over all requested reference data URLs
     for dataset in datasets:
-        print("Handling dataset '%s'" % dataset)
+        print(f"Handling dataset '{dataset}'")
         for name in MOTHUR_REFERENCE_DATA[dataset]:
             for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
                 type_ = identify_type(f)
-                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
-                print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f)))
+                name_from_file = os.path.splitext(os.path.basename(f))[0]
+                entry_name = f"{name_from_file} ({name})"
+                print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}")
                 if type_ is not None:
                     # Move to target dir
                     ref_data_file = os.path.basename(f)
                     f1 = os.path.join(target_dir, ref_data_file)
-                    print("Moving %s to %s" % (f, f1))
-                    os.rename(f, f1)
+                    print(f"Moving {f} to {f1}")
+                    shutil.move(f, f1)
                     # Add entry to data table
-                    table_name = "mothur_%s" % type_
+                    table_name = f"mothur_{type_}"
                     add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
     # Remove working dir
-    print("Removing %s" % wd)
+    print(f"Removing {wd}")
     shutil.rmtree(wd)


 def files_from_filesystem_paths(paths):
     """Return list of file paths from arbitrary input paths
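
The switch from os.rename to shutil.move matters when the Galaxy target directory sits on a different filesystem from the temporary working directory: os.rename raises OSError with errno EXDEV across mount points, while shutil.move falls back to copying and deleting. A minimal sketch of that fallback, with a hypothetical helper name:

    import errno
    import os
    import shutil

    def move_into_target(src, dst):
        # os.rename succeeds only within one filesystem; emulate the
        # copy-then-delete fallback that shutil.move performs.
        try:
            os.rename(src, dst)
        except OSError as exc:
            if exc.errno != errno.EXDEV:
                raise
            shutil.copy2(src, dst)
            os.remove(src)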
@@ -452,11 +475,11 @@
     """
     # Collect files to add
     files = []
     for path in paths:
         path = os.path.abspath(path)
-        print("Examining '%s'..." % path)
+        print(f"Examining '{path}'...")
         if os.path.isfile(path):
             # Store full path for file
             files.append(path)
         elif os.path.isdir(path):
             # Descend into directory and collect the files
@@ -491,25 +514,25 @@
     files = files_from_filesystem_paths(paths)
     # Handle each file individually
     for f in files:
         type_ = identify_type(f)
         if type_ is None:
-            print("%s: unrecognised type, skipped" % f)
+            print(f"{f}: unrecognised type, skipped")
             continue
         ref_data_file = os.path.basename(f)
         target_file = os.path.join(target_dir, ref_data_file)
         entry_name = "%s" % os.path.splitext(ref_data_file)[0]
         if description:
             entry_name += " (%s)" % description
-        print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file))
+        print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}")
         # Link to or copy the data
         if link_to_data:
             os.symlink(f, target_file)
         else:
             shutil.copyfile(f, target_file)
         # Add entry to data table
-        table_name = "mothur_%s" % type_
+        table_name = f"mothur_{type_}"
         add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))


 if __name__ == "__main__":
     print("Starting...")
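
The --link branch above trades disk space for fragility: a symlink dangles if the source file is later removed, while copyfile duplicates the data but stands alone. A small POSIX-only illustration with throwaway paths:

    import os
    import shutil
    import tempfile

    src = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False).name
    link, copy = src + ".lnk", src + ".copy"
    os.symlink(src, link)       # --link behaviour: cheap, but tied to src
    shutil.copyfile(src, copy)  # default behaviour: an independent duplicate
    os.remove(src)
    print(os.path.exists(link))  # False: the symlink now dangles
    print(os.path.exists(copy))  # True: the copy survives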
@@ -520,12 +543,12 @@
     parser.add_option('--datasets', action='store', dest='datasets', default='')
     parser.add_option('--paths', action='store', dest='paths', default=[])
     parser.add_option('--description', action='store', dest='description', default='')
     parser.add_option('--link', action='store_true', dest='link_to_data')
     options, args = parser.parse_args()
-    print("options: %s" % options)
-    print("args   : %s" % args)
+    print(f"options: {options}")
+    print(f"args   : {args}")

     # Check for JSON file
     if len(args) != 1:
         sys.stderr.write("Need to supply JSON file name")
         sys.exit(1)
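
Pulling the options together, a hypothetical invocation of the local-import mode; the paths, description and trailing JSON file name are placeholders for what Galaxy normally supplies:

    python3 fetch_mothur_reference_data.py \
        --paths /data/mothur/local_refs \
        --description "site-local reference set" \
        --link \
        galaxy_params.json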
@@ -534,11 +557,11 @@

     # Read the input JSON
     params, target_dir = read_input_json(jsonfile)

     # Make the target directory
-    print("Making %s" % target_dir)
+    print(f"Making {target_dir}")
     os.mkdir(target_dir)

     # Set up data tables dictionary
     data_tables = create_data_tables_dict()
     add_data_table(data_tables, 'mothur_lookup')
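
As a Galaxy data manager, the script reads its parameters and target directory from the supplied JSON file and is expected to report the new rows back as JSON; the write-back step falls outside this excerpt. A sketch of the conventional output shape that add_data_table and add_data_table_entry accumulate, with hypothetical row values:

    import json

    # Assumed shape, following the usual Galaxy data manager convention.
    data_tables = {
        "data_tables": {
            "mothur_lookup": [
                {"name": "GS20 (GS20)", "value": "lookup_gs20.pat"},
            ],
        },
    }
    with open("galaxy_params.json", "w") as fh:
        json.dump(data_tables, fh)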