comparison data_manager/fetch_mothur_reference_data.py @ 1:aec831b54a5b draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit 57f71aa633a43ab02bbf05acd0c6d7f406e01f1e"
author iuc
date Thu, 28 Nov 2019 15:47:32 -0500
parents ab7a7e798c34
children 0e532fc0a0a6
--- 0:ab7a7e798c34
+++ 1:aec831b54a5b
@@ -226,16 +226,16 @@
     system.
 
     Returns the name that the file is saved with.
 
     """
-    print "Downloading %s" % url
+    print("Downloading %s" % url)
     if not target:
         target = os.path.basename(url)
     if wd:
         target = os.path.join(wd, target)
-    print "Saving to %s" % target
+    print("Saving to %s" % target)
     open(target, 'wb').write(urllib2.urlopen(url).read())
     return target
 
 
 def unpack_zip_archive(filen, wd=None):
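Note: the rewritten print calls above use Python 3 syntax, but line 237 still calls urllib2.urlopen, and the urllib2 module exists only on Python 2. A minimal sketch of the same download routine on Python 3 using urllib.request (an illustration of a later port, not part of this commit):

import os
import urllib.request

def download_file(url, target=None, wd=None):
    """Download 'url' and return the path the file was saved to."""
    print("Downloading %s" % url)
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    print("Saving to %s" % target)
    # urllib.request.urlopen is the Python 3 counterpart of urllib2.urlopen
    with urllib.request.urlopen(url) as response, open(target, 'wb') as fp:
        fp.write(response.read())
    return target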
@@ -252,39 +252,39 @@
     Once all the files are extracted the ZIP archive
     file is deleted from the file system.
 
     """
     if not zipfile.is_zipfile(filen):
-        print "%s: not ZIP formatted file"
+        print("%s: not ZIP formatted file")
         return [filen]
     file_list = []
     z = zipfile.ZipFile(filen)
     for name in z.namelist():
         if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-            print "Ignoring %s" % name
+            print("Ignoring %s" % name)
             continue
         if wd:
             target = os.path.join(wd, name)
         else:
             target = name
         if name.endswith('/'):
             # Make directory
-            print "Creating dir %s" % target
+            print("Creating dir %s" % target)
             try:
                 os.makedirs(target)
             except OSError:
                 pass
         else:
             # Extract file
-            print "Extracting %s" % name
+            print("Extracting %s" % name)
             try:
                 os.makedirs(os.path.dirname(target))
             except OSError:
                 pass
             open(target, 'wb').write(z.read(name))
             file_list.append(target)
-    print "Removing %s" % filen
+    print("Removing %s" % filen)
     os.remove(filen)
     return file_list
 
 
 def unpack_tar_archive(filen, wd=None):
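Note: two things survive the print conversion untouched here. First, the "%s: not ZIP formatted file" message (line 257) carries no '% filen' argument in either column, so a literal '%s' is printed. Second, bare reduce() (line 262) is a built-in only on Python 2; on Python 3 it must be imported from functools, and the expression reads more simply with any(). A sketch (the IGNORE_PATHS values below are hypothetical; the real list is defined earlier in the file, outside this diff):

from functools import reduce  # required before the reduce() above runs on Python 3

IGNORE_PATHS = ('.', '__MACOSX/')  # hypothetical values

def is_ignored(name):
    # Equivalent to: reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False)
    return any(name.startswith(p) for p in IGNORE_PATHS)

assert is_ignored('__MACOSX/._ref.fasta')
assert not is_ignored('ref/ref.fasta')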
@@ -303,27 +303,27 @@
     file is deleted from the file system.
 
     """
     file_list = []
     if not tarfile.is_tarfile(filen):
-        print "%s: not TAR file"
+        print("%s: not TAR file")
         return [filen]
     t = tarfile.open(filen)
     for name in t.getnames():
         # Check for unwanted files
         if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-            print "Ignoring %s" % name
+            print("Ignoring %s" % name)
             continue
         # Extract file
-        print "Extracting %s" % name
+        print("Extracting %s" % name)
         t.extract(name, wd)
         if wd:
             target = os.path.join(wd, name)
         else:
             target = name
         file_list.append(target)
-    print "Removing %s" % filen
+    print("Removing %s" % filen)
     os.remove(filen)
     return file_list
 
 
 def unpack_archive(filen, wd=None):
@@ -337,13 +337,13 @@
     'wd' specifies the working directory to extract
     the files to, otherwise they are extracted to the
     current working directory.
 
     """
-    print "Unpack %s" % filen
+    print("Unpack %s" % filen)
     ext = os.path.splitext(filen)[1]
-    print "Extension: %s" % ext
+    print("Extension: %s" % ext)
     if ext == ".zip":
         return unpack_zip_archive(filen, wd=wd)
     elif ext == ".tgz":
         return unpack_tar_archive(filen, wd=wd)
     else:
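Note: the dispatch above keys on os.path.splitext, which keeps only the final suffix, so a name like refdata.tar.gz yields ".gz" and falls through to the (elided) else branch rather than the TAR handler. A short worked example:

import os.path

print(os.path.splitext("refdata.zip")[1])     # '.zip' -> unpack_zip_archive
print(os.path.splitext("refdata.tgz")[1])     # '.tgz' -> unpack_tar_archive
print(os.path.splitext("refdata.tar.gz")[1])  # '.gz'  -> else branch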
@@ -380,11 +380,11 @@
     """
     ext = os.path.splitext(filen)[1]
     try:
         return MOTHUR_FILE_TYPES[ext]
     except KeyError:
-        print "WARNING: unknown file type for " + filen + ", skipping"
+        print("WARNING: unknown file type for " + filen + ", skipping")
         return None
 
 
 def get_name(filen):
     """Generate a descriptive name based on the file name
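Note: MOTHUR_FILE_TYPES is a module-level dict (defined above the region shown in this diff) mapping file extensions to mothur data table types; it drives both identify_type() here and the "mothur_%s" table names used below. A purely hypothetical illustration of its shape:

# Hypothetical shape only; the real mapping is defined earlier in the file:
MOTHUR_FILE_TYPES = {
    ".map": "map",
    ".fasta": "aligndb",
    ".tax": "taxonomy",
}
# e.g. a file "core.tax" would map to type "taxonomy",
# which in turn selects the data table "mothur_taxonomy".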
@@ -413,30 +413,30 @@
     datasets: a list of dataset names corresponding to keys in
         the MOTHUR_REFERENCE_DATA dictionary
     """
     # Make working dir
     wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
-    print "Working dir %s" % wd
+    print("Working dir %s" % wd)
     # Iterate over all requested reference data URLs
     for dataset in datasets:
-        print "Handling dataset '%s'" % dataset
+        print("Handling dataset '%s'" % dataset)
         for name in MOTHUR_REFERENCE_DATA[dataset]:
             for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
                 type_ = identify_type(f)
                 entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
-                print "%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))
+                print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f)))
                 if type_ is not None:
                     # Move to target dir
                     ref_data_file = os.path.basename(f)
                     f1 = os.path.join(target_dir, ref_data_file)
-                    print "Moving %s to %s" % (f, f1)
+                    print("Moving %s to %s" % (f, f1))
                     os.rename(f, f1)
                     # Add entry to data table
                     table_name = "mothur_%s" % type_
                     add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
     # Remove working dir
-    print "Removing %s" % wd
+    print("Removing %s" % wd)
     shutil.rmtree(wd)
 
 
 def files_from_filesystem_paths(paths):
     """Return list of file paths from arbitrary input paths
@@ -448,20 +448,20 @@
     """
     # Collect files to add
     files = []
     for path in paths:
         path = os.path.abspath(path)
-        print "Examining '%s'..." % path
+        print("Examining '%s'..." % path)
         if os.path.isfile(path):
             # Store full path for file
             files.append(path)
         elif os.path.isdir(path):
             # Descend into directory and collect the files
             for f in os.listdir(path):
                 files.extend(files_from_filesystem_paths((os.path.join(path, f), )))
         else:
-            print "Not a file or directory, ignored"
+            print("Not a file or directory, ignored")
     return files
 
 
 def import_from_server(data_tables, target_dir, paths, description, link_to_data=False):
     """Import reference data from filesystem paths
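Note: files_from_filesystem_paths() descends into directories by calling itself on each entry, wrapped in a one-element tuple. A non-recursive sketch of the same collection using os.walk (an alternative, not the code in this repository):

import os

def collect_files(paths):
    # Same result as files_from_filesystem_paths(): every file under
    # each input path, as absolute paths; non-file paths are skipped.
    files = []
    for path in paths:
        path = os.path.abspath(path)
        if os.path.isfile(path):
            files.append(path)
        elif os.path.isdir(path):
            for dirpath, _, filenames in os.walk(path):
                files.extend(os.path.join(dirpath, f) for f in filenames)
    return files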
@@ -487,18 +487,18 @@
     files = files_from_filesystem_paths(paths)
     # Handle each file individually
     for f in files:
         type_ = identify_type(f)
         if type_ is None:
-            print "%s: unrecognised type, skipped" % f
+            print("%s: unrecognised type, skipped" % f)
             continue
         ref_data_file = os.path.basename(f)
         target_file = os.path.join(target_dir, ref_data_file)
         entry_name = "%s" % os.path.splitext(ref_data_file)[0]
         if description:
             entry_name += " (%s)" % description
-        print "%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)
+        print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file))
         # Link to or copy the data
         if link_to_data:
             os.symlink(f, target_file)
         else:
             shutil.copyfile(f, target_file)
@@ -506,22 +506,22 @@
         table_name = "mothur_%s" % type_
         add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
 
 
 if __name__ == "__main__":
-    print "Starting..."
+    print("Starting...")
 
     # Read command line
     parser = optparse.OptionParser()
     parser.add_option('--source', action='store', dest='data_source')
     parser.add_option('--datasets', action='store', dest='datasets', default='')
     parser.add_option('--paths', action='store', dest='paths', default=[])
     parser.add_option('--description', action='store', dest='description', default='')
     parser.add_option('--link', action='store_true', dest='link_to_data')
     options, args = parser.parse_args()
-    print "options: %s" % options
-    print "args : %s" % args
+    print("options: %s" % options)
+    print("args : %s" % args)
 
     # Check for JSON file
     if len(args) != 1:
         sys.stderr.write("Need to supply JSON file name")
         sys.exit(1)
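Note: optparse has been deprecated since Python 2.7, and --paths defaults to a list ([]) even though the server-import branch below calls string methods on it. A sketch of the same command line with argparse, using a string default and declaring the JSON file name (checked via len(args) above) as a positional argument (an illustration, not part of this commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--source', dest='data_source')
parser.add_argument('--datasets', default='')
parser.add_argument('--paths', default='')  # string default, so .replace() below is always safe
parser.add_argument('--description', default='')
parser.add_argument('--link', action='store_true', dest='link_to_data')
parser.add_argument('jsonfile')  # replaces the manual len(args) != 1 check
options = parser.parse_args()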
@@ -530,11 +530,11 @@
 
     # Read the input JSON
     params, target_dir = read_input_json(jsonfile)
 
     # Make the target directory
-    print "Making %s" % target_dir
+    print("Making %s" % target_dir)
     os.mkdir(target_dir)
 
     # Set up data tables dictionary
     data_tables = create_data_tables_dict()
     add_data_table(data_tables, 'mothur_lookup')
@@ -552,9 +552,9 @@
         # Get list of paths (need to remove any escapes for '\n' and '\r'
         # that might have been inserted by Galaxy)
         paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
         import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data)
     # Write output JSON
-    print "Outputting JSON"
-    print str(json.dumps(data_tables))
-    open(jsonfile, 'wb').write(json.dumps(data_tables))
-    print "Done."
+    print("Outputting JSON")
+    print(json.dumps(data_tables))
+    open(jsonfile, 'w').write(json.dumps(data_tables, sort_keys=True))
+    print("Done.")
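Note: the data_tables dict serialised here follows the Galaxy data manager JSON convention. A sketch of plausible shapes for the three helpers used above (their real definitions sit earlier in this file, outside the region shown in the diff):

def create_data_tables_dict():
    # Top-level structure a Galaxy data manager is expected to emit
    return {'data_tables': {}}

def add_data_table(d, table):
    # Register an (initially empty) data table, e.g. 'mothur_lookup'
    d['data_tables'][table] = []

def add_data_table_entry(d, table, entry):
    # Append one row, e.g. dict(name='...', value='...')
    d['data_tables'][table].append(entry)

# The JSON written back to jsonfile then looks like:
# {"data_tables": {"mothur_lookup": [{"name": "...", "value": "..."}]}}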