Mercurial > repos > iuc > data_manager_qiime_database_downloader
comparison data_manager/data_manager_qiime_download.py @ 0:f8608fddfb23 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_qiime_database_downloader commit 4934eb34300b5fa54d62d8b67e5b6e989e963ac9
author | iuc |
---|---|
date | Mon, 15 May 2017 11:08:43 -0400 |
parents | |
children | 9e86c09a6cae |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f8608fddfb23 |
---|---|
1 #!/usr/bin/env python | |
2 # Data manager for reference data for the QIIME Galaxy tools | |
3 | |
import argparse
import ftplib
import json
import os
import shutil
import tarfile
import zipfile

import requests
12 | |
13 | |
# Transfer protocol used to fetch each supported reference database.
protocol = {
    "unite": "http",
    "greengenes": "ftp",
    "silva": "http",
    "img": "ftp",
}
# HTTP base URL (version + extension appended) or FTP host name.
baseUrl = {
    "unite": "http://unite.ut.ee/sh_files/sh_qiime_release_",
    "greengenes": "greengenes.microbio.me",
    "silva": "http://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_",
    "img": "ftp.microbio.me",
}
# Remote directory prefix on the FTP servers (version is appended).
ftp_dir = {
    "greengenes": "/greengenes_release/gg_",
    "img": "",
}
# FTP file names are assembled as <prefix><version><suffix>.<extension>.
ftp_file_prefix = {
    "greengenes": "gg_",
    "img": "",
}
ftp_file_suffix = {
    "greengenes": "_otus",
    "img": "",
}
# Archive extension per database; silva's varies with the release.
extension = {
    "unite": "zip",
    "greengenes": "tar.gz",
    "silva": {
        "104_release": "tgz",
        "108_release": "tgz",
        "108_release_curated": "tgz",
        "111_release": "tgz",
        "119_consensus_majority_taxonomy": "zip",
        "119_release": "zip",
        "119_release_aligned_rep_files": "tar.gz",
        "123_release": "zip",
        "128_release": "tgz",
    },
    "img": "tgz",
}
# QIIME file types: each maps to a target subdirectory and a data table.
filetypes = ["rep_set", "rep_set_aligned", "taxonomy", "trees"]
54 | |
55 | |
56 # Utility functions for interacting with Galaxy JSON | |
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool.

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.

    """
    # Use a context manager so the file handle is closed promptly
    # (the original left it dangling until garbage collection).
    with open(jsonfile) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
75 | |
76 | |
77 # Utility functions for creating data table dictionaries | |
78 # | |
79 # Example usage: | |
80 # >>> d = create_data_tables_dict() | |
81 # >>> add_data_table(d,'my_data') | |
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
84 # >>> print str(json.dumps(d)) | |
def create_data_tables_dict():
    """Return a dictionary for storing data table information.

    The result can be used with 'add_data_table' and
    'add_data_table_entry' to collect data table entries, and is
    converted to JSON before being sent back to the data manager.

    """
    return {'data_tables': {}}
97 | |
98 | |
def add_data_table(d, table):
    """Register the data table named *table* in *d*.

    Installs an empty entry list as a placeholder; any existing
    entries under the same name are discarded.

    """
    d['data_tables'][table] = []
106 | |
107 | |
def add_data_table_entry(d, table, entry):
    """Append *entry* to the data table *table* in *d*.

    *entry* should be a dictionary keyed by the data table's
    column names.

    Raises an Exception if the named data table doesn't exist.

    """
    try:
        entries = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    entries.append(entry)
123 | |
124 | |
def get_ftp_file(ftp, filename):
    """Download *filename* from the connected FTP server.

    The file is written under the same name in the current working
    directory. Failures are reported but deliberately not raised
    (best-effort, matching the original behaviour) — however the
    error is now printed instead of a bare "Error", and the output
    file handle is always closed.

    Arguments:
        ftp: a logged-in ftplib.FTP instance
        filename: remote (and local) file name

    """
    try:
        with open(filename, 'wb') as out:
            ftp.retrbinary("RETR " + filename, out.write)
    except (OSError, EOFError, ftplib.Error) as err:
        # ftplib.all_errors minus the bare except: keyboard interrupts
        # and programming errors now propagate instead of being hidden.
        print("Error: %s" % err)
132 | |
133 | |
def download_archive(db, version, ext):
    """Download the archive for *db*/*version* into the working directory.

    Uses HTTP (streamed via requests) or anonymous FTP depending on the
    database's configured protocol.

    Returns the local file name of the downloaded archive.

    """
    filepath = "%s_%s.%s" % (db, version, ext)
    if protocol[db] == "http":
        url = "%s%s.%s" % (baseUrl[db], version, ext)
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filepath, "wb") as out:
            for chunk in response.iter_content(chunk_size=128):
                out.write(chunk)
    elif protocol[db] == "ftp":
        ftp = ftplib.FTP(baseUrl[db])
        ftp.login("anonymous", "ftplib-example-1")
        # greengenes 13_8 is published inside the 13_5 release directory,
        # but its file names still carry the 13_8 version string.
        if db == "greengenes" and version == "13_8":
            remote_dir = "%s%s" % (ftp_dir[db], "13_5")
        else:
            remote_dir = "%s%s" % (ftp_dir[db], version)
        ftp.cwd(remote_dir)
        filepath = "%s%s%s.%s" % (
            ftp_file_prefix[db],
            version,
            ftp_file_suffix[db],
            ext)
        get_ftp_file(ftp, filepath)
        ftp.quit()
    return filepath
161 | |
162 | |
def find_archive_content_path(archive_content_path):
    """Locate the real content root of an extracted archive.

    If the directory holds exactly one visible entry (ignoring names
    that start with "." or "_", e.g. macOS metadata), return the path
    of that entry; otherwise return the directory itself.

    """
    visible = [
        entry for entry in os.listdir(archive_content_path)
        if not entry.startswith((".", "_"))
    ]
    if len(visible) == 1:
        return os.path.join(archive_content_path, visible[0])
    return archive_content_path
176 | |
177 | |
def extract_archive(filepath, ext, db):
    """Extract *filepath* into ./tmp and return the content root path.

    Supported extensions: "tar.gz"/"tgz" and "zip"; any other value
    leaves "tmp" untouched and returns it as-is.

    Arguments:
        filepath: path of the downloaded archive
        ext: archive extension
        db: database name (unused; kept for interface compatibility)

    Returns the path holding the archive's content (resolved through
    find_archive_content_path to skip a single top-level wrapper dir).

    """
    archive_content_path = "tmp"
    # Context managers ensure the archive handles are closed even if
    # extraction raises (the original leaked them on error).
    if ext in ("tar.gz", "tgz"):
        with tarfile.open(filepath) as tar:
            tar.extractall(path=archive_content_path)
        archive_content_path = find_archive_content_path(archive_content_path)
    elif ext == "zip":
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(archive_content_path)
        archive_content_path = find_archive_content_path(archive_content_path)
    return archive_content_path
193 | |
194 | |
def move_unite_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir):
    """Move UNITE reference files into place and register them.

    UNITE archives are flat: files whose name contains "refs" go to the
    rep_set table/directory, files containing "taxonomy" go to the
    taxonomy table/directory; anything else is ignored.

    Arguments:
        archive_content_path: directory holding the extracted archive
        filename_prefix: prefix for the target file names
        name_prefix: prefix for the display names in the data table
        data_tables: dictionary of data table entries to update
        target_dir: base directory receiving the files

    """
    for content in os.listdir(archive_content_path):
        content_filepath = os.path.join(archive_content_path, content)
        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
        content_filename_prefix = "%s_%s" % (filename_prefix, content)
        # "x in s" replaces the non-idiomatic s.find(x) != -1 checks.
        if "refs" in content:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "rep_set"),
                "rep_set")
        elif "taxonomy" in content:
            move_file(
                content_filepath,
                content_filename_prefix,
                content_name_prefix,
                data_tables,
                os.path.join(target_dir, "taxonomy"),
                "taxonomy")
220 | |
221 | |
def move_file(input_filepath, filename, name, data_tables, target_dir, filetype):
    """Move one file into *target_dir* and register it in its data table.

    Uses shutil.move rather than os.rename so the move also works when
    the extraction directory and the Galaxy data directory live on
    different filesystems (os.rename raises OSError/EXDEV there).

    Arguments:
        input_filepath: current path of the file
        filename: target file name (also used as the entry's dbkey)
        name: display name for the data table entry
        data_tables: dictionary of data table entries to update
        target_dir: destination directory (must exist)
        filetype: table suffix, e.g. "rep_set" -> table "qiime_rep_set"

    """
    output_filepath = os.path.join(target_dir, filename)
    shutil.move(input_filepath, output_filepath)
    add_data_table_entry(
        data_tables,
        "qiime_%s" % (filetype),
        dict(
            dbkey=filename,
            value="1.0",
            name=name,
            path=output_filepath))
235 | |
236 | |
def move_dir_content(input_path, filename_prefix, name_prefix, data_tables, target_dir, filetype):
    """Recursively move every visible file under *input_path*.

    Hidden entries (leading ".") are skipped. Each file is moved into
    *target_dir* via move_file with prefixes accumulated along the
    directory path.

    """
    for entry in os.listdir(input_path):
        # Skip hidden entries such as .DS_Store.
        if entry.startswith("."):
            continue
        entry_path = os.path.join(input_path, entry)
        entry_name_prefix = "%s - %s" % (name_prefix, entry.split(".")[0])
        entry_filename_prefix = "%s_%s" % (filename_prefix, entry)
        # Recurse into subdirectories; move plain files directly.
        handler = move_dir_content if os.path.isdir(entry_path) else move_file
        handler(
            entry_path,
            entry_filename_prefix,
            entry_name_prefix,
            data_tables,
            target_dir,
            filetype)
262 | |
263 | |
def move_files(archive_content_path, filename_prefix, name_prefix, data_tables, target_dir, db, version):
    """Move each per-filetype directory of a greengenes/silva archive.

    Iterates the global 'filetypes' list and moves the matching
    subdirectory of the extracted archive into the corresponding
    subdirectory of *target_dir*, registering every file on the way.

    """
    for filetype in filetypes:
        # greengenes 12_10 ships no aligned representative set.
        if (filetype == "rep_set_aligned"
                and db == "greengenes"
                and version == "12_10"):
            continue
        move_dir_content(
            os.path.join(archive_content_path, filetype),
            filename_prefix,
            name_prefix,
            data_tables,
            os.path.join(target_dir, filetype),
            filetype)
284 | |
285 | |
def download_db(data_tables, db, version, target_dir):
    """Download a QIIME reference database and register its files.

    Downloads the archive for *db*/*version*, extracts it, and moves
    its files into *target_dir*, adding one entry per file to the
    relevant qiime_* data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
        data_tables: a dictionary containing the data table info
        db: name of the database ("unite", "greengenes", "silva", "img")
        version: version of the database
        target_dir: directory to put copy or link to the data file

    """
    ext = extension[db]
    if db == "silva":
        # silva's archive format differs between releases
        ext = ext[version]

    print("Download archive")
    filepath = download_archive(db, version, ext)

    print("Extract archive %s" % filepath)
    archive_content_path = extract_archive(filepath, ext, db)

    print("Moving file from %s" % archive_content_path)
    filename_prefix = "%s_%s" % (db, version)
    name_prefix = "%s (%s)" % (db, version)
    if db == "greengenes" or db == "silva":
        move_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir,
            db,
            version)
    elif db == "unite":
        move_unite_files(
            archive_content_path,
            filename_prefix,
            name_prefix,
            data_tables,
            target_dir)
    # NOTE(review): "img" has protocol/URL entries above but no branch
    # here — img archives are downloaded and extracted yet never
    # registered in a data table; confirm whether this is intentional.
333 | |
334 | |
if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(
        description='Download QIIME reference database')
    parser.add_argument('--database', help="Database name")
    parser.add_argument('--version', help="Database version")
    parser.add_argument('--jsonfile', help="Output JSON file")
    args = parser.parse_args()

    jsonfile = args.jsonfile

    # Read the input JSON written by Galaxy: tool parameters plus the
    # extra_files_path directory we must populate
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory with one subdirectory per file type
    # (os.mkdir raises if target_dir already exists)
    print("Making %s" % target_dir)
    os.mkdir(target_dir)
    os.mkdir(os.path.join(target_dir, "rep_set"))
    os.mkdir(os.path.join(target_dir, "rep_set_aligned"))
    os.mkdir(os.path.join(target_dir, "taxonomy"))
    os.mkdir(os.path.join(target_dir, "trees"))

    # Set up data tables dictionary, one table per QIIME file type
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "qiime_rep_set")
    add_data_table(data_tables, "qiime_rep_set_aligned")
    add_data_table(data_tables, "qiime_taxonomy")
    add_data_table(data_tables, "qiime_trees")

    # Fetch data from specified data sources
    download_db(
        data_tables,
        args.database,
        args.version,
        target_dir)

    # Write the collected data table entries back over the input JSON
    # file so the data manager framework can pick them up
    print("Outputting JSON")
    print(str(json.dumps(data_tables)))
    with open(jsonfile, 'w') as out:
        json.dump(data_tables, out)
    print("Done.")