Mercurial > repos > iuc > data_manager_mapseq
changeset 0:dbf2735e8480 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/ commit 66e797aaa79b92c282a8127260cdfd5702207e35
author | iuc |
---|---|
date | Wed, 13 Sep 2023 19:54:19 +0000 |
parents | |
children | 4cd97cc67061 |
files | data_manager_fetch_mapseq_db.py macros.xml mapseq_db_fetcher.xml readme.md |
diffstat | 4 files changed, 217 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Galaxy data manager script: fetch a mapseq reference DB and emit the data table JSON."""

import argparse
import json
import os
import shutil
import tarfile
from datetime import datetime, timezone

# Download URL for each supported database, keyed by the value of the
# tool's "database_type" select parameter.
DB_paths = {
    "mgnify_lsu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_lsu-20200130.tar.gz",
    "mgnify_ssu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_ssu-20200130.tar.gz",
    "mgnify_its_unite": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/UNITE-20200214.tar.gz",
    "mgnify_its_itsonedb": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/ITSoneDB-20200214.tar.gz",
    "test_lsu": "https://zenodo.org/record/8205348/files/test_lsu.tar.gz",
}

# Human-readable names shown in the Galaxy data table.
# NOTE: the UNITE and ITSoneDB entries were previously swapped (each key
# displayed the other database's name); they now match DB_paths above and the
# option labels in mapseq_db_fetcher.xml.
DB_names = {
    "mgnify_lsu": "MGnify LSU (v5.0.7) - silva_lsu-20200130",
    "mgnify_ssu": "MGnify SSU (v5.0.7) - silva_ssu-20200130",
    "mgnify_its_unite": "MGnify ITS UNITE (v5.0.7) - UNITE-20200214",
    "mgnify_its_itsonedb": "MGnify ITS ITSonedb (v5.0.7) - ITSoneDB-20200214",
    "test_lsu": "Trimmed LSU Test DB",
}


def download_untar_store(url, tmp_path, dest_path):
    """
    Download a tar.gz file containing one folder,
    extract that folder and move the content inside dest_path.

    Raises:
        ValueError: if the archive contains more than one top-level folder.
    """
    # Imported lazily so the module can be imported (e.g. by tests) without
    # the third-party "python-wget" package installed; it is only needed here.
    import wget

    extract_path = os.path.join(tmp_path, "extract")

    os.makedirs(tmp_path, exist_ok=True)

    # download data
    filename = wget.download(url, out=tmp_path)
    tarfile_path = os.path.join(tmp_path, filename)
    try:
        # Context manager guarantees the archive handle is closed even on error.
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(extract_path)

        entries = os.listdir(extract_path)
        if len(entries) > 1:
            # Previously this only printed a warning and returned normally,
            # so a data table entry was still written for a path that was
            # never populated. Fail loudly instead.
            raise ValueError("More than one folder in zipped file, aborting!")
        for folder in entries:
            folder_path = os.path.join(extract_path, folder)

            print(f"Copy data to {dest_path}")
            shutil.copytree(folder_path, dest_path)
            print("Done !")
    finally:
        # Always clean up the temporary download/extract directory.
        shutil.rmtree(tmp_path)


def main():
    """Parse arguments, fetch the requested DB and rewrite the data manager JSON."""
    # Parse Command Line
    parser = argparse.ArgumentParser(description="Create data manager JSON.")
    parser.add_argument("--out", dest="output", action="store", help="JSON filename")
    parser.add_argument("--version", dest="version", action="store", help="Version of the DB")
    parser.add_argument("--database-type", dest="db_type", action="store", help="Db type")
    parser.add_argument(
        "--test",
        action="store_true",
        help="option to test the script with a lightweight database",
    )

    args = parser.parse_args()

    # the output file of a DM is a json containing args that can be used by the DM
    # most tools mainly use these args to find the extra_files_path for the DM, which can be used
    # to store the DB data
    with open(args.output) as fh:
        params = json.load(fh)

    print(params)

    workdir = params["output_data"][0]["extra_files_path"]
    # exist_ok: Galaxy may have already created the extra_files_path.
    os.makedirs(workdir, exist_ok=True)

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    time = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    db_value = f"{args.db_type}_from_{time}"

    # output paths
    db_path = os.path.join(workdir, db_value)
    tmp_path = os.path.join(workdir, "tmp")

    # With --test a small fixture DB is downloaded instead of the real one,
    # while the data table entry still reflects the requested db_type.
    if args.test:
        url = DB_paths["test_lsu"]
    else:
        url = DB_paths[args.db_type]

    # download data
    download_untar_store(url, tmp_path, db_path)

    db_name = DB_names[args.db_type]
    # Update Data Manager JSON and write to file
    data_manager_entry = {
        "data_tables": {
            "mapseq_db": {
                "value": db_value,
                "name": f"{db_name} downloaded at {time}",
                "version": args.version,
                "path": db_path,
            }
        }
    }

    with open(args.output, "w") as fh:
        json.dump(data_manager_entry, fh, sort_keys=True)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<macros> + <token name="@TOOL_VERSION@">1.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">22.05</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.2">python-wget</requirement> + </requirements> + </xml> + <xml name="version"> + <version_command> + echo "1.0" + </version_command> + </xml> + <xml name="citations"> + <citations> + <citation type="doi"> + 10.1093/nar/gkz1035 + </citation> + </citations> + </xml> + <xml name="creator"> + <creator> + <person givenName="Paul" familyName="Zierep" email="zierep@informatik.uni-freiburg.de" /> + </creator> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mapseq_db_fetcher.xml Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<tool id="mapseq_db_fetcher" name="Mapseq DB fetcher" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>Fetches the DB required for mapseq</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <expand macro="version" /> + <command detect_errors="exit_code"> + <![CDATA[ + python '$__tool_directory__/data_manager_fetch_mapseq_db.py' + --out '${out_file}' + --version '${version}' + --database-type '${database_type}' + $test_data_manager + ]]> + </command> + <inputs> + <!-- <param name="test_data_manager" type="hidden" /> --> + <param name="test_data_manager" type="boolean" truevalue="--test" falsevalue="" checked="False" label="Download minimal test DB and create mock data table entry." /> + + <param name="database_type" type="select" multiple="false" label="Database Type"> + <option value="mgnify_lsu">MGnify LSU (v5.0.7)</option> + <option value="mgnify_ssu">MGnify SSU (v5.0.7)</option> + <option value="mgnify_its_itsonedb">MGnify ITS ITSonedb (v5.0.7)</option> + <option value="mgnify_its_unite">MGnify ITS UNITE (v5.0.7)</option> + </param> + + <!-- <param name="test_data_manager" type="text" value=""/> --> + <param argument="--version" type="text" value="5.0.7" help="Check MGnify GitHub (https://github.com/EBI-Metagenomics/pipeline-v5/releases) for the version."/> + </inputs> + <outputs> + <data format="data_manager_json" name="out_file" /> + </outputs> + <tests> + <test expect_num_outputs="1"> + <param name="test_data_manager" value="--test"/> + <param name="version" value="5.0.7"/> + <param name="database_type" value="mgnify_lsu"/> + <output name="out_file"> + <assert_contents> + <has_text text="mgnify_lsu"/> + <has_text text="5.0.7"/> + </assert_contents> + </output> + </test> + </tests> + <help> + Downloads 
preformatted DBs from MGnify that can be used for mapseq. + The download paths were taken from: https://github.com/EBI-Metagenomics/pipeline-v5/ + </help> + <expand macro="citations" /> + <expand macro="creator" /> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.md Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,18 @@ +# test data explained + +## the tool will expect one file each with the corresponding endings in the DB folder + +Trimmed version of: +* OTU table (id for each taxon) (*.otu) +* Ref. fasta DB (*.fasta) +* Taxon assignment of each ref. DB sequence (*.txt) +* clustering of the ref. sequences (starting with 0) corresponding to the ref. sequences (*.mscluster) + +## Command to get DBs + +``` +wget ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_ssu-20200130.tar.gz +mkdir temp +tar xvzf silva_ssu-20200130.tar.gz -C temp +mv temp/* silva_ssu-20200130 +``` \ No newline at end of file