Mercurial > repos > iuc > data_manager_mapseq
changeset 0:dbf2735e8480 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/ commit 66e797aaa79b92c282a8127260cdfd5702207e35
author | iuc |
---|---|
date | Wed, 13 Sep 2023 19:54:19 +0000 |
parents | |
children | 4cd97cc67061 |
files | data_manager_fetch_mapseq_db.py macros.xml mapseq_db_fetcher.xml readme.md |
diffstat | 4 files changed, 217 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Galaxy data manager script: fetch a mapseq reference DB and emit the data table JSON."""

import argparse
import json
import os
import shutil
import tarfile
from datetime import datetime, timezone

# Download URL for each supported database, keyed by the value of the
# tool's "database_type" select parameter.
DB_paths = {
    "mgnify_lsu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_lsu-20200130.tar.gz",
    "mgnify_ssu": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_ssu-20200130.tar.gz",
    "mgnify_its_unite": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/UNITE-20200214.tar.gz",
    "mgnify_its_itsonedb": "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/ITSoneDB-20200214.tar.gz",
    "test_lsu": "https://zenodo.org/record/8205348/files/test_lsu.tar.gz",
}

# Human-readable names shown in the Galaxy data table.
# NOTE: the UNITE and ITSoneDB entries were previously swapped (each key
# displayed the other database's name); they now match DB_paths above and the
# option labels in mapseq_db_fetcher.xml.
DB_names = {
    "mgnify_lsu": "MGnify LSU (v5.0.7) - silva_lsu-20200130",
    "mgnify_ssu": "MGnify SSU (v5.0.7) - silva_ssu-20200130",
    "mgnify_its_unite": "MGnify ITS UNITE (v5.0.7) - UNITE-20200214",
    "mgnify_its_itsonedb": "MGnify ITS ITSonedb (v5.0.7) - ITSoneDB-20200214",
    "test_lsu": "Trimmed LSU Test DB",
}


def download_untar_store(url, tmp_path, dest_path):
    """
    Download a tar.gz file containing one folder,
    extract that folder and move the content inside dest_path.

    Raises:
        ValueError: if the archive contains more than one top-level folder.
    """
    # Imported lazily so the module can be imported (e.g. by tests) without
    # the third-party "python-wget" package installed; it is only needed here.
    import wget

    extract_path = os.path.join(tmp_path, "extract")

    os.makedirs(tmp_path, exist_ok=True)

    # download data
    filename = wget.download(url, out=tmp_path)
    tarfile_path = os.path.join(tmp_path, filename)
    try:
        # Context manager guarantees the archive handle is closed even on error.
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(extract_path)

        entries = os.listdir(extract_path)
        if len(entries) > 1:
            # Previously this only printed a warning and returned normally,
            # so a data table entry was still written for a path that was
            # never populated. Fail loudly instead.
            raise ValueError("More than one folder in zipped file, aborting!")
        for folder in entries:
            folder_path = os.path.join(extract_path, folder)

            print(f"Copy data to {dest_path}")
            shutil.copytree(folder_path, dest_path)
            print("Done !")
    finally:
        # Always clean up the temporary download/extract directory.
        shutil.rmtree(tmp_path)


def main():
    """Parse arguments, fetch the requested DB and rewrite the data manager JSON."""
    # Parse Command Line
    parser = argparse.ArgumentParser(description="Create data manager JSON.")
    parser.add_argument("--out", dest="output", action="store", help="JSON filename")
    parser.add_argument("--version", dest="version", action="store", help="Version of the DB")
    parser.add_argument("--database-type", dest="db_type", action="store", help="Db type")
    parser.add_argument(
        "--test",
        action="store_true",
        help="option to test the script with a lightweight database",
    )

    args = parser.parse_args()

    # the output file of a DM is a json containing args that can be used by the DM
    # most tools mainly use these args to find the extra_files_path for the DM, which can be used
    # to store the DB data
    with open(args.output) as fh:
        params = json.load(fh)

    print(params)

    workdir = params["output_data"][0]["extra_files_path"]
    # exist_ok: Galaxy may have already created the extra_files_path.
    os.makedirs(workdir, exist_ok=True)

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    time = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    db_value = f"{args.db_type}_from_{time}"

    # output paths
    db_path = os.path.join(workdir, db_value)
    tmp_path = os.path.join(workdir, "tmp")

    # With --test a small fixture DB is downloaded instead of the real one,
    # while the data table entry still reflects the requested db_type.
    if args.test:
        url = DB_paths["test_lsu"]
    else:
        url = DB_paths[args.db_type]

    # download data
    download_untar_store(url, tmp_path, db_path)

    db_name = DB_names[args.db_type]
    # Update Data Manager JSON and write to file
    data_manager_entry = {
        "data_tables": {
            "mapseq_db": {
                "value": db_value,
                "name": f"{db_name} downloaded at {time}",
                "version": args.version,
                "path": db_path,
            }
        }
    }

    with open(args.output, "w") as fh:
        json.dump(data_manager_entry, fh, sort_keys=True)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<macros> + <token name="@TOOL_VERSION@">1.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">22.05</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.2">python-wget</requirement> + </requirements> + </xml> + <xml name="version"> + <version_command> + echo "1.0" + </version_command> + </xml> + <xml name="citations"> + <citations> + <citation type="doi"> + 10.1093/nar/gkz1035 + </citation> + </citations> + </xml> + <xml name="creator"> + <creator> + <person givenName="Paul" familyName="Zierep" email="zierep@informatik.uni-freiburg.de" /> + </creator> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mapseq_db_fetcher.xml Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<tool id="mapseq_db_fetcher" name="Mapseq DB fetcher" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>Fetches the DB required for mapseq</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <expand macro="version" /> + <command detect_errors="exit_code"> + <![CDATA[ + python '$__tool_directory__/data_manager_fetch_mapseq_db.py' + --out '${out_file}' + --version '${version}' + --database-type '${database_type}' + $test_data_manager + ]]> + </command> + <inputs> + <!-- <param name="test_data_manager" type="hidden" /> --> + <param name="test_data_manager" type="boolean" truevalue="--test" falsevalue="" checked="False" label="Download minimal test DB and create mock data table entry." /> + + <param name="database_type" type="select" multiple="false" label="Database Type"> + <option value="mgnify_lsu">MGnify LSU (v5.0.7)</option> + <option value="mgnify_ssu">MGnify SSU (v5.0.7)</option> + <option value="mgnify_its_itsonedb">MGnify ITS ITSonedb (v5.0.7)</option> + <option value="mgnify_its_unite">MGnify ITS UNITE (v5.0.7)</option> + </param> + + <!-- <param name="test_data_manager" type="text" value=""/> --> + <param argument="--version" type="text" value="5.0.7" help="Check MGnify GitHub (https://github.com/EBI-Metagenomics/pipeline-v5/releases) for the version."/> + </inputs> + <outputs> + <data format="data_manager_json" name="out_file" /> + </outputs> + <tests> + <test expect_num_outputs="1"> + <param name="test_data_manager" value="--test"/> + <param name="version" value="5.0.7"/> + <param name="database_type" value="mgnify_lsu"/> + <output name="out_file"> + <assert_contents> + <has_text text="mgnify_lsu"/> + <has_text text="5.0.7"/> + </assert_contents> + </output> + </test> + </tests> + <help> + Downloads 
preformatted DBs from MGnify that can be used for mapseq. + The download paths were taken from: https://github.com/EBI-Metagenomics/pipeline-v5/ + </help> + <expand macro="citations" /> + <expand macro="creator" /> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.md Wed Sep 13 19:54:19 2023 +0000 @@ -0,0 +1,18 @@ +# test data explained + +## the tool will expect one file each with the corresponding endings in the DB folder + +Trimmed version of: +* OTU table (id for each taxon) (*.otu) +* Ref. fasta DB (*.fasta) +* Taxon assignment of each ref. DB sequence (*.txt) +* clustering of the ref. sequences (starting with 0) corresponding to the ref. sequences (*.mscluster) + +## Command to get DBs + +``` +wget ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/silva_ssu-20200130.tar.gz +mkdir temp +tar xvzf silva_ssu-20200130.tar.gz -C temp +mv temp/* silva_ssu-20200130 +``` \ No newline at end of file