# HG changeset patch # User iuc # Date 1573256852 18000 # Node ID f57c13f5878bee8a41d9191b5900c7eaa831a01f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_dada2 commit f8b6b6e72914ad6bcca8423dfa03f59bde80992e" diff -r 000000000000 -r f57c13f5878b data_manager/dada2_fetcher.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/dada2_fetcher.xml Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,181 @@ + + + Download reference databases + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1093/nar/gks1219 + > + 10.1093/nar/gkt1244 + + 10.1128/AEM.03006-05 + + 10.15156/BIO/786343 + + + 10.1186/s12864-015-2265-y + + 10.1093/nar/gks1160 + + + diff -r 000000000000 -r f57c13f5878b data_manager/data_manager.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager.py Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,133 @@ +import argparse +import json +import os +try: + # For Python 3.0 and later + from urllib.request import Request, urlopen +except ImportError: + # Fall back to Python 2 imports + from urllib2 import Request, urlopen + +DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" + +FILE2NAME = { + "silva_132": "Silva version 132", + "silva_128": "Silva version 128", + "rdp_16": "RDP trainset 16", + "rdp_14": "RDP trainset 14", + "greengenes_13.84": "GreenGenes version 13.84", + "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", + "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", + "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", + "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", + "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", + "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", + "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" +} + +FILE2TAXURL = { + "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", + "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", + "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", + "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", + "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", + "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", + "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", + "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", + "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", + "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", + "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", + "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" +} + +FILE2SPECIESURL = { + "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", + "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", + "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", + "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" +} + +FILE2TAXLEVELS = { + "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" +} + + +def url_download(url, fname, workdir): + """ + download url to workdir/fname + """ + file_path = os.path.join(workdir, fname) + if not os.path.exists(workdir): + os.makedirs(workdir) + src = None + dst = None + try: + req = Request(url) + src = urlopen(req) + with open(file_path, 'wb') as dst: + while True: + chunk = src.read(2**10) + if chunk: + dst.write(chunk) + else: + break + finally: + if src: + src.close() + +# special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) + if fname.startswith("unite"): + import glob + import gzip + import shutil + import zipfile + # unzip download + zip_ref = zipfile.ZipFile(file_path, 'r') + zip_ref.extractall(workdir) + zip_ref.close() + # gzip top level fasta file + fastas = glob.glob("%s/*fasta" % workdir) + if len(fastas) != 1: + msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas)) + raise Exception(msg) + with open(fastas[0], 'rb') as f_in: + with gzip.open(file_path, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + +def remote_dataset(dataset, outjson): + + with open(outjson) as jf: + params = json.loads(jf.read()) + + workdir = params['output_data'][0]['extra_files_path'] + os.mkdir(workdir) + url_download( FILE2TAXURL[dataset], dataset + ".taxonomy", workdir) + + data_manager_json = {"data_tables": {}} + data_manager_entry = {} + data_manager_entry['value'] = dataset + data_manager_entry['name'] = FILE2NAME[dataset] + data_manager_entry['path'] = dataset + ".taxonomy" + data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) + data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry + + if FILE2SPECIESURL.get(dataset, False ): + url_download( FILE2SPECIESURL[dataset], dataset + ".species", workdir) + data_manager_entry = {} + data_manager_entry['value'] = dataset + data_manager_entry['name'] = FILE2NAME[dataset] + data_manager_entry['path'] = dataset + ".species" + data_manager_json["data_tables"]["dada2_species"] = data_manager_entry + + with file(outjson, 'w') as jf: + jf.write(json.dumps(data_manager_json)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Create data manager json.') + parser.add_argument('--out', action='store', help='JSON filename') + parser.add_argument('--dataset', action='store', help='Download data set name') + args = parser.parse_args() + + remote_dataset(args.dataset, args.out) diff -r 000000000000 -r f57c13f5878b data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,34 @@ + + + + + + + + + + ${path} + dada2/${path} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path} + abspath + + + + + + + + + + + ${path} + dada2/${path} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path} + abspath + + + + + diff -r 000000000000 -r f57c13f5878b test-data/PR24.11.1_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/PR24.11.1_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "PR2_4.11.1.taxonomy", "name": "Protist Ribosomal Reference database (PR2) 4.11.1", "value": "PR2_4.11.1", "taxlevels": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/RefSeq_RDP2018_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/RefSeq_RDP2018_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "RefSeq_RDP_2018_05.taxonomy", "name": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "value": "RefSeq_RDP_2018_05", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/dada2_species.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dada2_species.loc Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for species assignment, using three +# tab separated columns: +# +# +# +# Datasets can be retrieved from http://busco.ezlab.org/frame_wget.html +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html diff -r 000000000000 -r f57c13f5878b test-data/dada2_taxonomy.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dada2_taxonomy.loc Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for taxonomy assignment, using three +# tab separated columns: +# +# +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html +# +# taxlevels is a comma separated list of taxonomy levels diff -r 000000000000 -r f57c13f5878b test-data/greengenes13.84_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/greengenes13.84_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "greengenes_13.84.taxonomy", "name": "GreenGenes version 13.84", "value": "greengenes_13.84", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/gtdb2018_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtdb2018_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "gtdb_2018_11.taxonomy", "name": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", "value": "gtdb_2018_11", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/hitdb1_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/hitdb1_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "hitdb_1.taxonomy", "name": "HitDB version 1 (Human InTestinal 16S rRNA)", "value": "hitdb_1", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/rdp16_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rdp16_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_species": {"path": "rdp_16.species", "name": "RDP trainset 16", "value": "rdp_16"}, "dada2_taxonomy": {"path": "rdp_16.taxonomy", "name": "RDP trainset 16", "value": "rdp_16", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/silva132_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/silva132_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_species": {"path": "silva_132.species", "name": "Silva version 132", "value": "silva_132"}, "dada2_taxonomy": {"path": "silva_132.taxonomy", "name": "Silva version 132", "value": "silva_132", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/silvaeuk132_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/silvaeuk132_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "silva_euk_18S_132.taxonomy", "name": "Silva version 132 Eukaryotic 18S", "value": "silva_euk_18S_132", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/unite8fungi_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/unite8fungi_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "unite_8.0_fungi.taxonomy", "name": "UNITE: General Fasta release 8.0 for Fungi", "value": "unite_8.0_fungi", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b test-data/unite8fungisingletons_json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/unite8fungisingletons_json Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,1 @@ +{"data_tables": {"dada2_taxonomy": {"path": "unite_8.0_fungi_singletons.taxonomy", "name": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", "value": "unite_8.0_fungi_singletons", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} \ No newline at end of file diff -r 000000000000 -r f57c13f5878b tool-data/dada2_species.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dada2_species.loc.sample Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for species assignment, using three +# tab separated columns: +# +# +# +# Datasets can be retrieved from http://busco.ezlab.org/frame_wget.html +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html diff -r 000000000000 -r f57c13f5878b tool-data/dada2_taxonomy.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dada2_taxonomy.loc.sample Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for taxonomy assignment, using three +# tab separated columns: +# +# +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html +# +# taxlevels is a comma separated list of taxonomy levels diff -r 000000000000 -r f57c13f5878b tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,11 @@ + + + + value, name, path + +
+ + value, name, path, taxlevels + +
+
diff -r 000000000000 -r f57c13f5878b tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Fri Nov 08 18:47:32 2019 -0500 @@ -0,0 +1,11 @@ + + + + value, name, path + +
+ + value, name, path, taxlevels + +
+