Mercurial > repos > iuc > data_manager_dada2
comparison data_manager/data_manager.py @ 0:f57c13f5878b draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_dada2 commit f8b6b6e72914ad6bcca8423dfa03f59bde80992e"
author | iuc |
---|---|
date | Fri, 08 Nov 2019 18:47:32 -0500 |
parents | |
children | bf7b2c14cabc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f57c13f5878b |
---|---|
1 import argparse | |
2 import json | |
3 import os | |
4 try: | |
5 # For Python 3.0 and later | |
6 from urllib.request import Request, urlopen | |
7 except ImportError: | |
8 # Fall back to Python 2 imports | |
9 from urllib2 import Request, urlopen | |
10 | |
# Taxonomic ranks recorded for a data set unless FILE2TAXLEVELS overrides them.
DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species"

# Data set identifier -> human readable name stored in the data table entry.
FILE2NAME = {
    "silva_132": "Silva version 132",
    "silva_128": "Silva version 128",
    "rdp_16": "RDP trainset 16",
    "rdp_14": "RDP trainset 14",
    "greengenes_13.84": "GreenGenes version 13.84",
    "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
    "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
    "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
    "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)",
    "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)",
    "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S",
    "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1",
}

# Data set identifier -> download URL of the taxonomy training set.
# The UNITE entries point to zip archives; all other entries are gzipped fasta files.
FILE2TAXURL = {
    "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
    "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
    "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
    "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
    "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
    "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
    "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
    "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
    "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
    "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
    "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1",
    "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz",
}

# Data set identifier -> download URL of the species assignment file.
# Only data sets listed here get an additional "dada2_species" entry.
FILE2SPECIESURL = {
    "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
    "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
    "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
    "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1",
}

# Data set identifier -> taxonomic ranks, for data sets deviating from DEFAULT_TAXLEVELS.
FILE2TAXLEVELS = {
    "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species",
}
53 | |
54 | |
def url_download(url, fname, workdir):
    """
    download url to workdir/fname

    workdir is created (including missing parent directories) if it does
    not exist yet.

    If fname starts with "unite" the download is treated as a zip archive:
    it is extracted into workdir and workdir/fname is overwritten with a
    gzip compressed copy of the single top level *fasta file.

    url -- location to fetch (anything urlopen accepts)
    fname -- name of the file to create inside workdir
    workdir -- target directory

    raises Exception if a UNITE archive does not contain exactly one
    top level *fasta file; errors from urlopen and file IO propagate
    """
    # local imports: keeps the block independent of the module level import list
    import glob
    import gzip
    import shutil
    import zipfile

    file_path = os.path.join(workdir, fname)
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    src = urlopen(Request(url))
    try:
        with open(file_path, 'wb') as dst:
            # stream the response in chunks instead of reading it into memory
            shutil.copyfileobj(src, dst)
    finally:
        # explicit close because the Python 2 response object is no context manager
        src.close()

    # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
    if fname.startswith("unite"):
        # unzip download; the context manager closes the archive even if extraction fails
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(workdir)
        # gzip top level fasta file (the glob does not descend into developer/)
        fastas = glob.glob("%s/*fasta" % workdir)
        if len(fastas) != 1:
            msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas))
            raise Exception(msg)
        # the downloaded zip at file_path is replaced by the gzipped fasta
        with open(fastas[0], 'rb') as f_in:
            with gzip.open(file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
96 | |
97 | |
def remote_dataset(dataset, outjson):
    """
    download the reference files of a data set and write the data manager json

    outjson is first read as JSON to get the target directory
    (output_data[0].extra_files_path) and is finally overwritten with the
    resulting data table entries.

    dataset -- data set identifier, must be a key of FILE2TAXURL and FILE2NAME
    outjson -- path of the JSON file that is read and then overwritten

    raises KeyError for an unknown dataset and OSError if the target
    directory can not be created (os.mkdir fails if it already exists)
    """
    with open(outjson) as jf:
        params = json.load(jf)

    workdir = params['output_data'][0]['extra_files_path']
    os.mkdir(workdir)
    url_download(FILE2TAXURL[dataset], dataset + ".taxonomy", workdir)

    data_manager_json = {"data_tables": {}}
    # path is relative: the file was stored as workdir/<dataset>.taxonomy
    data_manager_json["data_tables"]["dada2_taxonomy"] = {
        'value': dataset,
        'name': FILE2NAME[dataset],
        'path': dataset + ".taxonomy",
        'taxlevels': FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS),
    }

    # only some data sets provide a separate species assignment file
    if FILE2SPECIESURL.get(dataset):
        url_download(FILE2SPECIESURL[dataset], dataset + ".species", workdir)
        data_manager_json["data_tables"]["dada2_species"] = {
            'value': dataset,
            'name': FILE2NAME[dataset],
            'path': dataset + ".species",
        }

    # open() instead of file(): the file builtin does not exist in Python 3
    with open(outjson, 'w') as jf:
        jf.write(json.dumps(data_manager_json))
125 | |
126 | |
if __name__ == '__main__':
    # command line entry point: download one data set and write the data manager json
    parser = argparse.ArgumentParser(description='Create data manager json.')
    # both options are mandatory: without them remote_dataset fails with an
    # unhelpful traceback instead of a usage message
    parser.add_argument('--out', action='store', required=True, help='JSON filename')
    parser.add_argument('--dataset', action='store', required=True, help='Download data set name')
    args = parser.parse_args()

    remote_dataset(args.dataset, args.out)