comparison data_manager/gtdbtk_database_installer.py @ 2:6ab422fba1a3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit ad14947c3e13babe90a6878b45608fe56a16150d
author iuc
date Tue, 13 Aug 2024 21:13:43 +0000
parents 629464b96c2e
children c4830a9870fa
comparison
equal deleted inserted replaced
1:2814c058a087 2:6ab422fba1a3
#!/usr/bin/env python

import argparse
import gzip
import json
import os
import shutil
import sys
import tarfile
from datetime import datetime, timezone
from urllib.parse import urlparse
from urllib.request import Request, urlopen
# Pin the download URLs per GTDB release so admins only select a release
# number — less error potential than hand-typing URLs.
# Each release maps to three artifacts:
#   "full"     - complete GTDB-Tk reference-data tarball
#   "meta_ar"  - archaeal taxonomy metadata table (tsv.gz)
#   "meta_bac" - bacterial taxonomy metadata table (tsv.gz)
# NOTE: "auxillary_files" is GTDB's actual (misspelled) path — do not "fix" it.
urls = {
    "202": {
        "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz",
        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz",
    },
    "207": {
        "full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz",
        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz",
        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz",
    },
    "214": {
        "full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz",
        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/ar53_taxonomy_r214.tsv.gz",
        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/bac120_taxonomy_r214.tsv.gz",
    },
    "220": {
        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
    },
    "test": {  # using VERSION to check if files are there
        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
    },
}
12 42
13 43
14 def url_download(url, target_directory): 44 def url_download(url, target_directory):
15 url_parts = urlparse(url) 45 url_parts = urlparse(url)
16 tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path))) 46 tarball = os.path.abspath(
47 os.path.join(target_directory, os.path.basename(url_parts.path))
48 )
17 src = None 49 src = None
18 dst = None 50 dst = None
19 try: 51 try:
20 req = Request(url) 52 req = Request(url)
21 src = urlopen(req) 53 src = urlopen(req)
22 with open(tarball, 'wb') as dst: 54 with open(tarball, "wb") as dst:
23 while True: 55 while True:
24 chunk = src.read(2**10) 56 chunk = src.read(2**10)
25 if chunk: 57 if chunk:
26 dst.write(chunk) 58 dst.write(chunk)
27 else: 59 else:
30 sys.exit(str(e)) 62 sys.exit(str(e))
31 finally: 63 finally:
32 if src is not None: 64 if src is not None:
33 src.close() 65 src.close()
34 if tarfile.is_tarfile(tarball): 66 if tarfile.is_tarfile(tarball):
35 fh = tarfile.open(tarball, 'r:*') 67 fh = tarfile.open(tarball, "r:*")
36 else: 68 else:
37 return tarball 69 # unzip metadata file
70 if ".gz" in tarball:
71 with gzip.open(tarball, "rb") as f_in:
72 unzipped_file = tarball.strip(".gz")
73 with open(unzipped_file, "wb") as f_out:
74 shutil.copyfileobj(f_in, f_out)
75 os.remove(tarball)
76 folder_of_unzipped_file = os.path.dirname(unzipped_file)
77 return folder_of_unzipped_file
78 else:
79 # this is basically only the return for the test not using a tarfile
80 return tarball
38 fh.extractall(target_directory) 81 fh.extractall(target_directory)
39 fh.close() 82 fh.close()
40 os.remove(tarball) 83 os.remove(tarball)
41 # The tarball extraction will create a directory named 84 # The tarball extraction will create a directory named
42 # something like release202 in the target_directory, so 85 # something like release202 in the target_directory, so
50 shutil.move(item_path, target_directory) 93 shutil.move(item_path, target_directory)
51 os.rmdir(subdir_path) 94 os.rmdir(subdir_path)
52 return target_directory 95 return target_directory
53 96
54 97
def download(database_name, release, meta, test, out_file):
    """Fetch GTDB-Tk data and rewrite the Galaxy data-manager JSON.

    Parameters
    ----------
    database_name : str
        Display name recorded in the data table entry.
    release : str
        GTDB release key; must be one of the keys of ``urls``.
    meta : bool
        If True, download only the two taxonomy metadata tables and
        register them in ``gtdbtk_database_metadata_versioned``;
        otherwise download the full reference-data tarball and register
        it in ``gtdbtk_database_versioned``.
    test : bool
        If True, override ``release`` with the small "test" URLs.
    out_file : str
        Galaxy data-manager JSON file; read to obtain the target
        directory, then overwritten with the resulting entry.
    """
    with open(out_file) as fh:
        params = json.load(fh)

    target_directory = params["output_data"][0]["extra_files_path"]
    # exist_ok: Galaxy may already have created the extra_files_path.
    os.makedirs(target_directory, exist_ok=True)

    if test:
        release = "test"

    # Fail early with a readable message instead of a KeyError traceback.
    if release not in urls:
        sys.exit(
            f"Unknown release {release!r}; expected one of: {', '.join(urls)}"
        )

    # download both taxonomy metadata tables
    if meta:
        file_path = url_download(urls[release]["meta_ar"], target_directory)
        # both tables land in the same directory; the second call's return
        # value is the one recorded in the data table entry
        file_path = url_download(urls[release]["meta_bac"], target_directory)
    # download the full DB
    else:
        file_path = url_download(urls[release]["full"], target_directory)

    # timezone-aware replacement for the deprecated datetime.utcnow()
    time = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    data_manager_entry = {
        "value": f"{database_name}_release_{release}_downloaded_{time}",
        "name": database_name,
        "path": file_path,
        "version": release,
    }

    # store in the dedicated metadata table when only metadata was fetched
    table_name = (
        "gtdbtk_database_metadata_versioned"
        if meta
        else "gtdbtk_database_versioned"
    )
    data_manager_json = {"data_tables": {table_name: data_manager_entry}}

    with open(out_file, "w") as fh:
        json.dump(data_manager_json, fh, sort_keys=True)
73 141
74 142
75 parser = argparse.ArgumentParser() 143 parser = argparse.ArgumentParser()
76 144
77 parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name') 145 parser.add_argument(
78 parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id') 146 "--database_name", dest="database_name", help="GTDB-Tk database display name"
79 parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version') 147 )
80 parser.add_argument('--out_file', dest='out_file', help='JSON output file') 148
149 parser.add_argument("--version", dest="version", help="DB version")
150
151 parser.add_argument(
152 "--release", dest="release", help="Release of the GTDB-Tk database version"
153 )
154 parser.add_argument("--out_file", dest="out_file", help="JSON output file")
155 parser.add_argument(
156 "--meta",
157 dest="meta",
158 action="store_true",
159 help="Store meta data flag",
160 )
161
162 parser.add_argument(
163 "--test",
164 dest="test",
165 action="store_true",
166 help="Run test",
167 )
81 168
82 args = parser.parse_args() 169 args = parser.parse_args()
83 170
84 download(args.database_id, args.database_name, args.url, args.out_file) 171 download(
172 args.database_name,
173 args.release,
174 args.meta,
175 args.test,
176 args.out_file,
177 )