comparison data_manager/bakta_build_database.py @ 3:3e73c97f025d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 487cb35fe55883ac6eeb8dda58b56c9ca2ec0a85
author iuc
date Fri, 23 Jun 2023 21:37:05 +0000
parents bb463043c93e
children d74850cf4e42
comparison
equal deleted inserted replaced
2:adfd6bf710bd 3:3e73c97f025d
1 import argparse 1 import argparse
2 import hashlib 2 import hashlib
3 import json 3 import json
4 import os 4 import os
5 import re
5 import sys 6 import sys
6 import tarfile 7 import tarfile
7 from datetime import datetime 8 from datetime import datetime
8 from pathlib import Path 9 from pathlib import Path
9 10
14 class GetBaktaDatabaseInfo: 15 class GetBaktaDatabaseInfo:
15 """ 16 """
16 Extract bakta database information to make a json file for data_manager 17 Extract bakta database information to make a json file for data_manager
17 """ 18 """
18 19
19 def __init__(self, 20 def __init__(
20 data_table_name="bakta_database", 21 self,
21 db_name=Path.cwd().joinpath("db"), 22 data_table_name="bakta_database",
22 db_version="latest", 23 db_name=Path.cwd().joinpath("db"),
23 test_mode=False): 24 db_version="latest",
25 tarball_name="db.tar.gz",
26 test_mode=False,
27 ):
24 self.bakta_table_list = None 28 self.bakta_table_list = None
25 self.db_url = None 29 self.db_url = None
30 self.db_type = ""
26 self.data_table_entry = None 31 self.data_table_entry = None
27 self.data_table_name = data_table_name 32 self.data_table_name = data_table_name
28 self.db_name = db_name 33 self.db_name = db_name
34 self.tar_name = tarball_name
29 self.db_version = db_version 35 self.db_version = db_version
30 self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json' 36 self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json"
31 self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json' 37 self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json"
32 self.test_mode = test_mode 38 self.test_mode = test_mode
39
40 def get_database_type(self):
41 self.light_db = bool(re.search(pattern="light", string=self.db_version))
42 self.db_version = self.db_version.split(sep="_")[0]
43 if self.light_db:
44 self.db_type = "light"
45 self.tar_name = "db-light.tar.gz"
46 self.md5 = self.fetch_db_versions()["md5-light"]
47 else:
48 self.md5 = self.fetch_db_versions()["md5"]
33 49
34 def get_data_table_format(self): 50 def get_data_table_format(self):
35 """ 51 """
36 Skeleton of a data_table format 52 Skeleton of a data_table format
37 return: a data table formated for json output 53 return: a data table formated for json output
38 """ 54 """
39 self.data_table_entry = { 55 self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
40 "data_tables": {
41 self.data_table_name: {}
42 }
43 }
44 return self.data_table_entry 56 return self.data_table_entry
45 57
46 def fetch_db_versions(self, db_version="latest"): 58 def fetch_db_versions(self):
47 """ 59 """
48 List bakta database info related to the db_version selected 60 List bakta database info related to the db_version selected
49 """ 61 """
50 if self.test_mode is True: 62
63 if self.test_mode:
51 self.DB_VERSIONS_URL = self.DB_TEST_URL 64 self.DB_VERSIONS_URL = self.DB_TEST_URL
52 try: 65 try:
53 with requests.get(self.DB_VERSIONS_URL) as resp: 66 with requests.get(self.DB_VERSIONS_URL) as resp:
54 versions = json.loads(resp.content) 67 versions = json.loads(resp.content)
55 except IOError as e: 68 except IOError as e:
56 print(e, file=sys.stderr) 69 print(e, file=sys.stderr)
57 raise e 70 raise e
58 else: 71
59 if db_version == "latest": 72 if self.db_version == "latest":
60 db_date_list = [] 73 db_date_list = []
61 for db_dic in versions: 74 for db_dic in versions:
62 db_date_list.append(datetime.strptime(db_dic["date"], 75 db_date_list.append(
63 '%Y-%m-%d').date()) 76 datetime.strptime(db_dic["date"], "%Y-%m-%d").date()
64 filtered_version = max(versions, key=lambda x: x['date']) 77 )
65 else: 78 filtered_version = max(versions, key=lambda x: x["date"])
66 filtered_version = None 79 else:
67 for item in versions: 80 filtered_version = None
68 if '{0}.{1}'.format(item["major"], item["minor"]) == db_version: 81 for item in versions:
69 filtered_version = item 82 if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version:
70 break 83 filtered_version = item
71 if filtered_version is None: 84 break
72 print("No matching version detected in the list") 85 if filtered_version is None:
73 if filtered_version is not None: 86 print("No matching version detected in the list")
74 self.db_url = f"https://zenodo.org/record/" \ 87 else:
75 f"{filtered_version['record']}/files/db.tar.gz" 88 self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}"
76 self.db_version = db_version 89 return filtered_version
77 return filtered_version
78 90
79 def get_data_manager(self, bakta_database_info): 91 def get_data_manager(self, bakta_database_info):
80 self.bakta_table_list = self.get_data_table_format() 92 self.bakta_table_list = self.get_data_table_format()
81 bakta_name = f"V{bakta_database_info['major']}." \ 93 bakta_name = (
82 f"{bakta_database_info['minor']}_" \ 94 f"V{bakta_database_info['major']}."
83 f"{bakta_database_info['date']}" 95 f"{bakta_database_info['minor']}{self.db_type}_"
84 tool_version = str(f"{bakta_database_info['software-min']['major']}." 96 f"{bakta_database_info['date']}"
85 f"{bakta_database_info['software-min']['minor']}") 97 )
86 data_info = dict(value=bakta_name, 98 tool_version = str(
87 dbkey=bakta_database_info['record'], 99 f"{bakta_database_info['software-min']['major']}."
88 bakta_version=tool_version, 100 f"{bakta_database_info['software-min']['minor']}"
89 path="db") 101 )
102 data_info = dict(
103 value=bakta_name,
104 dbkey=bakta_database_info["record"],
105 bakta_version=tool_version,
106 path="db",
107 )
90 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] 108 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
91 return self.bakta_table_list 109 return self.bakta_table_list
92 110
93 111
94 class InstallBaktaDatabase(GetBaktaDatabaseInfo): 112 class InstallBaktaDatabase(GetBaktaDatabaseInfo):
96 Download the bakta database, 114 Download the bakta database,
97 check md5 sum, 115 check md5 sum,
98 untar the download db and update for the amrfinderplus database 116 untar the download db and update for the amrfinderplus database
99 """ 117 """
100 118
101 def __init__(self, 119 def __init__(
102 db_dir=Path.cwd(), 120 self, db_dir=Path.cwd(), db_name="bakta", db_version="latest", test_mode=False
103 db_name="bakta", 121 ):
104 tarball_name="db.tar.gz",
105 test_mode=False):
106 super().__init__() 122 super().__init__()
107 self.md5 = None 123 self.md5 = None
124 self.db_version = db_version
108 self.db_dir = db_dir 125 self.db_dir = db_dir
109 self.db_name = db_name 126 self.db_name = db_name
110 self.tarball_name = tarball_name 127 self.tarball_path = ""
111 self.tarball_path = None
112 self.test_mode = test_mode 128 self.test_mode = test_mode
129 self.get_database_type()
113 130
114 def download(self): 131 def download(self):
115 self.db_name = f'{self.db_name}_{self.db_version}' 132 self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}"
116 bakta_path = Path(self.db_dir).joinpath(self.tarball_name) 133 bakta_path = Path(self.db_dir).joinpath(self.tar_name)
117 try: 134 try:
118 with bakta_path.open('wb') as fh_out, \ 135 with bakta_path.open("wb") as fh_out, requests.get(
119 requests.get(self.db_url, stream=True) as resp: 136 self.db_url, stream=True) as resp:
120 total_length = resp.headers.get('content-length') 137 total_length = resp.headers.get("content-length")
121 if total_length is None: # no content length header 138 if total_length is None: # no content length header
122 for data in resp.iter_content(chunk_size=1024 * 1024): 139 for data in resp.iter_content(chunk_size=1024 * 1024):
123 fh_out.write(data) 140 fh_out.write(data)
124 else: 141 else:
125 for data in resp.iter_content(chunk_size=1024 * 1024): 142 for data in resp.iter_content(chunk_size=1024 * 1024):
126 fh_out.write(data) 143 fh_out.write(data)
127 print(f'Download bakta database {self.db_version}') 144 print(f"Download bakta database {self.db_version}")
128 self.tarball_path = bakta_path 145 self.tarball_path = bakta_path
129 except IOError: 146 except IOError:
130 print(f'ERROR: Could not download file from Zenodo!' 147 print(
131 f' url={self.db_url}, path={self.tarball_name}') 148 f"ERROR: Could not download file from Zenodo!"
149 f" url={self.db_url}, to={self.tarball_path}"
150 )
132 151
133 def untar(self): 152 def untar(self):
134 db_path = Path(self.db_dir).as_posix() 153 db_path = Path(self.db_dir).as_posix()
135 try: 154 try:
136 with self.tarball_path.open('rb') as fh_in, \ 155 with self.tarball_path.open("rb") as fh_in, tarfile.open(
137 tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file: 156 fileobj=fh_in, mode="r:gz"
157 ) as tar_file:
138 tar_file.extractall(path=db_path) 158 tar_file.extractall(path=db_path)
139 print(f'Untar the database in {db_path}') 159 print(f"Untar the database in {db_path}")
140 return db_path 160 return db_path
141 except OSError: 161 except OSError:
142 sys.exit(f'ERROR: Could not extract {self.tarball_name} ' 162 sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {self.db_name}")
143 f'to {self.db_name}')
144 163
145 def calc_md5_sum(self, buffer_size=1048576): 164 def calc_md5_sum(self, buffer_size=1048576):
146 tarball_path = Path(self.db_dir).joinpath(self.tarball_name) 165 tarball_path = Path(self.db_dir).joinpath(self.tar_name)
147 self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"]
148 md5 = hashlib.md5() 166 md5 = hashlib.md5()
149 with tarball_path.open('rb') as fh: 167 with tarball_path.open("rb") as fh:
150 data = fh.read(buffer_size) 168 data = fh.read(buffer_size)
151 while data: 169 while data:
152 md5.update(data) 170 md5.update(data)
153 data = fh.read(buffer_size) 171 data = fh.read(buffer_size)
154 if md5.hexdigest() == self.md5: 172 if md5.hexdigest() == self.md5:
155 print('\t...md5 control database OK') 173 print("\t...md5 control database OK")
156 else: 174 else:
157 print(f"Error: corrupt database file! " 175 print(
158 f"calculated md5 = {md5.hexdigest()}" 176 f"Error: corrupt database file! "
159 f" different from {self.md5} ") 177 f"calculated md5 = {md5.hexdigest()}"
160 178 f" different from {self.md5} "
161 179 )
162 """
163 This is the method to download the amrfinderplus database need by bakta.
164 Deprecated to use the amrfinderplus data_manager
165 def update_amrfinderplus_db(self):
166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db"
167 if self.db_version == "test":
168 cmd = [
169 'amrfinder_update',
170 '--database', str(amrfinderplus_db_path),
171 '--force_update',
172 '--help'
173 ]
174 else:
175 cmd = [
176 'amrfinder_update',
177 '--database', str(amrfinderplus_db_path),
178 '--force_update'
179 ]
180 proc = sp.run(
181 cmd,
182 universal_newlines=True
183 )
184 if proc.returncode != 0:
185 print(f"ERROR: AMRFinderPlus failed! "
186 f"command: 'amrfinder_update --force_update"
187 f" --database {amrfinderplus_db_path}'")
188 else:
189 print("AMRFinderPlus database download")
190 """
191 180
192 181
193 def parse_arguments(): 182 def parse_arguments():
194 # parse options and arguments 183 # parse options and arguments
195 arg_parser = argparse.ArgumentParser() 184 arg_parser = argparse.ArgumentParser()
196 arg_parser.add_argument("data_manager_json") 185 arg_parser.add_argument("data_manager_json")
197 arg_parser.add_argument("-d", "--database_version", 186 arg_parser.add_argument(
198 help='Select the database version ' 187 "-d",
199 '(major and minor eg. 4.0),' 188 "--database_version",
200 'default is the latest version', 189 help="Select the database version "
201 default="latest", 190 "(major and minor eg. 4.0),"
202 required=True) 191 "default is the latest version",
203 arg_parser.add_argument("-t", "--test", action='store_true', 192 default="latest",
204 help="option to test the script with an empty database") 193 required=True,
194 )
195 arg_parser.add_argument(
196 "-t",
197 "--test",
198 action="store_true",
199 help="option to test the script with an empty database",
200 )
205 return arg_parser.parse_args() 201 return arg_parser.parse_args()
206 202
207 203
208 def main(): 204 def main():
209 all_args = parse_arguments() 205 all_args = parse_arguments()
210 with open(all_args.data_manager_json) as fh: 206 with open(all_args.data_manager_json) as fh:
211 params = json.load(fh) 207 params = json.load(fh)
212 target_dir = params['output_data'][0]['extra_files_path'] 208 target_dir = params["output_data"][0]["extra_files_path"]
213 os.makedirs(target_dir) 209 os.makedirs(target_dir)
214 # init the class to download bakta db 210 # init the class to download bakta db
215 bakta_upload = InstallBaktaDatabase(test_mode=all_args.test) 211 bakta_upload = InstallBaktaDatabase(
216 bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version) 212 test_mode=all_args.test, db_version=all_args.database_version
213 )
214 bakta_db = bakta_upload.fetch_db_versions()
217 # update the path for galaxy 215 # update the path for galaxy
218 bakta_upload.db_dir = target_dir 216 bakta_upload.db_dir = target_dir
219 # download the database 217 # download the database
220 bakta_upload.download() 218 bakta_upload.download()
221 # check md5 sum 219 # check md5 sum
222 bakta_upload.calc_md5_sum() 220 bakta_upload.calc_md5_sum()
223 # untar db 221 # untar db
224 bakta_upload.untar() 222 bakta_upload.untar()
225 # make the data_manager metadata 223 # make the data_manager metadata
226 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) 224 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
227 with open(all_args.data_manager_json, 'w') as fh: 225 with open(all_args.data_manager_json, "w") as fh:
228 json.dump(bakta_data_manager, fh, sort_keys=True) 226 json.dump(bakta_data_manager, fh, sort_keys=True)
229 227
230 228
231 if __name__ == '__main__': 229 if __name__ == "__main__":
232 main() 230 main()