comparison data_manager/bakta_build_database.py @ 0:a19189a128cb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bakta commit fba6deae1d3707e0c14202433d0495e157745afd
author iuc
date Sat, 10 Dec 2022 21:52:28 +0000
parents
children bb463043c93e
comparison
equal deleted inserted replaced
-1:000000000000 0:a19189a128cb
1 import argparse
2 import hashlib
3 import json
4 import os
5 import sys
6 import tarfile
7 from datetime import datetime
8 from pathlib import Path
9
10
11 import requests
12
13
class GetBaktaDatabaseInfo:
    """
    Extract bakta database information to make a json file for data_manager
    """

    def __init__(self,
                 data_table_name="bakta_database",
                 db_name=Path.cwd().joinpath("db"),
                 db_version="latest",
                 test_mode=False):
        """
        data_table_name: key used for the entry list in the data table json
        db_name: name (or path) of the database; note that the default is
                 computed once, when the class is defined
        db_version: "latest" or a "<major>.<minor>" string
        test_mode: when True, the version list is read from DB_TEST_URL
        """
        self.bakta_table_list = None
        self.db_url = None
        self.data_table_entry = None
        self.data_table_name = data_table_name
        self.db_name = db_name
        self.db_version = db_version
        self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json'
        self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json'
        self.test_mode = test_mode

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {
            "data_tables": {
                self.data_table_name: {}
            }
        }
        return self.data_table_entry

    @staticmethod
    def _select_version(versions, db_version):
        """
        Pick one entry of the version list.
        versions: list of dicts with at least "date", "major" and "minor"
        db_version: "latest" or a "<major>.<minor>" string
        return: the matching dict, or None when nothing matches
        """
        if db_version == "latest":
            # Compare parsed dates, not raw strings, so that a date written
            # without zero padding cannot win a lexicographic comparison.
            return max(
                versions,
                key=lambda entry: datetime.strptime(entry["date"],
                                                    '%Y-%m-%d').date(),
                default=None)
        for entry in versions:
            if '{0}.{1}'.format(entry["major"], entry["minor"]) == db_version:
                return entry
        return None

    def fetch_db_versions(self, db_version="latest"):
        """
        List bakta database info related to the db_version selected.
        On success self.db_url and self.db_version are updated.
        return: the dict describing the selected version, or None when
                no version matches
        raise: IOError (after printing it to stderr) when the version list
               cannot be retrieved
        """
        if self.test_mode is True:
            self.DB_VERSIONS_URL = self.DB_TEST_URL
        try:
            # requests waits forever unless a timeout is given
            with requests.get(self.DB_VERSIONS_URL, timeout=60) as resp:
                # fail on an HTTP error instead of parsing an error page
                resp.raise_for_status()
                versions = json.loads(resp.content)
        except IOError as e:
            print(e, file=sys.stderr)
            raise
        filtered_version = self._select_version(versions, db_version)
        if filtered_version is None:
            print("No matching version detected in the list", file=sys.stderr)
        else:
            self.db_url = f"https://zenodo.org/record/" \
                          f"{filtered_version['record']}/files/db.tar.gz"
            self.db_version = db_version
        return filtered_version

    def get_data_manager(self, bakta_database_info):
        """
        Build the data table content for one database version.
        bakta_database_info: dict as returned by fetch_db_versions
        return: the data table dict holding a single entry
        """
        self.bakta_table_list = self.get_data_table_format()
        bakta_value = f"V{bakta_database_info['major']}." \
                      f"{bakta_database_info['minor']}_" \
                      f"{bakta_database_info['date']}"
        software_min = bakta_database_info['software-min']
        tool_version = f"{software_min['major']}.{software_min['minor']}"
        data_info = dict(value=bakta_database_info['record'],
                         dbkey=bakta_value,
                         bakta_version=tool_version,
                         path="db")
        self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.bakta_table_list
92
93
class InstallBaktaDatabase(GetBaktaDatabaseInfo):
    """
    Download the bakta database,
    check md5 sum,
    untar the downloaded db
    (the amrfinderplus database is no longer updated by this class)
    """

    def __init__(self,
                 db_dir=Path.cwd(),
                 db_name="bakta",
                 tarball_name="db.tar.gz",
                 test_mode=False):
        """
        db_dir: directory receiving the tarball and the extracted files
        db_name: prefix of the database name
        tarball_name: file name of the downloaded archive inside db_dir
        test_mode: when True, the test version list is used
        """
        # The parent is initialised with its defaults; db_name and
        # test_mode are then overridden below.
        super().__init__()
        self.md5 = None
        self.db_dir = db_dir
        self.db_name = db_name
        self.tarball_name = tarball_name
        self.tarball_path = None
        self.test_mode = test_mode

    def download(self):
        """
        Stream the archive at self.db_url into db_dir/tarball_name and
        record its location in self.tarball_path.
        Exits the program when the download fails.
        """
        self.db_name = f'{self.db_name}_{self.db_version}'
        bakta_path = Path(self.db_dir).joinpath(self.tarball_name)
        try:
            # requests waits forever unless a timeout is given
            with bakta_path.open('wb') as fh_out, \
                    requests.get(self.db_url, stream=True,
                                 timeout=60) as resp:
                # do not store an HTTP error page as if it were the tarball
                resp.raise_for_status()
                for data in resp.iter_content(chunk_size=1024 * 1024):
                    fh_out.write(data)
        except IOError:
            # Without a tarball none of the following steps can succeed.
            sys.exit(f'ERROR: Could not download file from Zenodo!'
                     f' url={self.db_url}, path={self.tarball_name}')
        print(f'Download bakta database {self.db_version}')
        self.tarball_path = bakta_path

    def untar(self):
        """
        Extract the downloaded archive into db_dir.
        return: db_dir as a posix path string
        Exits the program when the archive cannot be read or extracted.
        """
        db_path = Path(self.db_dir).as_posix()
        tarball_path = self.tarball_path
        if tarball_path is None:
            # download() was not run on this instance: use the location
            # where it would have stored the archive.
            tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
        try:
            with Path(tarball_path).open('rb') as fh_in, \
                    tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
                if hasattr(tarfile, 'data_filter'):
                    # refuse members that would be written outside db_path
                    tar_file.extractall(path=db_path, filter='data')
                else:
                    tar_file.extractall(path=db_path)
        except (OSError, tarfile.TarError):
            # tarfile reports damaged archives with TarError subclasses,
            # which are not OSError
            sys.exit(f'ERROR: Could not extract {self.tarball_name} '
                     f'to {self.db_name}')
        print(f'Untar the database in {db_path}')
        return db_path

    def calc_md5_sum(self, buffer_size=1048576):
        """
        Compare the md5 sum of the downloaded archive with the one listed
        for self.db_version (stored in self.md5).
        buffer_size: number of bytes read from the archive per iteration
        Exits the program when the sums differ or when the version is
        not listed.
        """
        tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
        version_info = self.fetch_db_versions(db_version=self.db_version)
        if version_info is None:
            sys.exit(f"ERROR: no md5 sum found for database "
                     f"version {self.db_version}")
        self.md5 = version_info["md5"]
        md5 = hashlib.md5()
        with tarball_path.open('rb') as fh:
            for data in iter(lambda: fh.read(buffer_size), b''):
                md5.update(data)
        digest = md5.hexdigest()
        if digest != self.md5:
            # a corrupt archive must not be extracted and registered
            sys.exit(f"Error: corrupt database file! "
                     f"calculated md5 = {digest}"
                     f" different from {self.md5} ")
        print('\t...md5 control database OK')
160
161
162 """
163 This is the method to download the amrfinderplus database needed by bakta.
164 Deprecated in favour of the amrfinderplus data_manager
165 def update_amrfinderplus_db(self):
166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db"
167 if self.db_version == "test":
168 cmd = [
169 'amrfinder_update',
170 '--database', str(amrfinderplus_db_path),
171 '--force_update',
172 '--help'
173 ]
174 else:
175 cmd = [
176 'amrfinder_update',
177 '--database', str(amrfinderplus_db_path),
178 '--force_update'
179 ]
180 proc = sp.run(
181 cmd,
182 universal_newlines=True
183 )
184 if proc.returncode != 0:
185 print(f"ERROR: AMRFinderPlus failed! "
186 f"command: 'amrfinder_update --force_update"
187 f" --database {amrfinderplus_db_path}'")
188 else:
189 print("AMRFinderPlus database download")
190 """
191
192
def parse_arguments(argv=None):
    """
    Parse options and arguments.
    argv: list of argument strings; the process command line is used
          when None
    return: the argparse namespace with data_manager_json,
            database_version and test
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json")
    # Not marked as required: otherwise the documented default
    # ("latest") could never apply.
    arg_parser.add_argument("-d", "--database_version",
                            help='Select the database version '
                                 '(major and minor eg. 4.0), '
                                 'default is the latest version',
                            default="latest")
    arg_parser.add_argument("-t", "--test", action='store_true',
                            help="option to test the script with an empty database")
    return arg_parser.parse_args(argv)
206
207
def main():
    """
    Download, check and extract the requested bakta database, then
    overwrite the data manager json file given on the command line with
    the resulting data table entry.
    """
    all_args = parse_arguments()
    with open(all_args.data_manager_json) as fh:
        params = json.load(fh)
    target_dir = params['output_data'][0]['extra_files_path']
    # the directory may already be there, e.g. when the job is re-run
    os.makedirs(target_dir, exist_ok=True)
    # init the class to download bakta db
    bakta_upload = InstallBaktaDatabase(test_mode=all_args.test)
    bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version)
    if bakta_db is None:
        # nothing to download: stop here with a clear message
        sys.exit(f"ERROR: bakta database version "
                 f"'{all_args.database_version}' not found")
    # update the path for galaxy
    bakta_upload.db_dir = target_dir
    # download the database
    bakta_upload.download()
    # check md5 sum
    bakta_upload.calc_md5_sum()
    # untar db
    bakta_upload.untar()
    # make the data_manager metadata
    bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
    with open(all_args.data_manager_json, 'w') as fh:
        json.dump(bakta_data_manager, fh, sort_keys=True)
230
# Run the data manager only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()