Mercurial > repos > iuc > data_manager_bakta
comparison data_manager/bakta_build_database.py @ 0:a19189a128cb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bakta commit fba6deae1d3707e0c14202433d0495e157745afd
| author | iuc |
|---|---|
| date | Sat, 10 Dec 2022 21:52:28 +0000 |
| parents | |
| children | bb463043c93e |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a19189a128cb |
|---|---|
| 1 import argparse | |
| 2 import hashlib | |
| 3 import json | |
| 4 import os | |
| 5 import sys | |
| 6 import tarfile | |
| 7 from datetime import datetime | |
| 8 from pathlib import Path | |
| 9 | |
| 10 | |
| 11 import requests | |
| 12 | |
| 13 | |
class GetBaktaDatabaseInfo:
    """
    Extract bakta database information to make a json file for data_manager
    """

    def __init__(self,
                 data_table_name="bakta_database",
                 db_name=Path.cwd().joinpath("db"),
                 db_version="latest",
                 test_mode=False):
        """
        :param data_table_name: key used for the data table in the json output
        :param db_name: name (or path) stored for the database
        :param db_version: "latest" or a "<major>.<minor>" version string
        :param test_mode: when True the version list is read from DB_TEST_URL
        """
        # Filled in by get_data_manager()
        self.bakta_table_list = None
        # Filled in by fetch_db_versions() once a version has been selected
        self.db_url = None
        # Filled in by get_data_table_format()
        self.data_table_entry = None
        self.data_table_name = data_table_name
        self.db_name = db_name
        self.db_version = db_version
        self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json'
        self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json'
        self.test_mode = test_mode

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {
            "data_tables": {
                self.data_table_name: {}
            }
        }
        return self.data_table_entry

    @staticmethod
    def _select_version(versions, db_version="latest"):
        """
        Pick one entry out of the list of database descriptions.

        :param versions: list of dicts with at least "date" (YYYY-MM-DD),
                         "major" and "minor" keys
        :param db_version: "latest" or a "<major>.<minor>" version string
        return: the matching dict, or None when nothing matches
                (including an empty list)
        """
        if db_version == "latest":
            # Compare real dates rather than raw strings so that a date
            # without zero padding (e.g. 2022-8-25) is still ordered correctly.
            return max(
                versions,
                key=lambda entry: datetime.strptime(entry["date"],
                                                    '%Y-%m-%d').date(),
                default=None)
        for entry in versions:
            if '{0}.{1}'.format(entry["major"], entry["minor"]) == db_version:
                return entry
        return None

    def fetch_db_versions(self, db_version="latest"):
        """
        List bakta database info related to the db_version selected

        Downloads the version list, selects the requested entry and, when
        one is found, stores its download url in self.db_url and the
        requested version in self.db_version.

        :param db_version: "latest" or a "<major>.<minor>" version string
        return: the selected version dict, or None when no version matches
        raises: IOError (requests exceptions derive from it) when the
                version list cannot be downloaded
        """
        if self.test_mode is True:
            self.DB_VERSIONS_URL = self.DB_TEST_URL
        try:
            # The timeout keeps the job from hanging on a stalled connection.
            with requests.get(self.DB_VERSIONS_URL, timeout=60) as resp:
                # Fail on HTTP errors instead of trying to parse an error page
                resp.raise_for_status()
                versions = json.loads(resp.content)
        except IOError as e:
            print(e, file=sys.stderr)
            raise

        filtered_version = self._select_version(versions, db_version)
        if filtered_version is None:
            print("No matching version detected in the list")
        else:
            self.db_url = f"https://zenodo.org/record/" \
                          f"{filtered_version['record']}/files/db.tar.gz"
            self.db_version = db_version
        return filtered_version

    def get_data_manager(self, bakta_database_info):
        """
        Build the data manager json content for one database version

        :param bakta_database_info: version dict as returned by
                                    fetch_db_versions()
        return: dict with a single row for the data table
        """
        self.bakta_table_list = self.get_data_table_format()
        bakta_value = f"V{bakta_database_info['major']}." \
                      f"{bakta_database_info['minor']}_" \
                      f"{bakta_database_info['date']}"
        software_min = bakta_database_info['software-min']
        tool_version = f"{software_min['major']}.{software_min['minor']}"
        data_info = dict(value=bakta_database_info['record'],
                         dbkey=bakta_value,
                         bakta_version=tool_version,
                         path="db")
        self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.bakta_table_list
| 92 | |
| 93 | |
class InstallBaktaDatabase(GetBaktaDatabaseInfo):
    """
    Download the bakta database,
    check md5 sum and
    untar the downloaded db
    """

    def __init__(self,
                 db_dir=Path.cwd(),
                 db_name="bakta",
                 tarball_name="db.tar.gz",
                 test_mode=False):
        """
        :param db_dir: directory receiving the tarball and the extracted db
        :param db_name: base name of the database
        :param tarball_name: file name used for the downloaded archive
        :param test_mode: forwarded to the parent class
        """
        super().__init__(test_mode=test_mode)
        # Expected checksum, filled in by calc_md5_sum()
        self.md5 = None
        self.db_dir = db_dir
        self.db_name = db_name
        self.tarball_name = tarball_name
        # Path of the downloaded archive, filled in by download()
        self.tarball_path = None
        self.test_mode = test_mode

    def download(self):
        """
        Download the archive at self.db_url into db_dir/tarball_name

        Exits the program when the download fails, because every later
        step needs the archive.
        """
        self.db_name = f'{self.db_name}_{self.db_version}'
        bakta_path = Path(self.db_dir).joinpath(self.tarball_name)
        try:
            with requests.get(self.db_url, stream=True) as resp:
                # Do not save an HTTP error page as if it were the archive
                resp.raise_for_status()
                with bakta_path.open('wb') as fh_out:
                    for data in resp.iter_content(chunk_size=1024 * 1024):
                        fh_out.write(data)
        except IOError:
            sys.exit(f'ERROR: Could not download file from Zenodo!'
                     f' url={self.db_url}, path={self.tarball_name}')
        print(f'Download bakta database {self.db_version}')
        self.tarball_path = bakta_path

    def untar(self):
        """
        Extract the downloaded archive into db_dir

        return: db_dir as a posix path string
        Exits the program when the archive cannot be read or extracted.
        """
        db_path = Path(self.db_dir).as_posix()
        # Fall back to the default location when download() was not run
        # on this instance.
        tarball_path = self.tarball_path
        if tarball_path is None:
            tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
        try:
            # NOTE(review): extractall() trusts the member paths of the
            # archive; consider an extraction filter if the source is ever
            # not fully trusted.
            with tarball_path.open('rb') as fh_in, \
                    tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
                tar_file.extractall(path=db_path)
        except (OSError, tarfile.TarError):
            # tarfile.TarError is not an OSError, so a corrupt archive
            # must be caught explicitly.
            sys.exit(f'ERROR: Could not extract {self.tarball_name} '
                     f'to {self.db_name}')
        print(f'Untar the database in {db_path}')
        return db_path

    def calc_md5_sum(self, buffer_size=1048576):
        """
        Compare the md5 sum of the downloaded archive with the published one

        :param buffer_size: number of bytes read from the archive per step
        Exits the program when the version cannot be found or when the
        checksum does not match.
        """
        tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
        version_info = self.fetch_db_versions(db_version=self.db_version)
        if version_info is None:
            sys.exit(f'ERROR: no md5 sum available for bakta database '
                     f'version {self.db_version}')
        self.md5 = version_info["md5"]
        md5 = hashlib.md5()
        with tarball_path.open('rb') as fh:
            for data in iter(lambda: fh.read(buffer_size), b''):
                md5.update(data)
        if md5.hexdigest() != self.md5:
            # A corrupt archive must not be extracted and registered
            sys.exit(f"Error: corrupt database file! "
                     f"calculated md5 = {md5.hexdigest()}"
                     f" different from {self.md5} ")
        print('\t...md5 control database OK')
| 160 | |
| 161 | |
| 162 """ | |
| 163 This is the method to download the amrfinderplus database needed by bakta. | |
| 164 Deprecated: use the amrfinderplus data_manager instead. | |
| 165 def update_amrfinderplus_db(self): | |
| 166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db" | |
| 167 if self.db_version == "test": | |
| 168 cmd = [ | |
| 169 'amrfinder_update', | |
| 170 '--database', str(amrfinderplus_db_path), | |
| 171 '--force_update', | |
| 172 '--help' | |
| 173 ] | |
| 174 else: | |
| 175 cmd = [ | |
| 176 'amrfinder_update', | |
| 177 '--database', str(amrfinderplus_db_path), | |
| 178 '--force_update' | |
| 179 ] | |
| 180 proc = sp.run( | |
| 181 cmd, | |
| 182 universal_newlines=True | |
| 183 ) | |
| 184 if proc.returncode != 0: | |
| 185 print(f"ERROR: AMRFinderPlus failed! " | |
| 186 f"command: 'amrfinder_update --force_update" | |
| 187 f" --database {amrfinderplus_db_path}'") | |
| 188 else: | |
| 189 print("AMRFinderPlus database download") | |
| 190 """ | |
| 191 | |
| 192 | |
def parse_arguments():
    """
    Parse the command line of the data manager script

    return: namespace with data_manager_json (path of the json file),
            database_version ("latest" unless -d is given) and
            test (True when -t is given)
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json")
    # Not marked as required: the option has a default, as the help says.
    arg_parser.add_argument("-d", "--database_version",
                            help='Select the database version '
                                 '(major and minor eg. 4.0), '
                                 'default is the latest version',
                            default="latest")
    arg_parser.add_argument("-t", "--test", action='store_true',
                            help="option to test the script with an empty database")
    return arg_parser.parse_args()
| 206 | |
| 207 | |
def main():
    """
    Download a bakta database and write the data manager json file

    Reads the target directory from the json file given on the command
    line, downloads, checks and extracts the selected database there,
    then overwrites the json file with the data table entry.
    """
    all_args = parse_arguments()
    with open(all_args.data_manager_json) as fh:
        params = json.load(fh)
    target_dir = params['output_data'][0]['extra_files_path']
    # The directory may already exist; that is not an error.
    os.makedirs(target_dir, exist_ok=True)
    # init the class to download bakta db
    bakta_upload = InstallBaktaDatabase(test_mode=all_args.test)
    bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version)
    if bakta_db is None:
        # Without a version entry there is no url to download from
        sys.exit(f'ERROR: bakta database version '
                 f'{all_args.database_version} not found')
    # update the path for galaxy
    bakta_upload.db_dir = target_dir
    # download the database
    bakta_upload.download()
    # check md5 sum
    bakta_upload.calc_md5_sum()
    # untar db
    bakta_upload.untar()
    # make the data_manager metadata
    bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
    with open(all_args.data_manager_json, 'w') as fh:
        json.dump(bakta_data_manager, fh, sort_keys=True)
| 230 | |
# Run the data manager only when the file is executed as a script.
if __name__ == "__main__":
    main()
