Mercurial > repos > iuc > data_manager_bakta
comparison data_manager/bakta_build_database.py @ 3:3e73c97f025d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 487cb35fe55883ac6eeb8dda58b56c9ca2ec0a85
| author | iuc |
|---|---|
| date | Fri, 23 Jun 2023 21:37:05 +0000 |
| parents | bb463043c93e |
| children | d74850cf4e42 |
comparison
equal
deleted
inserted
replaced
| 2:adfd6bf710bd | 3:3e73c97f025d |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import hashlib | 2 import hashlib |
| 3 import json | 3 import json |
| 4 import os | 4 import os |
| 5 import re | |
| 5 import sys | 6 import sys |
| 6 import tarfile | 7 import tarfile |
| 7 from datetime import datetime | 8 from datetime import datetime |
| 8 from pathlib import Path | 9 from pathlib import Path |
| 9 | 10 |
| 14 class GetBaktaDatabaseInfo: | 15 class GetBaktaDatabaseInfo: |
| 15 """ | 16 """ |
| 16 Extract bakta database information to make a json file for data_manager | 17 Extract bakta database information to make a json file for data_manager |
| 17 """ | 18 """ |
| 18 | 19 |
| 19 def __init__(self, | 20 def __init__( |
| 20 data_table_name="bakta_database", | 21 self, |
| 21 db_name=Path.cwd().joinpath("db"), | 22 data_table_name="bakta_database", |
| 22 db_version="latest", | 23 db_name=Path.cwd().joinpath("db"), |
| 23 test_mode=False): | 24 db_version="latest", |
| 25 tarball_name="db.tar.gz", | |
| 26 test_mode=False, | |
| 27 ): | |
| 24 self.bakta_table_list = None | 28 self.bakta_table_list = None |
| 25 self.db_url = None | 29 self.db_url = None |
| 30 self.db_type = "" | |
| 26 self.data_table_entry = None | 31 self.data_table_entry = None |
| 27 self.data_table_name = data_table_name | 32 self.data_table_name = data_table_name |
| 28 self.db_name = db_name | 33 self.db_name = db_name |
| 34 self.tar_name = tarball_name | |
| 29 self.db_version = db_version | 35 self.db_version = db_version |
| 30 self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json' | 36 self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json" |
| 31 self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json' | 37 self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json" |
| 32 self.test_mode = test_mode | 38 self.test_mode = test_mode |
| 39 | |
| 40 def get_database_type(self): | |
| 41 self.light_db = bool(re.search(pattern="light", string=self.db_version)) | |
| 42 self.db_version = self.db_version.split(sep="_")[0] | |
| 43 if self.light_db: | |
| 44 self.db_type = "light" | |
| 45 self.tar_name = "db-light.tar.gz" | |
| 46 self.md5 = self.fetch_db_versions()["md5-light"] | |
| 47 else: | |
| 48 self.md5 = self.fetch_db_versions()["md5"] | |
| 33 | 49 |
| 34 def get_data_table_format(self): | 50 def get_data_table_format(self): |
| 35 """ | 51 """ |
| 36 Skeleton of a data_table format | 52 Skeleton of a data_table format |
| 37 return: a data table formated for json output | 53 return: a data table formated for json output |
| 38 """ | 54 """ |
| 39 self.data_table_entry = { | 55 self.data_table_entry = {"data_tables": {self.data_table_name: {}}} |
| 40 "data_tables": { | |
| 41 self.data_table_name: {} | |
| 42 } | |
| 43 } | |
| 44 return self.data_table_entry | 56 return self.data_table_entry |
| 45 | 57 |
| 46 def fetch_db_versions(self, db_version="latest"): | 58 def fetch_db_versions(self): |
| 47 """ | 59 """ |
| 48 List bakta database info related to the db_version selected | 60 List bakta database info related to the db_version selected |
| 49 """ | 61 """ |
| 50 if self.test_mode is True: | 62 |
| 63 if self.test_mode: | |
| 51 self.DB_VERSIONS_URL = self.DB_TEST_URL | 64 self.DB_VERSIONS_URL = self.DB_TEST_URL |
| 52 try: | 65 try: |
| 53 with requests.get(self.DB_VERSIONS_URL) as resp: | 66 with requests.get(self.DB_VERSIONS_URL) as resp: |
| 54 versions = json.loads(resp.content) | 67 versions = json.loads(resp.content) |
| 55 except IOError as e: | 68 except IOError as e: |
| 56 print(e, file=sys.stderr) | 69 print(e, file=sys.stderr) |
| 57 raise e | 70 raise e |
| 58 else: | 71 |
| 59 if db_version == "latest": | 72 if self.db_version == "latest": |
| 60 db_date_list = [] | 73 db_date_list = [] |
| 61 for db_dic in versions: | 74 for db_dic in versions: |
| 62 db_date_list.append(datetime.strptime(db_dic["date"], | 75 db_date_list.append( |
| 63 '%Y-%m-%d').date()) | 76 datetime.strptime(db_dic["date"], "%Y-%m-%d").date() |
| 64 filtered_version = max(versions, key=lambda x: x['date']) | 77 ) |
| 65 else: | 78 filtered_version = max(versions, key=lambda x: x["date"]) |
| 66 filtered_version = None | 79 else: |
| 67 for item in versions: | 80 filtered_version = None |
| 68 if '{0}.{1}'.format(item["major"], item["minor"]) == db_version: | 81 for item in versions: |
| 69 filtered_version = item | 82 if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version: |
| 70 break | 83 filtered_version = item |
| 71 if filtered_version is None: | 84 break |
| 72 print("No matching version detected in the list") | 85 if filtered_version is None: |
| 73 if filtered_version is not None: | 86 print("No matching version detected in the list") |
| 74 self.db_url = f"https://zenodo.org/record/" \ | 87 else: |
| 75 f"{filtered_version['record']}/files/db.tar.gz" | 88 self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}" |
| 76 self.db_version = db_version | 89 return filtered_version |
| 77 return filtered_version | |
| 78 | 90 |
| 79 def get_data_manager(self, bakta_database_info): | 91 def get_data_manager(self, bakta_database_info): |
| 80 self.bakta_table_list = self.get_data_table_format() | 92 self.bakta_table_list = self.get_data_table_format() |
| 81 bakta_name = f"V{bakta_database_info['major']}." \ | 93 bakta_name = ( |
| 82 f"{bakta_database_info['minor']}_" \ | 94 f"V{bakta_database_info['major']}." |
| 83 f"{bakta_database_info['date']}" | 95 f"{bakta_database_info['minor']}{self.db_type}_" |
| 84 tool_version = str(f"{bakta_database_info['software-min']['major']}." | 96 f"{bakta_database_info['date']}" |
| 85 f"{bakta_database_info['software-min']['minor']}") | 97 ) |
| 86 data_info = dict(value=bakta_name, | 98 tool_version = str( |
| 87 dbkey=bakta_database_info['record'], | 99 f"{bakta_database_info['software-min']['major']}." |
| 88 bakta_version=tool_version, | 100 f"{bakta_database_info['software-min']['minor']}" |
| 89 path="db") | 101 ) |
| 102 data_info = dict( | |
| 103 value=bakta_name, | |
| 104 dbkey=bakta_database_info["record"], | |
| 105 bakta_version=tool_version, | |
| 106 path="db", | |
| 107 ) | |
| 90 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] | 108 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] |
| 91 return self.bakta_table_list | 109 return self.bakta_table_list |
| 92 | 110 |
| 93 | 111 |
| 94 class InstallBaktaDatabase(GetBaktaDatabaseInfo): | 112 class InstallBaktaDatabase(GetBaktaDatabaseInfo): |
| 96 Download the bakta database, | 114 Download the bakta database, |
| 97 check md5 sum, | 115 check md5 sum, |
| 98 untar the download db and update for the amrfinderplus database | 116 untar the download db and update for the amrfinderplus database |
| 99 """ | 117 """ |
| 100 | 118 |
| 101 def __init__(self, | 119 def __init__( |
| 102 db_dir=Path.cwd(), | 120 self, db_dir=Path.cwd(), db_name="bakta", db_version="latest", test_mode=False |
| 103 db_name="bakta", | 121 ): |
| 104 tarball_name="db.tar.gz", | |
| 105 test_mode=False): | |
| 106 super().__init__() | 122 super().__init__() |
| 107 self.md5 = None | 123 self.md5 = None |
| 124 self.db_version = db_version | |
| 108 self.db_dir = db_dir | 125 self.db_dir = db_dir |
| 109 self.db_name = db_name | 126 self.db_name = db_name |
| 110 self.tarball_name = tarball_name | 127 self.tarball_path = "" |
| 111 self.tarball_path = None | |
| 112 self.test_mode = test_mode | 128 self.test_mode = test_mode |
| 129 self.get_database_type() | |
| 113 | 130 |
| 114 def download(self): | 131 def download(self): |
| 115 self.db_name = f'{self.db_name}_{self.db_version}' | 132 self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}" |
| 116 bakta_path = Path(self.db_dir).joinpath(self.tarball_name) | 133 bakta_path = Path(self.db_dir).joinpath(self.tar_name) |
| 117 try: | 134 try: |
| 118 with bakta_path.open('wb') as fh_out, \ | 135 with bakta_path.open("wb") as fh_out, requests.get( |
| 119 requests.get(self.db_url, stream=True) as resp: | 136 self.db_url, stream=True) as resp: |
| 120 total_length = resp.headers.get('content-length') | 137 total_length = resp.headers.get("content-length") |
| 121 if total_length is None: # no content length header | 138 if total_length is None: # no content length header |
| 122 for data in resp.iter_content(chunk_size=1024 * 1024): | 139 for data in resp.iter_content(chunk_size=1024 * 1024): |
| 123 fh_out.write(data) | 140 fh_out.write(data) |
| 124 else: | 141 else: |
| 125 for data in resp.iter_content(chunk_size=1024 * 1024): | 142 for data in resp.iter_content(chunk_size=1024 * 1024): |
| 126 fh_out.write(data) | 143 fh_out.write(data) |
| 127 print(f'Download bakta database {self.db_version}') | 144 print(f"Download bakta database {self.db_version}") |
| 128 self.tarball_path = bakta_path | 145 self.tarball_path = bakta_path |
| 129 except IOError: | 146 except IOError: |
| 130 print(f'ERROR: Could not download file from Zenodo!' | 147 print( |
| 131 f' url={self.db_url}, path={self.tarball_name}') | 148 f"ERROR: Could not download file from Zenodo!" |
| 149 f" url={self.db_url}, to={self.tarball_path}" | |
| 150 ) | |
| 132 | 151 |
| 133 def untar(self): | 152 def untar(self): |
| 134 db_path = Path(self.db_dir).as_posix() | 153 db_path = Path(self.db_dir).as_posix() |
| 135 try: | 154 try: |
| 136 with self.tarball_path.open('rb') as fh_in, \ | 155 with self.tarball_path.open("rb") as fh_in, tarfile.open( |
| 137 tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file: | 156 fileobj=fh_in, mode="r:gz" |
| 157 ) as tar_file: | |
| 138 tar_file.extractall(path=db_path) | 158 tar_file.extractall(path=db_path) |
| 139 print(f'Untar the database in {db_path}') | 159 print(f"Untar the database in {db_path}") |
| 140 return db_path | 160 return db_path |
| 141 except OSError: | 161 except OSError: |
| 142 sys.exit(f'ERROR: Could not extract {self.tarball_name} ' | 162 sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {self.db_name}") |
| 143 f'to {self.db_name}') | |
| 144 | 163 |
| 145 def calc_md5_sum(self, buffer_size=1048576): | 164 def calc_md5_sum(self, buffer_size=1048576): |
| 146 tarball_path = Path(self.db_dir).joinpath(self.tarball_name) | 165 tarball_path = Path(self.db_dir).joinpath(self.tar_name) |
| 147 self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"] | |
| 148 md5 = hashlib.md5() | 166 md5 = hashlib.md5() |
| 149 with tarball_path.open('rb') as fh: | 167 with tarball_path.open("rb") as fh: |
| 150 data = fh.read(buffer_size) | 168 data = fh.read(buffer_size) |
| 151 while data: | 169 while data: |
| 152 md5.update(data) | 170 md5.update(data) |
| 153 data = fh.read(buffer_size) | 171 data = fh.read(buffer_size) |
| 154 if md5.hexdigest() == self.md5: | 172 if md5.hexdigest() == self.md5: |
| 155 print('\t...md5 control database OK') | 173 print("\t...md5 control database OK") |
| 156 else: | 174 else: |
| 157 print(f"Error: corrupt database file! " | 175 print( |
| 158 f"calculated md5 = {md5.hexdigest()}" | 176 f"Error: corrupt database file! " |
| 159 f" different from {self.md5} ") | 177 f"calculated md5 = {md5.hexdigest()}" |
| 160 | 178 f" different from {self.md5} " |
| 161 | 179 ) |
| 162 """ | |
| 163 This is the method to download the amrfinderplus database need by bakta. | |
| 164 Deprecated to use the amrfinderplus data_manager | |
| 165 def update_amrfinderplus_db(self): | |
| 166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db" | |
| 167 if self.db_version == "test": | |
| 168 cmd = [ | |
| 169 'amrfinder_update', | |
| 170 '--database', str(amrfinderplus_db_path), | |
| 171 '--force_update', | |
| 172 '--help' | |
| 173 ] | |
| 174 else: | |
| 175 cmd = [ | |
| 176 'amrfinder_update', | |
| 177 '--database', str(amrfinderplus_db_path), | |
| 178 '--force_update' | |
| 179 ] | |
| 180 proc = sp.run( | |
| 181 cmd, | |
| 182 universal_newlines=True | |
| 183 ) | |
| 184 if proc.returncode != 0: | |
| 185 print(f"ERROR: AMRFinderPlus failed! " | |
| 186 f"command: 'amrfinder_update --force_update" | |
| 187 f" --database {amrfinderplus_db_path}'") | |
| 188 else: | |
| 189 print("AMRFinderPlus database download") | |
| 190 """ | |
| 191 | 180 |
| 192 | 181 |
| 193 def parse_arguments(): | 182 def parse_arguments(): |
| 194 # parse options and arguments | 183 # parse options and arguments |
| 195 arg_parser = argparse.ArgumentParser() | 184 arg_parser = argparse.ArgumentParser() |
| 196 arg_parser.add_argument("data_manager_json") | 185 arg_parser.add_argument("data_manager_json") |
| 197 arg_parser.add_argument("-d", "--database_version", | 186 arg_parser.add_argument( |
| 198 help='Select the database version ' | 187 "-d", |
| 199 '(major and minor eg. 4.0),' | 188 "--database_version", |
| 200 'default is the latest version', | 189 help="Select the database version " |
| 201 default="latest", | 190 "(major and minor eg. 4.0)," |
| 202 required=True) | 191 "default is the latest version", |
| 203 arg_parser.add_argument("-t", "--test", action='store_true', | 192 default="latest", |
| 204 help="option to test the script with an empty database") | 193 required=True, |
| 194 ) | |
| 195 arg_parser.add_argument( | |
| 196 "-t", | |
| 197 "--test", | |
| 198 action="store_true", | |
| 199 help="option to test the script with an empty database", | |
| 200 ) | |
| 205 return arg_parser.parse_args() | 201 return arg_parser.parse_args() |
| 206 | 202 |
| 207 | 203 |
| 208 def main(): | 204 def main(): |
| 209 all_args = parse_arguments() | 205 all_args = parse_arguments() |
| 210 with open(all_args.data_manager_json) as fh: | 206 with open(all_args.data_manager_json) as fh: |
| 211 params = json.load(fh) | 207 params = json.load(fh) |
| 212 target_dir = params['output_data'][0]['extra_files_path'] | 208 target_dir = params["output_data"][0]["extra_files_path"] |
| 213 os.makedirs(target_dir) | 209 os.makedirs(target_dir) |
| 214 # init the class to download bakta db | 210 # init the class to download bakta db |
| 215 bakta_upload = InstallBaktaDatabase(test_mode=all_args.test) | 211 bakta_upload = InstallBaktaDatabase( |
| 216 bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version) | 212 test_mode=all_args.test, db_version=all_args.database_version |
| 213 ) | |
| 214 bakta_db = bakta_upload.fetch_db_versions() | |
| 217 # update the path for galaxy | 215 # update the path for galaxy |
| 218 bakta_upload.db_dir = target_dir | 216 bakta_upload.db_dir = target_dir |
| 219 # download the database | 217 # download the database |
| 220 bakta_upload.download() | 218 bakta_upload.download() |
| 221 # check md5 sum | 219 # check md5 sum |
| 222 bakta_upload.calc_md5_sum() | 220 bakta_upload.calc_md5_sum() |
| 223 # untar db | 221 # untar db |
| 224 bakta_upload.untar() | 222 bakta_upload.untar() |
| 225 # make the data_manager metadata | 223 # make the data_manager metadata |
| 226 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) | 224 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) |
| 227 with open(all_args.data_manager_json, 'w') as fh: | 225 with open(all_args.data_manager_json, "w") as fh: |
| 228 json.dump(bakta_data_manager, fh, sort_keys=True) | 226 json.dump(bakta_data_manager, fh, sort_keys=True) |
| 229 | 227 |
| 230 | 228 |
| 231 if __name__ == '__main__': | 229 if __name__ == "__main__": |
| 232 main() | 230 main() |
