Mercurial > repos > iuc > amrfinderplus_data_manager_build
comparison data_manager/data_manager_build_amrfinderplus.py @ 3:dfa1eb2941b0 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_amrfinderplus commit 45dbbf06a59df43da2c321c272de11cc41e17d43
| author | iuc |
|---|---|
| date | Sun, 23 Nov 2025 12:21:34 +0000 |
| parents | a5921c09b7b7 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:a5921c09b7b7 | 3:dfa1eb2941b0 |
|---|---|
class GetAmrFinderPlusDataManager:
    """
    Build the JSON-serialisable data table description of an AMRFinderPlus
    database, as written back to Galaxy by the data manager.
    """

    def __init__(
        self,
        amrfinderplus_database="amrfinderplus_versioned_database",
        db_name="amrfinderplus-db",
        amrfinderplus_version="latest",
        date_version=None,
    ):
        # Key under "data_tables" that the generated entry is stored at.
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        # Populated by get_data_table_format().
        self.data_table_entry = None

    def get_data_table_format(self):
        """
        Reset ``data_table_entry`` to an empty skeleton and hand it back
        return: {"data_tables": {<data_table_name>: {}}}
        """
        skeleton = {"data_tables": {}}
        skeleton["data_tables"][self.data_table_name] = {}
        self.data_table_entry = skeleton
        return skeleton

    def get_data_manager(self):
        """
        Fill a fresh skeleton with the single entry describing this database
        return: the data table dictionary, also kept in
                ``amrfinderplus_table_list``
        """
        version = self._amrfinderplus_version
        release_date = self._amrfinderplus_date_version
        entry = {
            "value": f"amrfinderplus_V{version}_{release_date}",
            "name": f"V{version}-{release_date}",
            "db_version": version,
            "path": self._db_name,
        }
        self.amrfinderplus_table_list = self.get_data_table_format()
        # The table holds a list of rows; this manager always emits one row.
        self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [entry]
        return self.amrfinderplus_table_list
| 57 | 60 |
| 58 | 61 |
| 59 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): | 62 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): |
| 61 Download the amrfinderplus database from the ncbi. | 64 Download the amrfinderplus database from the ncbi. |
| 62 Make the database available with hmm and indexed files | 65 Make the database available with hmm and indexed files |
| 63 Build the data manager infos for galaxy | 66 Build the data manager infos for galaxy |
| 64 """ | 67 """ |
| 65 | 68 |
| 66 def __init__(self, | 69 def __init__( |
| 67 output_dir=Path.cwd(), | 70 self, |
| 68 ncbi_url="ftp.ncbi.nlm.nih.gov", | 71 output_dir=Path.cwd(), |
| 69 ftp_login="anonymous", | 72 ncbi_url="ftp.ncbi.nlm.nih.gov", |
| 70 ftp_password="anonymous", | 73 ftp_login="anonymous", |
| 71 amrfinderplus_database="amrfinderplus_database", | 74 ftp_password="anonymous", |
| 72 db_name="amrfinderplus-db", | 75 amrfinderplus_database="amrfinderplus_database", |
| 73 amrfinderplus_version="latest", | 76 db_name="amrfinderplus-db", |
| 74 json_file_path=None, | 77 amrfinderplus_version="latest", |
| 75 date_version=None, | 78 json_file_path=None, |
| 76 amrfinderplus_db_path=None, | 79 date_version=None, |
| 77 test_mode=False): | 80 amrfinderplus_db_path=None, |
| 81 test_mode=False, | |
| 82 ): | |
| 78 | 83 |
| 79 super().__init__() | 84 super().__init__() |
| 80 self.json_file_path = json_file_path | 85 self.json_file_path = json_file_path |
| 81 self._output_dir = output_dir | 86 self._output_dir = output_dir |
| 82 self._ncbi_ftp_url = ncbi_url | 87 self._ncbi_ftp_url = ncbi_url |
| 83 self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" | 88 self._ncbi_database_path = ( |
| 89 "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" | |
| 90 ) | |
| 84 self._login = ftp_login | 91 self._login = ftp_login |
| 85 self._password = ftp_password | 92 self._password = ftp_password |
| 86 self._amrfinderplus_database = amrfinderplus_database | 93 self._amrfinderplus_database = amrfinderplus_database |
| 87 self._db_name = db_name | 94 self._db_name = db_name |
| 88 self._amrfinderplus_version = amrfinderplus_version | 95 self._amrfinderplus_version = amrfinderplus_version |
| 101 """ | 108 """ |
| 102 cmd = [command] | 109 cmd = [command] |
| 103 [cmd.append(i) for i in args] | 110 [cmd.append(i) for i in args] |
| 104 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) | 111 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) |
| 105 if proc.returncode != 0: | 112 if proc.returncode != 0: |
| 106 print(f'Error type {proc.returncode} with : \n {proc}') | 113 print(f"Error type {proc.returncode} with : \n {proc}") |
| 107 | 114 |
def download_amrfinderplus_db(self):
    """
    Download the amrfinderplus database from the ncbi ftp server with wget.

    Creates ``<output_dir>/<db_name>`` and stores that path in
    ``self.amrfinderplus_db_path``. A requested version of "latest" is first
    resolved to a concrete version and release date through
    ``get_amrfinderplus_version()``. In test mode only four small files are
    fetched one by one; otherwise the whole release directory is mirrored.

    raises ValueError: an explicit version was requested without a release date
    raises OSError: from ``os.makedirs`` (e.g. the target directory exists)
    """
    # Without a date the URL would end in "/None"; wget would fail and a
    # bogus "V<version>-None" entry would later be written for Galaxy.
    if (
        self._amrfinderplus_version != "latest"
        and not self._amrfinderplus_date_version
    ):
        raise ValueError(
            "A database date is required when a specific database version "
            f"({self._amrfinderplus_version}) is requested"
        )

    self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}"
    os.makedirs(self.amrfinderplus_db_path)

    if self._amrfinderplus_version == "latest":
        self.get_amrfinderplus_version()

    amrfinderplus_ftp_path = (
        f"ftp://{self._login}:"
        f"{self._password}@{self._ncbi_ftp_url}/"
        f"{self._ncbi_database_path}/"
        f"{self._amrfinderplus_version}/"
        f"{self._amrfinderplus_date_version}"
    )

    # Releases with a major version below 4 use extension-less file names;
    # a non-numeric version falls back to the current (suffixed) names.
    # NOTE(review): assumes every 3.x release is laid out like 3.12 -
    # confirm against the NCBI FTP listing.
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_layout = major.isdigit() and int(major) < 4
    if legacy_layout:
        taxa_group_file = "taxgroup.tab"
        test_dna_fasta = "AMR_DNA-Escherichia"
    else:
        taxa_group_file = "taxgroup.tsv"
        test_dna_fasta = "AMR_DNA-Escherichia.fa"

    wget_base = ("wget", "-nd", "-np", "-r")
    if self.test_mode:
        # Lightweight download: one wget call per file, written with -O.
        file_list = (
            test_dna_fasta,
            "version.txt",
            taxa_group_file,
            "database_format_version.txt",
        )
        for file_name in file_list:
            self.subprocess_cmd(
                *wget_base,
                f"{amrfinderplus_ftp_path}/{file_name}",
                "-O",
                f"{self.amrfinderplus_db_path}/{file_name}",
            )
    else:
        # Full download: mirror the release directory into the target (-P).
        self.subprocess_cmd(
            *wget_base,
            amrfinderplus_ftp_path,
            "-P",
            self.amrfinderplus_db_path,
        )
| 140 | 168 |
| 141 def make_hmm_profile(self): | 169 def make_hmm_profile(self): |
| 142 """ | 170 """ |
| 143 Make the hmm profile using the AMR.LIB file previously download | 171 Make the hmm profile using the AMR.LIB file previously download |
| 144 """ | 172 """ |
def extract_filelist_makeblast(self):
    """
    Extract the list of species which have nucleotide files in the database.

    Reads the taxgroup table from ``self.amrfinderplus_db_path`` and stores
    in ``self.species_list`` every taxgroup whose
    ``number_of_nucl_ref_genes`` is greater than 0. In test mode the list is
    restricted to "Escherichia". When the table is missing a message is
    printed and ``self.species_list`` is set to an empty list.

    return: None, the result is stored in ``self.species_list``
    """
    # Releases with a major version below 4 use "taxgroup.tab"; a
    # non-numeric version falls back to the current ".tsv" name.
    # NOTE(review): assumes every 3.x release is laid out like 3.12 -
    # confirm against the NCBI FTP listing.
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_layout = major.isdigit() and int(major) < 4
    taxa_group_file = "taxgroup.tab" if legacy_layout else "taxgroup.tsv"

    taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}")
    if not taxa_group_path.exists():
        print(f"{taxa_group_file} file is missing to list available species")
        # Leave an iterable behind so make_blastdb() can still index the
        # species-independent files.
        self.species_list = []
        return

    taxa_table = pd.read_table(taxa_group_path)
    # The header row of the file is replaced by fixed column names; this
    # requires the table to have exactly three columns.
    taxa_table.columns = [
        "taxgroup",
        "gpipe_taxgroup",
        "number_of_nucl_ref_genes",
    ]
    species = taxa_table.loc[taxa_table["number_of_nucl_ref_genes"] > 0, "taxgroup"]
    if self.test_mode:
        species = species[species == "Escherichia"]
    self.species_list = list(species)
| 168 | 206 |
def make_blastdb(self):
    """
    Index the downloaded fasta files for blast with makeblastdb.

    Refreshes ``self.species_list`` through ``extract_filelist_makeblast()``,
    changes the working directory to the database directory, then indexes
    the protein file (AMRProt) as "prot" and every per-species AMR_DNA file
    plus AMR_CDS as "nucl". A missing AMR_CDS or AMRProt file is reported
    with a printed message and skipped.
    """
    self.extract_filelist_makeblast()

    # Releases with a major version below 4 ship fasta files without an
    # extension; a non-numeric version falls back to the ".fa" names.
    # NOTE(review): assumes every 3.x release is laid out like 3.12 -
    # confirm against the NCBI FTP listing.
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_layout = major.isdigit() and int(major) < 4
    suffix = "" if legacy_layout else ".fa"

    db_dir = self.amrfinderplus_db_path
    # extract_filelist_makeblast() may leave species_list unset (presumably
    # None) when the taxgroup table is missing; treat that as "no species".
    nucl_file_db_list = [
        f"{db_dir}/AMR_DNA-{specie}{suffix}" for specie in self.species_list or []
    ]
    amr_dna = f"{db_dir}/AMR_CDS{suffix}"
    amr_prot = f"{db_dir}/AMRProt{suffix}"

    # NOTE(review): the process-wide chdir is kept from the original;
    # presumably makeblastdb is expected to run inside the database folder.
    os.chdir(db_dir)

    if Path(amr_dna).exists():
        nucl_file_db_list.append(amr_dna)
    else:
        print("No file AMR_CDS detected for indexing")

    if Path(amr_prot).exists():
        self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
    else:
        print("No file AMRProt detected for indexing")

    for fasta in nucl_file_db_list:
        self.subprocess_cmd("makeblastdb", "-in", fasta, "-dbtype", "nucl")
| 239 | |
def get_amrfinderplus_version(
    self,
    version_file="version.txt",
    database_version_file="database_format_version.txt",
):
    """
    Resolve the requested release directory to a concrete version and date.

    Connects to the ncbi ftp server, enters
    ``<database path>/<current version>`` and downloads two small files:
    the first line of ``version_file`` becomes
    ``self._amrfinderplus_date_version`` and the first two dot-separated
    fields of the first line of ``database_version_file`` become
    ``self._amrfinderplus_version``.

    param version_file: name of the file containing the release date
    param database_version_file: name of the file containing the database
                                 format version
    raises ftplib.all_errors members: on connection, login or transfer failure
    raises IndexError: when one of the downloaded files is empty
    """
    release_date_raw = BytesIO()
    format_version_raw = BytesIO()
    # The context manager closes the connection even when a command fails;
    # the original left the control connection open.
    with FTP(self._ncbi_ftp_url) as ftp:
        ftp.login(self._login, self._password)
        ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
        ftp.retrbinary(f"RETR {version_file}", release_date_raw.write)
        ftp.retrbinary(f"RETR {database_version_file}", format_version_raw.write)

    release_date = release_date_raw.getvalue().decode("utf-8").splitlines()[0]
    format_version = format_version_raw.getvalue().decode("utf-8").splitlines()[0]
    self._amrfinderplus_date_version = release_date
    # Keep only "<major>.<minor>" of the format version.
    self._amrfinderplus_version = ".".join(format_version.split(".")[:2])
| 205 | 263 |
def read_json_input_file(self):
    """
    Load the json file handed over by galaxy, create the directory named by
    its first output's "extra_files_path" and use it as output directory
    """
    galaxy_params = json.loads(Path(self.json_file_path).read_text())
    first_output = galaxy_params["output_data"][0]
    extra_files_dir = first_output["extra_files_path"]
    os.makedirs(extra_files_dir)
    self._output_dir = extra_files_dir
| 215 | 273 |
def write_json_infos(self):
    """
    Overwrite the json file received from galaxy with the data table
    description produced by get_data_manager(), keys sorted
    """
    # The file is opened (and truncated) before the payload is built,
    # exactly as a direct json.dump into the handle would do.
    with open(self.json_file_path, "w") as handle:
        payload = json.dumps(self.get_data_manager(), sort_keys=True)
        handle.write(payload)
| 222 | 280 |
| 223 | 281 |
def parse_arguments():
    """
    Declare the command line interface and parse ``sys.argv``
    return: the argparse namespace with data_manager_json, db_version,
            db_date and test
    """
    # One (flags, keyword settings) pair per argument, registered in order.
    argument_specs = (
        (("data_manager_json",), {"help": "json file from galaxy"}),
        (
            ("--db_version",),
            {
                "default": "latest",
                "help": "select the major version of the database (e.g. 3.10, 3.8), default is latest",
            },
        ),
        (
            ("--db_date",),
            {"help": "select the date into the database version (e.g. 2022-10-11.2)"},
        ),
        (
            ("--test",),
            {
                "action": "store_true",
                "help": "option to test the script with an lighted database",
            },
        ),
    )
    cli = argparse.ArgumentParser()
    for flags, settings in argument_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
| 240 | 305 |
| 241 | 306 |
def main():
    """
    Script entry point: parse the command line, then read the galaxy json,
    download the database, build the hmm profile, index the fasta files and
    write the data table description back, in that order
    """
    cli_args = parse_arguments()
    downloader = DownloadAmrFinderPlusDatabase(
        json_file_path=cli_args.data_manager_json,
        amrfinderplus_version=cli_args.db_version,
        date_version=cli_args.db_date,
        test_mode=cli_args.test,
    )
    # Each step relies on state set by the previous one, so order matters.
    pipeline = (
        downloader.read_json_input_file,
        downloader.download_amrfinderplus_db,
        downloader.make_hmm_profile,
        downloader.make_blastdb,
        downloader.write_json_infos,
    )
    for step in pipeline:
        step()
| 253 | 320 |
| 254 | 321 |
# Run the data manager only when the file is executed as a script.
if __name__ == "__main__":
    main()
