Mercurial > repos > iuc > amrfinderplus_data_manager_build
changeset 3:dfa1eb2941b0 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_amrfinderplus commit 45dbbf06a59df43da2c321c272de11cc41e17d43
| author | iuc |
|---|---|
| date | Sun, 23 Nov 2025 12:21:34 +0000 |
| parents | a5921c09b7b7 |
| children | |
| files | data_manager/data_manager_build_amrfinderplus.py data_manager/data_manager_build_amrfinderplus.xml data_manager/macro.xml |
| diffstat | 3 files changed, 177 insertions(+), 88 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_build_amrfinderplus.py Thu May 30 13:06:37 2024 +0000 +++ b/data_manager/data_manager_build_amrfinderplus.py Sun Nov 23 12:21:34 2025 +0000 @@ -14,11 +14,13 @@ Create the json file with database information for galaxy data manager """ - def __init__(self, - amrfinderplus_database="amrfinderplus_versioned_database", - db_name="amrfinderplus-db", - amrfinderplus_version="latest", - date_version=None): + def __init__( + self, + amrfinderplus_database="amrfinderplus_versioned_database", + db_name="amrfinderplus-db", + amrfinderplus_version="latest", + date_version=None, + ): self.data_table_name = amrfinderplus_database self._db_name = db_name self._amrfinderplus_version = amrfinderplus_version @@ -31,11 +33,7 @@ Skeleton of a data_table format return: a data table formatted for json output """ - self.data_table_entry = { - "data_tables": { - self.data_table_name: {} - } - } + self.data_table_entry = {"data_tables": {self.data_table_name: {}}} return self.data_table_entry def get_data_manager(self): @@ -44,14 +42,19 @@ return: The data table with database information """ self.amrfinderplus_table_list = self.get_data_table_format() - amrfinderplus_value = f"amrfinderplus_V{self._amrfinderplus_version}" \ - f"_{self._amrfinderplus_date_version}" - amrfinderplus_name = f"V{self._amrfinderplus_version}" \ - f"-{self._amrfinderplus_date_version}" - data_info = dict(value=amrfinderplus_value, - name=amrfinderplus_name, - db_version=self._amrfinderplus_version, - path=self._db_name) + amrfinderplus_value = ( + f"amrfinderplus_V{self._amrfinderplus_version}" + f"_{self._amrfinderplus_date_version}" + ) + amrfinderplus_name = ( + f"V{self._amrfinderplus_version}" f"-{self._amrfinderplus_date_version}" + ) + data_info = dict( + value=amrfinderplus_value, + name=amrfinderplus_name, + db_version=self._amrfinderplus_version, + path=self._db_name, + ) self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info] return self.amrfinderplus_table_list @@ -63,24 +66,28 @@ Build the data manager infos for galaxy """ - def __init__(self, - output_dir=Path.cwd(), - ncbi_url="ftp.ncbi.nlm.nih.gov", - ftp_login="anonymous", - ftp_password="anonymous", - amrfinderplus_database="amrfinderplus_database", - db_name="amrfinderplus-db", - amrfinderplus_version="latest", - json_file_path=None, - date_version=None, - amrfinderplus_db_path=None, - test_mode=False): + def __init__( + self, + output_dir=Path.cwd(), + ncbi_url="ftp.ncbi.nlm.nih.gov", + ftp_login="anonymous", + ftp_password="anonymous", + amrfinderplus_database="amrfinderplus_database", + db_name="amrfinderplus-db", + amrfinderplus_version="latest", + json_file_path=None, + date_version=None, + amrfinderplus_db_path=None, + test_mode=False, + ): super().__init__() self.json_file_path = json_file_path self._output_dir = output_dir self._ncbi_ftp_url = ncbi_url - self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" + self._ncbi_database_path = ( + "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" + ) self._login = ftp_login self._password = ftp_password self._amrfinderplus_database = amrfinderplus_database @@ -103,40 +110,61 @@ [cmd.append(i) for i in args] proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) if proc.returncode != 0: - print(f'Error type {proc.returncode} with : \n {proc}') + print(f"Error type {proc.returncode} with : \n {proc}") def download_amrfinderplus_db(self): """ Download the amrfinderplus database from the ncbi ftp server """ - self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}' + self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}" os.makedirs(self.amrfinderplus_db_path) - amrfinderplus_ftp_path = f"ftp://{self._login}:" \ - f"{self._password}@{self._ncbi_ftp_url}/" \ - f"{self._ncbi_database_path}/" \ - f"{self._amrfinderplus_version}/" \ - f"{self._amrfinderplus_date_version}" + if self._amrfinderplus_version == "latest": + self.get_amrfinderplus_version() + + amrfinderplus_ftp_path = ( + f"ftp://{self._login}:" + f"{self._password}@{self._ncbi_ftp_url}/" + f"{self._ncbi_database_path}/" + f"{self._amrfinderplus_version}/" + f"{self._amrfinderplus_date_version}" + ) + + if self._amrfinderplus_version == "3.12": + taxa_group_file = "taxgroup.tab" + test_dna_fasta = "AMR_DNA-Escherichia" + else: + taxa_group_file = "taxgroup.tsv" + test_dna_fasta = "AMR_DNA-Escherichia.fa" if self.test_mode is True: - file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"] + file_list = [ + test_dna_fasta, + "version.txt", + taxa_group_file, + "database_format_version.txt", + ] output_option = "-O" for file in file_list: - self.subprocess_cmd("wget", - "-nd", - "-np", - "-r", - f"{amrfinderplus_ftp_path}/{file}", - output_option, - f"{self.amrfinderplus_db_path}/{file}") + self.subprocess_cmd( + "wget", + "-nd", + "-np", + "-r", + f"{amrfinderplus_ftp_path}/{file}", + output_option, + f"{self.amrfinderplus_db_path}/{file}", + ) else: output_option = "-P" - self.subprocess_cmd("wget", - "-nd", - "-np", - "-r", - amrfinderplus_ftp_path, - output_option, - self.amrfinderplus_db_path) + self.subprocess_cmd( + "wget", + "-nd", + "-np", + "-r", + amrfinderplus_ftp_path, + output_option, + self.amrfinderplus_db_path, + ) def make_hmm_profile(self): """ @@ -153,27 +181,48 @@ Extract le list of species which have file in the database return: a filtered species list of available species in the database """ - taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab") + if self._amrfinderplus_version == "3.12": + taxa_group_file = "taxgroup.tab" + else: + taxa_group_file = "taxgroup.tsv" + taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}") if Path.exists(taxa_group_path): taxa_table = pd.read_table(taxa_group_path) - taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"] - taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1) + taxa_table.columns = [ + "taxgroup", + "gpipe_taxgroup", + "number_of_nucl_ref_genes", + ] + taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter( + items=["taxgroup"], axis=1 + ) if self.test_mode is True: taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup else: taxa_df = taxa_df.taxgroup self.species_list = list(taxa_df) else: - print("taxgroup.tab file is missing to list available species") + print(f"{taxa_group_file} file is missing to list available species") def make_blastdb(self): """ Index fasta file for blast """ self.extract_filelist_makeblast() - nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in self.species_list] - amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS' - amr_prot = f'{self.amrfinderplus_db_path}/AMRProt' + if self._amrfinderplus_version == "3.12": + nucl_file_db_list = [ + f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}" + for specie in self.species_list + ] + amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS" + amr_prot = f"{self.amrfinderplus_db_path}/AMRProt" + else: + nucl_file_db_list = [ + f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}.fa" + for specie in self.species_list + ] + amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS.fa" + amr_prot = f"{self.amrfinderplus_db_path}/AMRProt.fa" os.chdir(self.amrfinderplus_db_path) if Path(amr_dna).exists(): nucl_file_db_list.append(amr_dna) @@ -183,10 +232,16 @@ self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot") else: print("No file AMRProt detected for indexing") - [self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") for file in nucl_file_db_list] + [ + self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") + for file in nucl_file_db_list + ] - def get_amrfinderplus_version(self, version_file="version.txt", - database_version_file="database_format_version.txt"): + def get_amrfinderplus_version( + self, + version_file="version.txt", + database_version_file="database_format_version.txt", + ): """ Check the version when latest if provided and update the number param version_file: name of the file containing version information @@ -197,11 +252,14 @@ ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}") db_version = BytesIO() db_date_version = BytesIO() - ftp.retrbinary(f'RETR {version_file}', db_version.write) - ftp.retrbinary(f'RETR {database_version_file}', db_date_version.write) - self._amrfinderplus_date_version = db_version.getvalue().decode("utf-8").splitlines()[0] - self._amrfinderplus_version = '.'.join( - db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2]) + ftp.retrbinary(f"RETR {version_file}", db_version.write) + ftp.retrbinary(f"RETR {database_version_file}", db_date_version.write) + self._amrfinderplus_date_version = ( + db_version.getvalue().decode("utf-8").splitlines()[0] + ) + self._amrfinderplus_version = ".".join( + db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2] + ) def read_json_input_file(self): """ @@ -209,7 +267,7 @@ """ with open(self.json_file_path) as fh: params = json.load(fh) - target_dir = params['output_data'][0]['extra_files_path'] + target_dir = params["output_data"][0]["extra_files_path"] os.makedirs(target_dir) self._output_dir = target_dir @@ -217,7 +275,7 @@ """ Write in the imported json file """ - with open(self.json_file_path, 'w') as fh: + with open(self.json_file_path, "w") as fh: json.dump(self.get_data_manager(), fh, sort_keys=True) @@ -228,23 +286,32 @@ """ # parse options and arguments arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("data_manager_json", - help="json file from galaxy") - arg_parser.add_argument("--db_version", default="latest", - help="select the major version of the database (e.g. 3.10, 3.8), default is latest") - arg_parser.add_argument("--db_date", - help="select the date into the database version (e.g. 2022-10-11.2)") - arg_parser.add_argument("--test", action='store_true', - help="option to test the script with an lighted database") + arg_parser.add_argument("data_manager_json", help="json file from galaxy") + arg_parser.add_argument( + "--db_version", + default="latest", + help="select the major version of the database (e.g. 3.10, 3.8), default is latest", + ) + arg_parser.add_argument( + "--db_date", + help="select the date into the database version (e.g. 2022-10-11.2)", + ) + arg_parser.add_argument( + "--test", + action="store_true", + help="option to test the script with an lighted database", + ) return arg_parser.parse_args() def main(): all_args = parse_arguments() - amrfinderplus_download = DownloadAmrFinderPlusDatabase(amrfinderplus_version=all_args.db_version, - date_version=all_args.db_date, - json_file_path=all_args.data_manager_json, - test_mode=all_args.test) + amrfinderplus_download = DownloadAmrFinderPlusDatabase( + amrfinderplus_version=all_args.db_version, + date_version=all_args.db_date, + json_file_path=all_args.data_manager_json, + test_mode=all_args.test, + ) amrfinderplus_download.read_json_input_file() amrfinderplus_download.download_amrfinderplus_db() amrfinderplus_download.make_hmm_profile() @@ -252,5 +319,5 @@ amrfinderplus_download.write_json_infos() -if __name__ == '__main__': +if __name__ == "__main__": main()
--- a/data_manager/data_manager_build_amrfinderplus.xml Thu May 30 13:06:37 2024 +0000 +++ b/data_manager/data_manager_build_amrfinderplus.xml Sun Nov 23 12:21:34 2025 +0000 @@ -17,8 +17,14 @@ <inputs> <conditional name="database_list"> <param name="database_version_select" type="select" label="Database version"> + <option value="4.0">V4.0</option> <option value="3.12">V3.12</option> </param> + <when value="4.0"> + <param name="database_date_select" type="select" label="Date version"> + <option value="2025-07-16.1">2025-07-16.1</option> + </param> + </when> <when value="3.12"> <param name="database_date_select" type="select" label="Date version"> <option value="2024-05-02.2">2024-05-02.2</option> @@ -26,7 +32,7 @@ </param> </when> </conditional> - <param name="test_data_manager" type="hidden" value=""/> + <param name="test_data_manager" type="hidden" value=""/> </inputs> <outputs> <data name="output_file" format="data_manager_json"/> @@ -40,7 +46,6 @@ <has_n_lines n="1"/> <has_text text="{"data_tables""/> <has_text text="amrfinderplus_versioned_database"/> - <has_text text='"db_version": "3.12"'/> </assert_contents> </output> </test> @@ -61,6 +66,23 @@ </assert_contents> </output> </test> + <!-- Test_3 DB 4.0 2025-07-16.1 --> + <test expect_num_outputs="1"> + <param name="test_data_manager" value="--test"/> + <conditional name="database_list"> + <param name="database_version_select" value="4.0"/> + <param name="database_date_select" value="2025-07-16.1"/> + </conditional> + <output name="output_file"> + <assert_contents> + <has_n_lines n="1"/> + <has_text text="{"data_tables""/> + <has_text text="amrfinderplus_versioned_database"/> + <has_text text='"name": "V4.0-2025-07-16.1"'/> + <has_text text='"db_version": "4.0"'/> + </assert_contents> + </output> + </test> </tests> <help><