comparison data_manager/data_manager_build_amrfinderplus.py @ 3:dfa1eb2941b0 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_amrfinderplus commit 45dbbf06a59df43da2c321c272de11cc41e17d43
author iuc
date Sun, 23 Nov 2025 12:21:34 +0000
parents a5921c09b7b7
children
comparison
equal deleted inserted replaced
2:a5921c09b7b7 3:dfa1eb2941b0
12 class GetAmrFinderPlusDataManager: 12 class GetAmrFinderPlusDataManager:
13 """ 13 """
14 Create the json file with database information for galaxy data manager 14 Create the json file with database information for galaxy data manager
15 """ 15 """
16 16
17 def __init__(self, 17 def __init__(
18 amrfinderplus_database="amrfinderplus_versioned_database", 18 self,
19 db_name="amrfinderplus-db", 19 amrfinderplus_database="amrfinderplus_versioned_database",
20 amrfinderplus_version="latest", 20 db_name="amrfinderplus-db",
21 date_version=None): 21 amrfinderplus_version="latest",
22 date_version=None,
23 ):
22 self.data_table_name = amrfinderplus_database 24 self.data_table_name = amrfinderplus_database
23 self._db_name = db_name 25 self._db_name = db_name
24 self._amrfinderplus_version = amrfinderplus_version 26 self._amrfinderplus_version = amrfinderplus_version
25 self._amrfinderplus_date_version = date_version 27 self._amrfinderplus_date_version = date_version
26 self.data_table_entry = None 28 self.data_table_entry = None
29 def get_data_table_format(self): 31 def get_data_table_format(self):
30 """ 32 """
31 Skeleton of a data_table format 33 Skeleton of a data_table format
32 return: a data table formatted for json output 34 return: a data table formatted for json output
33 """ 35 """
34 self.data_table_entry = { 36 self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
35 "data_tables": {
36 self.data_table_name: {}
37 }
38 }
39 return self.data_table_entry 37 return self.data_table_entry
40 38
41 def get_data_manager(self): 39 def get_data_manager(self):
42 """ 40 """
43 Create the empty data table format and add all the information into 41 Create the empty data table format and add all the information into
44 return: The data table with database information 42 return: The data table with database information
45 """ 43 """
46 self.amrfinderplus_table_list = self.get_data_table_format() 44 self.amrfinderplus_table_list = self.get_data_table_format()
47 amrfinderplus_value = f"amrfinderplus_V{self._amrfinderplus_version}" \ 45 amrfinderplus_value = (
48 f"_{self._amrfinderplus_date_version}" 46 f"amrfinderplus_V{self._amrfinderplus_version}"
49 amrfinderplus_name = f"V{self._amrfinderplus_version}" \ 47 f"_{self._amrfinderplus_date_version}"
50 f"-{self._amrfinderplus_date_version}" 48 )
51 data_info = dict(value=amrfinderplus_value, 49 amrfinderplus_name = (
52 name=amrfinderplus_name, 50 f"V{self._amrfinderplus_version}" f"-{self._amrfinderplus_date_version}"
53 db_version=self._amrfinderplus_version, 51 )
54 path=self._db_name) 52 data_info = dict(
53 value=amrfinderplus_value,
54 name=amrfinderplus_name,
55 db_version=self._amrfinderplus_version,
56 path=self._db_name,
57 )
55 self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info] 58 self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
56 return self.amrfinderplus_table_list 59 return self.amrfinderplus_table_list
57 60
58 61
59 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): 62 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
61 Download the amrfinderplus database from the ncbi. 64 Download the amrfinderplus database from the ncbi.
62 Make the database available with hmm and indexed files 65 Make the database available with hmm and indexed files
63 Build the data manager infos for galaxy 66 Build the data manager infos for galaxy
64 """ 67 """
65 68
66 def __init__(self, 69 def __init__(
67 output_dir=Path.cwd(), 70 self,
68 ncbi_url="ftp.ncbi.nlm.nih.gov", 71 output_dir=Path.cwd(),
69 ftp_login="anonymous", 72 ncbi_url="ftp.ncbi.nlm.nih.gov",
70 ftp_password="anonymous", 73 ftp_login="anonymous",
71 amrfinderplus_database="amrfinderplus_database", 74 ftp_password="anonymous",
72 db_name="amrfinderplus-db", 75 amrfinderplus_database="amrfinderplus_database",
73 amrfinderplus_version="latest", 76 db_name="amrfinderplus-db",
74 json_file_path=None, 77 amrfinderplus_version="latest",
75 date_version=None, 78 json_file_path=None,
76 amrfinderplus_db_path=None, 79 date_version=None,
77 test_mode=False): 80 amrfinderplus_db_path=None,
81 test_mode=False,
82 ):
78 83
79 super().__init__() 84 super().__init__()
80 self.json_file_path = json_file_path 85 self.json_file_path = json_file_path
81 self._output_dir = output_dir 86 self._output_dir = output_dir
82 self._ncbi_ftp_url = ncbi_url 87 self._ncbi_ftp_url = ncbi_url
83 self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" 88 self._ncbi_database_path = (
89 "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
90 )
84 self._login = ftp_login 91 self._login = ftp_login
85 self._password = ftp_password 92 self._password = ftp_password
86 self._amrfinderplus_database = amrfinderplus_database 93 self._amrfinderplus_database = amrfinderplus_database
87 self._db_name = db_name 94 self._db_name = db_name
88 self._amrfinderplus_version = amrfinderplus_version 95 self._amrfinderplus_version = amrfinderplus_version
101 """ 108 """
102 cmd = [command] 109 cmd = [command]
103 [cmd.append(i) for i in args] 110 [cmd.append(i) for i in args]
104 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) 111 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
105 if proc.returncode != 0: 112 if proc.returncode != 0:
106 print(f'Error type {proc.returncode} with : \n {proc}') 113 print(f"Error type {proc.returncode} with : \n {proc}")
107 114
108 def download_amrfinderplus_db(self): 115 def download_amrfinderplus_db(self):
109 """ 116 """
110 Download the amrfinderplus database from the ncbi ftp server 117 Download the amrfinderplus database from the ncbi ftp server
111 """ 118 """
112 self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}' 119 self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}"
113 os.makedirs(self.amrfinderplus_db_path) 120 os.makedirs(self.amrfinderplus_db_path)
114 121
115 amrfinderplus_ftp_path = f"ftp://{self._login}:" \ 122 if self._amrfinderplus_version == "latest":
116 f"{self._password}@{self._ncbi_ftp_url}/" \ 123 self.get_amrfinderplus_version()
117 f"{self._ncbi_database_path}/" \ 124
118 f"{self._amrfinderplus_version}/" \ 125 amrfinderplus_ftp_path = (
119 f"{self._amrfinderplus_date_version}" 126 f"ftp://{self._login}:"
127 f"{self._password}@{self._ncbi_ftp_url}/"
128 f"{self._ncbi_database_path}/"
129 f"{self._amrfinderplus_version}/"
130 f"{self._amrfinderplus_date_version}"
131 )
132
133 if self._amrfinderplus_version == "3.12":
134 taxa_group_file = "taxgroup.tab"
135 test_dna_fasta = "AMR_DNA-Escherichia"
136 else:
137 taxa_group_file = "taxgroup.tsv"
138 test_dna_fasta = "AMR_DNA-Escherichia.fa"
120 if self.test_mode is True: 139 if self.test_mode is True:
121 file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"] 140 file_list = [
141 test_dna_fasta,
142 "version.txt",
143 taxa_group_file,
144 "database_format_version.txt",
145 ]
122 output_option = "-O" 146 output_option = "-O"
123 for file in file_list: 147 for file in file_list:
124 self.subprocess_cmd("wget", 148 self.subprocess_cmd(
125 "-nd", 149 "wget",
126 "-np", 150 "-nd",
127 "-r", 151 "-np",
128 f"{amrfinderplus_ftp_path}/{file}", 152 "-r",
129 output_option, 153 f"{amrfinderplus_ftp_path}/{file}",
130 f"{self.amrfinderplus_db_path}/{file}") 154 output_option,
155 f"{self.amrfinderplus_db_path}/{file}",
156 )
131 else: 157 else:
132 output_option = "-P" 158 output_option = "-P"
133 self.subprocess_cmd("wget", 159 self.subprocess_cmd(
134 "-nd", 160 "wget",
135 "-np", 161 "-nd",
136 "-r", 162 "-np",
137 amrfinderplus_ftp_path, 163 "-r",
138 output_option, 164 amrfinderplus_ftp_path,
139 self.amrfinderplus_db_path) 165 output_option,
166 self.amrfinderplus_db_path,
167 )
140 168
141 def make_hmm_profile(self): 169 def make_hmm_profile(self):
142 """ 170 """
143 Make the hmm profile using the AMR.LIB file previously download 171 Make the hmm profile using the AMR.LIB file previously download
144 """ 172 """
151 def extract_filelist_makeblast(self): 179 def extract_filelist_makeblast(self):
152 """ 180 """
153 Extract le list of species which have file in the database 181 Extract le list of species which have file in the database
154 return: a filtered species list of available species in the database 182 return: a filtered species list of available species in the database
155 """ 183 """
156 taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab") 184 if self._amrfinderplus_version == "3.12":
185 taxa_group_file = "taxgroup.tab"
186 else:
187 taxa_group_file = "taxgroup.tsv"
188 taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}")
157 if Path.exists(taxa_group_path): 189 if Path.exists(taxa_group_path):
158 taxa_table = pd.read_table(taxa_group_path) 190 taxa_table = pd.read_table(taxa_group_path)
159 taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"] 191 taxa_table.columns = [
160 taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1) 192 "taxgroup",
193 "gpipe_taxgroup",
194 "number_of_nucl_ref_genes",
195 ]
196 taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(
197 items=["taxgroup"], axis=1
198 )
161 if self.test_mode is True: 199 if self.test_mode is True:
162 taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup 200 taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup
163 else: 201 else:
164 taxa_df = taxa_df.taxgroup 202 taxa_df = taxa_df.taxgroup
165 self.species_list = list(taxa_df) 203 self.species_list = list(taxa_df)
166 else: 204 else:
167 print("taxgroup.tab file is missing to list available species") 205 print(f"{taxa_group_file} file is missing to list available species")
168 206
169 def make_blastdb(self): 207 def make_blastdb(self):
170 """ 208 """
171 Index fasta file for blast 209 Index fasta file for blast
172 """ 210 """
173 self.extract_filelist_makeblast() 211 self.extract_filelist_makeblast()
174 nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in self.species_list] 212 if self._amrfinderplus_version == "3.12":
175 amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS' 213 nucl_file_db_list = [
176 amr_prot = f'{self.amrfinderplus_db_path}/AMRProt' 214 f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}"
215 for specie in self.species_list
216 ]
217 amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS"
218 amr_prot = f"{self.amrfinderplus_db_path}/AMRProt"
219 else:
220 nucl_file_db_list = [
221 f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}.fa"
222 for specie in self.species_list
223 ]
224 amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS.fa"
225 amr_prot = f"{self.amrfinderplus_db_path}/AMRProt.fa"
177 os.chdir(self.amrfinderplus_db_path) 226 os.chdir(self.amrfinderplus_db_path)
178 if Path(amr_dna).exists(): 227 if Path(amr_dna).exists():
179 nucl_file_db_list.append(amr_dna) 228 nucl_file_db_list.append(amr_dna)
180 else: 229 else:
181 print("No file AMR_CDS detected for indexing") 230 print("No file AMR_CDS detected for indexing")
182 if Path(amr_prot).exists(): 231 if Path(amr_prot).exists():
183 self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot") 232 self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
184 else: 233 else:
185 print("No file AMRProt detected for indexing") 234 print("No file AMRProt detected for indexing")
186 [self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") for file in nucl_file_db_list] 235 [
187 236 self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl")
188 def get_amrfinderplus_version(self, version_file="version.txt", 237 for file in nucl_file_db_list
189 database_version_file="database_format_version.txt"): 238 ]
239
240 def get_amrfinderplus_version(
241 self,
242 version_file="version.txt",
243 database_version_file="database_format_version.txt",
244 ):
190 """ 245 """
191 Check the version when latest if provided and update the number 246 Check the version when latest if provided and update the number
192 param version_file: name of the file containing version information 247 param version_file: name of the file containing version information
193 param database_version_file: name of the file containing date version information 248 param database_version_file: name of the file containing date version information
194 """ 249 """
195 ftp = FTP(self._ncbi_ftp_url) 250 ftp = FTP(self._ncbi_ftp_url)
196 ftp.login(self._login, self._password) 251 ftp.login(self._login, self._password)
197 ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}") 252 ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
198 db_version = BytesIO() 253 db_version = BytesIO()
199 db_date_version = BytesIO() 254 db_date_version = BytesIO()
200 ftp.retrbinary(f'RETR {version_file}', db_version.write) 255 ftp.retrbinary(f"RETR {version_file}", db_version.write)
201 ftp.retrbinary(f'RETR {database_version_file}', db_date_version.write) 256 ftp.retrbinary(f"RETR {database_version_file}", db_date_version.write)
202 self._amrfinderplus_date_version = db_version.getvalue().decode("utf-8").splitlines()[0] 257 self._amrfinderplus_date_version = (
203 self._amrfinderplus_version = '.'.join( 258 db_version.getvalue().decode("utf-8").splitlines()[0]
204 db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2]) 259 )
260 self._amrfinderplus_version = ".".join(
261 db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2]
262 )
205 263
206 def read_json_input_file(self): 264 def read_json_input_file(self):
207 """ 265 """
208 Import the json file 266 Import the json file
209 """ 267 """
210 with open(self.json_file_path) as fh: 268 with open(self.json_file_path) as fh:
211 params = json.load(fh) 269 params = json.load(fh)
212 target_dir = params['output_data'][0]['extra_files_path'] 270 target_dir = params["output_data"][0]["extra_files_path"]
213 os.makedirs(target_dir) 271 os.makedirs(target_dir)
214 self._output_dir = target_dir 272 self._output_dir = target_dir
215 273
216 def write_json_infos(self): 274 def write_json_infos(self):
217 """ 275 """
218 Write in the imported json file 276 Write in the imported json file
219 """ 277 """
220 with open(self.json_file_path, 'w') as fh: 278 with open(self.json_file_path, "w") as fh:
221 json.dump(self.get_data_manager(), fh, sort_keys=True) 279 json.dump(self.get_data_manager(), fh, sort_keys=True)
222 280
223 281
224 def parse_arguments(): 282 def parse_arguments():
225 """ 283 """
226 List of arguments provided by the user 284 List of arguments provided by the user
227 return: parsed arguments 285 return: parsed arguments
228 """ 286 """
229 # parse options and arguments 287 # parse options and arguments
230 arg_parser = argparse.ArgumentParser() 288 arg_parser = argparse.ArgumentParser()
231 arg_parser.add_argument("data_manager_json", 289 arg_parser.add_argument("data_manager_json", help="json file from galaxy")
232 help="json file from galaxy") 290 arg_parser.add_argument(
233 arg_parser.add_argument("--db_version", default="latest", 291 "--db_version",
234 help="select the major version of the database (e.g. 3.10, 3.8), default is latest") 292 default="latest",
235 arg_parser.add_argument("--db_date", 293 help="select the major version of the database (e.g. 3.10, 3.8), default is latest",
236 help="select the date into the database version (e.g. 2022-10-11.2)") 294 )
237 arg_parser.add_argument("--test", action='store_true', 295 arg_parser.add_argument(
238 help="option to test the script with an lighted database") 296 "--db_date",
297 help="select the date into the database version (e.g. 2022-10-11.2)",
298 )
299 arg_parser.add_argument(
300 "--test",
301 action="store_true",
302 help="option to test the script with an lighted database",
303 )
239 return arg_parser.parse_args() 304 return arg_parser.parse_args()
240 305
241 306
242 def main(): 307 def main():
243 all_args = parse_arguments() 308 all_args = parse_arguments()
244 amrfinderplus_download = DownloadAmrFinderPlusDatabase(amrfinderplus_version=all_args.db_version, 309 amrfinderplus_download = DownloadAmrFinderPlusDatabase(
245 date_version=all_args.db_date, 310 amrfinderplus_version=all_args.db_version,
246 json_file_path=all_args.data_manager_json, 311 date_version=all_args.db_date,
247 test_mode=all_args.test) 312 json_file_path=all_args.data_manager_json,
313 test_mode=all_args.test,
314 )
248 amrfinderplus_download.read_json_input_file() 315 amrfinderplus_download.read_json_input_file()
249 amrfinderplus_download.download_amrfinderplus_db() 316 amrfinderplus_download.download_amrfinderplus_db()
250 amrfinderplus_download.make_hmm_profile() 317 amrfinderplus_download.make_hmm_profile()
251 amrfinderplus_download.make_blastdb() 318 amrfinderplus_download.make_blastdb()
252 amrfinderplus_download.write_json_infos() 319 amrfinderplus_download.write_json_infos()
253 320
254 321
255 if __name__ == '__main__': 322 if __name__ == "__main__":
256 main() 323 main()