comparison data_manager/data_manager_build_amrfinderplus.py @ 0:eea0c38a9afd draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_amrfinderplus commit 59077a173599fc9d355a5d36ad7875050dbe3e19
author iuc
date Thu, 05 Jan 2023 14:28:07 +0000
parents
children a5921c09b7b7
comparison
equal deleted inserted replaced
-1:000000000000 0:eea0c38a9afd
1 import argparse
2 import json
3 import os
4 import subprocess as sp
5 from ftplib import FTP
6 from io import BytesIO
7 from pathlib import Path
8
9 import pandas as pd
10
11
class GetAmrFinderPlusDataManager:
    """
    Assemble the dictionary (later dumped as json) that describes one
    amrfinderplus database entry for a galaxy data manager
    """

    def __init__(self,
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 date_version=None):
        # Nothing has been built yet; both are filled by the getters below
        self.data_table_entry = None
        self.amrfinderplus_table_list = None
        # Name of the data table the entry is registered under
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version

    def get_data_table_format(self):
        """
        Build an empty data table skeleton and remember it on the instance
        return: {"data_tables": {<data table name>: {}}}
        """
        skeleton = {"data_tables": {}}
        skeleton["data_tables"][self.data_table_name] = {}
        self.data_table_entry = skeleton
        return skeleton

    def get_data_manager(self):
        """
        Build a fresh skeleton and fill it with a single database entry
        return: The data table holding the value, name and path of the database
        """
        version = self._amrfinderplus_version
        date = self._amrfinderplus_date_version
        # The skeleton is shared on purpose: data_table_entry and
        # amrfinderplus_table_list end up pointing at the same dictionary
        table = self.get_data_table_format()
        self.amrfinderplus_table_list = table
        row = {
            "value": f"amrfinderplus_V{version}_{date}",
            "name": f"V{version}-{date}",
            "path": self._db_name,
        }
        table["data_tables"][self.data_table_name] = [row]
        return table
56
57
class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
    """
    Download the amrfinderplus database from the ncbi.
    Make the database available with hmm and indexed files
    Build the data manager infos for galaxy
    """

    def __init__(self,
                 output_dir=Path.cwd(),
                 ncbi_url="ftp.ncbi.nlm.nih.gov",
                 ftp_login="anonymous",
                 ftp_password="anonymous",
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 json_file_path=None,
                 date_version=None,
                 amrfinderplus_db_path=None,
                 test_mode=False):
        """
        :param output_dir: directory under which the database folder is created
            (the default is evaluated once, when the class is defined)
        :param ncbi_url: host name of the ncbi ftp server
        :param ftp_login: login used for the ftp connection
        :param ftp_password: password used for the ftp connection
        :param amrfinderplus_database: name of the galaxy data table
        :param db_name: name of the folder holding the database
        :param amrfinderplus_version: database version, or "latest"
        :param json_file_path: json file exchanged with galaxy
        :param date_version: date sub-version of the database
        :param amrfinderplus_db_path: full path of the database folder
            (overwritten by download_amrfinderplus_db)
        :param test_mode: when True only a reduced set of files is handled
        """
        # Forward the user supplied values so that the parent attributes
        # (notably data_table_name) do not silently keep their defaults
        super().__init__(amrfinderplus_database=amrfinderplus_database,
                         db_name=db_name,
                         amrfinderplus_version=amrfinderplus_version,
                         date_version=date_version)
        self.json_file_path = json_file_path
        self._output_dir = output_dir
        self._ncbi_ftp_url = ncbi_url
        self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
        self._login = ftp_login
        self._password = ftp_password
        self._amrfinderplus_database = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        self.species_list = None
        self.test_mode = test_mode
        self.amrfinderplus_db_path = amrfinderplus_db_path

    @staticmethod
    def subprocess_cmd(command, *args):
        """
        Method to call external tools with any parameters
        :param command: command name from the tool used (e.g. wget or makeblastdb)
        :param args: free number of argument need for the command tool (e.g. -r, -P ...)
        :return: None; a non-zero exit status is reported on stdout, not raised
        """
        cmd = [command, *args]
        proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        if proc.returncode != 0:
            print(f'Error type {proc.returncode} with : \n {proc}')

    def download_amrfinderplus_db(self):
        """
        Download the amrfinderplus database from the ncbi ftp server
        into <output_dir>/<db_name> (the folder must not exist yet)
        """
        self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
        os.makedirs(self.amrfinderplus_db_path)
        if self._amrfinderplus_version == 'latest':
            # Replace "latest" by the real version numbers found on the server
            self.get_amrfinderplus_version()

        amrfinderplus_ftp_path = f"ftp://{self._login}:" \
                                 f"{self._password}@{self._ncbi_ftp_url}/" \
                                 f"{self._ncbi_database_path}/" \
                                 f"{self._amrfinderplus_version}/" \
                                 f"{self._amrfinderplus_date_version}"
        if self.test_mode is True:
            # Test mode: fetch a handful of files one by one, each written
            # to an explicit output file (-O)
            file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
            output_option = "-O"
            for file in file_list:
                self.subprocess_cmd("wget",
                                    "-nd",
                                    "-np",
                                    "-r",
                                    f"{amrfinderplus_ftp_path}/{file}",
                                    output_option,
                                    f"{self.amrfinderplus_db_path}/{file}")
        else:
            # Full mode: recursive download into the database folder (-P)
            output_option = "-P"
            self.subprocess_cmd("wget",
                                "-nd",
                                "-np",
                                "-r",
                                amrfinderplus_ftp_path,
                                output_option,
                                self.amrfinderplus_db_path)

    def make_hmm_profile(self):
        """
        Make the hmm profile using the AMR.LIB file previously download
        (nothing is pressed in test mode)
        """
        hmm_file = Path(f"{self.amrfinderplus_db_path}/AMR.LIB")
        if hmm_file.exists() and self.test_mode is False:
            self.subprocess_cmd("hmmpress", "-f", hmm_file)
        else:
            print("hmm_file file is missing to make hmm profiles")

    def extract_filelist_makeblast(self):
        """
        Extract the list of species which have file in the database
        The result is stored in self.species_list; it is left untouched
        when taxgroup.tab is not available
        """
        taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
        if taxa_group_path.exists():
            taxa_table = pd.read_table(taxa_group_path)
            taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
            # Keep only the groups having at least one nucleotide reference gene
            taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1)
            if self.test_mode is True:
                taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup
            else:
                taxa_df = taxa_df.taxgroup
            self.species_list = list(taxa_df)
        else:
            print("taxgroup.tab file is missing to list available species")

    def make_blastdb(self):
        """
        Index fasta file for blast
        """
        self.extract_filelist_makeblast()
        # species_list is still None when taxgroup.tab could not be read:
        # fall back to an empty list instead of failing on iteration
        species = self.species_list or []
        nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in species]
        amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS'
        amr_prot = f'{self.amrfinderplus_db_path}/AMRProt'
        # The indexing commands are launched from inside the database folder
        os.chdir(self.amrfinderplus_db_path)
        if Path(amr_dna).exists():
            nucl_file_db_list.append(amr_dna)
        else:
            print("No file AMR_CDS detected for indexing")
        if Path(amr_prot).exists():
            self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
        else:
            print("No file AMRProt detected for indexing")
        for file in nucl_file_db_list:
            self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl")

    def get_amrfinderplus_version(self, version_file="version.txt",
                                  database_version_file="database_format_version.txt"):
        """
        Check the version when latest if provided and update the number
        param version_file: name of the remote file whose first line becomes
            the date version
        param database_version_file: name of the remote file whose first line
            provides the version (only its first two dot separated fields are kept)
        """
        date_buffer = BytesIO()
        format_buffer = BytesIO()
        # The context manager closes the ftp connection even on failure
        with FTP(self._ncbi_ftp_url) as ftp:
            ftp.login(self._login, self._password)
            ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
            ftp.retrbinary(f'RETR {version_file}', date_buffer.write)
            ftp.retrbinary(f'RETR {database_version_file}', format_buffer.write)
        date_line = date_buffer.getvalue().decode("utf-8").splitlines()[0]
        format_line = format_buffer.getvalue().decode("utf-8").splitlines()[0]
        self._amrfinderplus_date_version = date_line
        self._amrfinderplus_version = '.'.join(format_line.split(".")[:2])

    def read_json_input_file(self):
        """
        Import the json file, create the folder given as extra_files_path
        of the first output and use it as output directory
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write the data manager information in the imported json file
        (its previous content is overwritten)
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)
223
224
def parse_arguments():
    """
    Declare the command line interface and parse the user input
    return: the namespace holding the parsed arguments
    """
    parser = argparse.ArgumentParser()
    # One (name, add_argument options) pair per argument, in declaration order
    argument_specs = (
        ("data_manager_json",
         dict(help="json file from galaxy")),
        ("--db_version",
         dict(default="latest",
              help="select the major version of the database (e.g. 3.10, 3.8), default is latest")),
        ("--db_date",
         dict(help="select the date into the database version (e.g. 2022-10-11.2)")),
        ("--test",
         dict(action='store_true',
              help="option to test the script with an lighted database")),
    )
    for name, options in argument_specs:
        parser.add_argument(name, **options)
    return parser.parse_args()
241
242
def main():
    """
    Script entry point: parse the command line, then read the galaxy json,
    download the database, build the hmm and blast indexes and finally
    write the data manager json
    """
    cli = parse_arguments()
    downloader = DownloadAmrFinderPlusDatabase(
        amrfinderplus_version=cli.db_version,
        date_version=cli.db_date,
        json_file_path=cli.data_manager_json,
        test_mode=cli.test)
    # The steps depend on each other and must run in this order
    pipeline = (
        downloader.read_json_input_file,
        downloader.download_amrfinderplus_db,
        downloader.make_hmm_profile,
        downloader.make_blastdb,
        downloader.write_json_infos,
    )
    for step in pipeline:
        step()
254
255
# Run the data manager build only when executed as a script, not on import
if __name__ == "__main__":
    main()