Mercurial > repos > iuc > amrfinderplus_data_manager_build
comparison data_manager/data_manager_build_amrfinderplus.py @ 0:eea0c38a9afd draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_amrfinderplus commit 59077a173599fc9d355a5d36ad7875050dbe3e19
author | iuc |
---|---|
date | Thu, 05 Jan 2023 14:28:07 +0000 |
parents | |
children | a5921c09b7b7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:eea0c38a9afd |
---|---|
1 import argparse | |
2 import json | |
3 import os | |
4 import subprocess as sp | |
5 from ftplib import FTP | |
6 from io import BytesIO | |
7 from pathlib import Path | |
8 | |
9 import pandas as pd | |
10 | |
11 | |
class GetAmrFinderPlusDataManager:
    """
    Assemble the data table dictionary that is dumped as json
    for the galaxy data manager
    """

    def __init__(self,
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 date_version=None):
        # Results, filled in by the two builder methods below
        self.data_table_entry = None
        self.amrfinderplus_table_list = None
        # Settings describing the table and the database entry
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version

    def get_data_table_format(self):
        """
        Build the empty data_table skeleton and keep it on the instance
        return: a dictionary holding one empty entry named after the data table
        """
        skeleton = {"data_tables": {}}
        skeleton["data_tables"][self.data_table_name] = {}
        self.data_table_entry = skeleton
        return skeleton

    def get_data_manager(self):
        """
        Create a fresh skeleton and fill it with the single database entry
        return: The data table with value, name and path of the database
        """
        table = self.get_data_table_format()
        self.amrfinderplus_table_list = table
        version = self._amrfinderplus_version
        date = self._amrfinderplus_date_version
        entry = {
            "value": "amrfinderplus_V{}_{}".format(version, date),
            "name": "V{}-{}".format(version, date),
            "path": self._db_name,
        }
        # The placeholder dictionary is replaced by a one element list
        table["data_tables"][self.data_table_name] = [entry]
        return table
56 | |
57 | |
class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
    """
    Download the amrfinderplus database from the ncbi.
    Make the database available with hmm and indexed files
    Build the data manager infos for galaxy
    """

    def __init__(self,
                 output_dir=None,
                 ncbi_url="ftp.ncbi.nlm.nih.gov",
                 ftp_login="anonymous",
                 ftp_password="anonymous",
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 json_file_path=None,
                 date_version=None,
                 amrfinderplus_db_path=None,
                 test_mode=False):
        """
        param output_dir: directory receiving the database folder,
            the current working directory (read at call time) when None
        param ncbi_url: host name of the ncbi ftp server
        param ftp_login: login used on the ftp server
        param ftp_password: password used on the ftp server
        param amrfinderplus_database: name of the galaxy data table
        param db_name: name of the folder created inside output_dir
        param amrfinderplus_version: database version or "latest"
        param json_file_path: json file read and rewritten for galaxy
        param date_version: date sub-version of the database
        param amrfinderplus_db_path: path of the database folder, set by
            download_amrfinderplus_db
        param test_mode: download and index only a reduced set of files
        """
        # Forward the shared settings so that the data table name follows
        # the amrfinderplus_database argument instead of the parent default
        super().__init__(amrfinderplus_database=amrfinderplus_database,
                         db_name=db_name,
                         amrfinderplus_version=amrfinderplus_version,
                         date_version=date_version)
        self.json_file_path = json_file_path
        # Resolved here and not in the signature: a default argument is
        # evaluated only once, when the class body is executed
        self._output_dir = Path.cwd() if output_dir is None else output_dir
        self._ncbi_ftp_url = ncbi_url
        self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
        self._login = ftp_login
        self._password = ftp_password
        self._amrfinderplus_database = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        self.species_list = None
        self.test_mode = test_mode
        self.amrfinderplus_db_path = amrfinderplus_db_path

    @staticmethod
    def subprocess_cmd(command, *args, cwd=None):
        """
        Method to call external tools with any parameters
        :param command: command name from the tool used (e.g. wget or makeblastdb)
        :param args: free number of argument need for the command tool (e.g. -r, -P ...)
        :param cwd: optional working directory of the launched command
        :return: launch the command line from the system, a failure is
            reported on the standard output and does not raise
        """
        cmd = [command, *args]
        proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE, cwd=cwd)
        if proc.returncode != 0:
            print(f'Error type {proc.returncode} with : \n {proc}')

    def download_amrfinderplus_db(self):
        """
        Download the amrfinderplus database from the ncbi ftp server
        raises ValueError: when a specific version is requested without its date
        """
        if self._amrfinderplus_version != 'latest' and self._amrfinderplus_date_version is None:
            # Without the date the ftp path would end with the literal "None"
            raise ValueError("a database date is required when the version is not 'latest'")
        self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
        os.makedirs(self.amrfinderplus_db_path, exist_ok=True)
        if self._amrfinderplus_version == 'latest':
            # Replace "latest" by the real version and date numbers
            self.get_amrfinderplus_version()

        amrfinderplus_ftp_path = f"ftp://{self._login}:" \
                                 f"{self._password}@{self._ncbi_ftp_url}/" \
                                 f"{self._ncbi_database_path}/" \
                                 f"{self._amrfinderplus_version}/" \
                                 f"{self._amrfinderplus_date_version}"
        if self.test_mode:
            # Reduced database: each file is fetched to an explicit output file
            file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
            for file_name in file_list:
                self.subprocess_cmd("wget",
                                    "-nd",
                                    "-np",
                                    "-r",
                                    f"{amrfinderplus_ftp_path}/{file_name}",
                                    "-O",
                                    f"{self.amrfinderplus_db_path}/{file_name}")
        else:
            # Full database: the whole remote folder goes into the database folder
            self.subprocess_cmd("wget",
                                "-nd",
                                "-np",
                                "-r",
                                amrfinderplus_ftp_path,
                                "-P",
                                self.amrfinderplus_db_path)

    def make_hmm_profile(self):
        """
        Make the hmm profile using the AMR.LIB file previously download
        Nothing is built in test mode
        """
        hmm_file = Path(f"{self.amrfinderplus_db_path}/AMR.LIB")
        if hmm_file.exists() and not self.test_mode:
            self.subprocess_cmd("hmmpress", "-f", hmm_file)
        else:
            print("hmm_file file is missing to make hmm profiles")

    def extract_filelist_makeblast(self):
        """
        Extract the list of species which have file in the database
        The list is stored in species_list, it is empty when taxgroup.tab is missing
        return: a filtered species list of available species in the database
        """
        taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
        if not taxa_group_path.exists():
            print("taxgroup.tab file is missing to list available species")
            # An empty list keeps make_blastdb iterable instead of failing on None
            self.species_list = []
            return self.species_list
        taxa_table = pd.read_table(taxa_group_path)
        taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
        # Keep only the groups having at least one nucleotide reference gene
        species = taxa_table.loc[taxa_table.number_of_nucl_ref_genes > 0, "taxgroup"]
        if self.test_mode:
            species = species[species == "Escherichia"]
        self.species_list = list(species)
        return self.species_list

    def make_blastdb(self):
        """
        Index fasta file for blast
        The commands run inside the database folder, the working directory
        of the current process is left unchanged
        """
        self.extract_filelist_makeblast()
        # Absolute path: the commands are started from another directory
        db_path = os.path.abspath(self.amrfinderplus_db_path)
        nucl_file_db_list = [f'{db_path}/AMR_DNA-{specie}' for specie in self.species_list or []]
        amr_dna = f'{db_path}/AMR_CDS'
        amr_prot = f'{db_path}/AMRProt'
        if Path(amr_dna).exists():
            nucl_file_db_list.append(amr_dna)
        else:
            print("No file AMR_CDS detected for indexing")
        if Path(amr_prot).exists():
            self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot", cwd=db_path)
        else:
            print("No file AMRProt detected for indexing")
        for nucl_file in nucl_file_db_list:
            self.subprocess_cmd("makeblastdb", "-in", nucl_file, "-dbtype", "nucl", cwd=db_path)

    def get_amrfinderplus_version(self, version_file="version.txt",
                                  database_version_file="database_format_version.txt"):
        """
        Check the version when latest if provided and update the number
        param version_file: name of the file containing the date version information
        param database_version_file: name of the file containing the format version,
            only its first two dot separated fields are kept
        """
        date_buffer = BytesIO()
        format_buffer = BytesIO()
        # The context manager closes the connection, also when a transfer fails
        with FTP(self._ncbi_ftp_url) as ftp:
            ftp.login(self._login, self._password)
            ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
            ftp.retrbinary(f'RETR {version_file}', date_buffer.write)
            ftp.retrbinary(f'RETR {database_version_file}', format_buffer.write)
        self._amrfinderplus_date_version = date_buffer.getvalue().decode("utf-8").splitlines()[0]
        format_version = format_buffer.getvalue().decode("utf-8").splitlines()[0]
        self._amrfinderplus_version = '.'.join(format_version.split(".")[:2])

    def read_json_input_file(self):
        """
        Import the json file
        The extra_files_path of the first output becomes the output directory
        and is created when it does not exist
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir, exist_ok=True)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write in the imported json file
        The previous content is replaced by the data table informations
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)
223 | |
224 | |
def parse_arguments():
    """
    Declare the command line interface and read the user arguments
    return: parsed arguments
    """
    parser = argparse.ArgumentParser()
    # Only positional argument: the json file handed over by galaxy
    parser.add_argument("data_manager_json", help="json file from galaxy")
    # Optional arguments, declared as (flag, settings) pairs
    option_specs = (
        ("--db_version", {
            "default": "latest",
            "help": "select the major version of the database (e.g. 3.10, 3.8), default is latest",
        }),
        ("--db_date", {
            "help": "select the date into the database version (e.g. 2022-10-11.2)",
        }),
        ("--test", {
            "action": "store_true",
            "help": "option to test the script with an lighted database",
        }),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
241 | |
242 | |
def main():
    """
    Read the user arguments, then run every step of the build in order:
    json import, download, hmm profiles, blast indexes and json export
    """
    cli = parse_arguments()
    builder = DownloadAmrFinderPlusDatabase(
        json_file_path=cli.data_manager_json,
        amrfinderplus_version=cli.db_version,
        date_version=cli.db_date,
        test_mode=cli.test,
    )
    # The order matters: each step works on the output of the previous one
    pipeline = (
        builder.read_json_input_file,
        builder.download_amrfinderplus_db,
        builder.make_hmm_profile,
        builder.make_blastdb,
        builder.write_json_infos,
    )
    for step in pipeline:
        step()


# Run the build only when the file is executed as a script
if __name__ == '__main__':
    main()