Mercurial > repos > iuc > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 0:e4cdf82de430 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 30d1a86d5c8a3fa434e24ff915f85f51e514ceb2
author | iuc |
---|---|
date | Thu, 30 May 2019 03:03:22 -0400 |
parents | |
children | 6ba1ec5f86fc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4cdf82de430 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from __future__ import print_function | |
4 | |
5 import argparse | |
6 import datetime | |
7 import errno | |
8 import json | |
9 import os | |
10 import shutil | |
11 import subprocess | |
12 import sys | |
13 import tarfile | |
14 from enum import Enum | |
15 | |
16 try: | |
17 # Python3 | |
18 from urllib.request import urlopen | |
19 except ImportError: | |
20 from urllib2 import urlopen | |
21 | |
22 | |
# Default key used under 'data_tables' in the JSON written by this script;
# every kraken2_build_* function takes it as its data_table_name default.
DATA_TABLE_NAME = 'kraken2_databases'
24 | |
25 | |
class KrakenDatabaseTypes(Enum):
    """Database kinds accepted by the ``--database-type`` option."""

    standard = "standard"
    minikraken = "minikraken"
    special = "special"
    custom = "custom"

    def __str__(self):
        # Render a member as its bare value (e.g. "standard") rather than
        # the default "KrakenDatabaseTypes.standard".
        return self._value_
34 | |
35 | |
class SpecialDatabaseTypes(Enum):
    """Database names accepted by the ``--special-database-type`` option."""

    rdp = "rdp"
    greengenes = "greengenes"
    silva = "silva"

    def __str__(self):
        # Render a member as its bare value (e.g. "silva") rather than
        # the default "SpecialDatabaseTypes.silva".
        return self._value_
43 | |
44 | |
class Minikraken2Versions(Enum):
    """Versions accepted by the ``--minikraken2-version`` option."""

    v1 = "v1"
    v2 = "v2"

    def __str__(self):
        # Render a member as its bare value (e.g. "v2") rather than
        # the default "Minikraken2Versions.v2".
        return self._value_
51 | |
52 | |
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and describe it as a data table entry.

    Runs ``kraken2-build --standard`` and then ``kraken2-build --clean``,
    both with ``target_directory`` as the working directory. The database
    directory name is derived from the current UTC time and the build
    parameters.

    :param kraken2_args: dict with the keys ``kmer_len``, ``minimizer_len``,
        ``minimizer_spaces`` and ``threads``.
    :param target_directory: working directory for the kraken2-build calls.
    :param data_table_name: key under ``'data_tables'`` in the returned dict.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status.
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    threads = str(kraken2_args["threads"])

    # The identifier doubles as the database directory name (relative to
    # target_directory), so value and path are the same string.
    database_value = "%s_standard_kmer-len_%s_minimizer-len_%s_minimizer-spaces_%s" % (
        timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )
    database_name = "Standard (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )
    database_path = database_value

    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]

    # Build first, then let kraken2-build clean up inside the database directory.
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
107 | |
108 | |
def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a pre-built MiniKraken2 database and describe it as a data table entry.

    The 8GB MiniKraken2 archive for ``minikraken2_version`` is downloaded to
    a temporary ``tmp_data.tar.gz`` in the current working directory. Its
    regular files are then extracted, with their directory components
    stripped, into ``<target_directory>/<database directory>``. The HTTP/FTP
    handle is always closed and the temporary archive is always removed,
    whether or not the download and extraction succeed.

    :param minikraken2_version: version string interpolated into the
        download URL (e.g. ``'v1'`` or ``'v2'``).
    :param target_directory: directory under which the database directory
        is created.
    :param data_table_name: key under ``'data_tables'`` in the returned dict.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    # The identifier doubles as the database directory name.
    database_path = database_value

    archive_path = 'tmp_data.tar.gz'
    url = (
        'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz'
        % minikraken2_version
    )

    try:
        # download the minikraken2 data
        src = urlopen(url)
        try:
            with open(archive_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        finally:
            # Release the network connection even if the copy fails.
            src.close()

        # unpack the downloaded archive to the target directory
        extract_directory = os.path.join(target_directory, database_path)
        with tarfile.open(archive_path, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    # Drop any leading directories so every file lands
                    # directly inside the database directory.
                    member.name = os.path.basename(member.name)
                    fh.extract(member, extract_directory)
    finally:
        # Do not leave the multi-gigabyte archive behind in the working directory.
        if os.path.exists(archive_path):
            os.remove(archive_path)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
156 | |
157 | |
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a "special" Kraken2 database and describe it as a data table entry.

    Runs ``kraken2-build --special <type>`` and then ``kraken2-build --clean``,
    both with ``target_directory`` as the working directory. The database
    directory name is derived from the current UTC time, the special
    database type and the build parameters.

    :param kraken2_args: dict with the keys ``special_database_type``
        (``'rdp'``, ``'greengenes'`` or ``'silva'``), ``kmer_len``,
        ``minimizer_len``, ``minimizer_spaces`` and ``threads``.
    :param target_directory: working directory for the kraken2-build calls.
    :param data_table_name: key under ``'data_tables'`` in the returned dict.
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        holds the ``value``, ``name`` and ``path`` of the new database.
    :raises KeyError: if the special database type has no display name.
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status.
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Human-readable labels used in the display name.
    display_labels = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    special_type = kraken2_args["special_database_type"]
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])

    value_parts = [timestamp, special_type]
    value_parts += ["kmer-len", kmer_len]
    value_parts += ["minimizer-len", minimizer_len]
    value_parts += ["minimizer-spaces", minimizer_spaces]
    database_value = "_".join(value_parts)

    database_name = "%s (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        display_labels[special_type], timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )

    # The identifier doubles as the database directory name.
    database_path = database_value

    threads = str(kraken2_args["threads"])
    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--special', special_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]

    # Build first, then let kraken2-build clean up inside the database directory.
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
219 | |
220 | |
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a FASTA file.

    Issues four ``kraken2-build`` calls in sequence, each with
    ``target_directory`` as the working directory: download the taxonomy,
    add the FASTA file to the library, build the database, and clean up.

    :param kraken2_args: dict with the keys ``threads``, ``skip_maps``,
        ``custom_fasta``, ``kmer_len``, ``minimizer_len`` and
        ``minimizer_spaces``.
    :param custom_database_name: used as the ``--db`` argument and as the
        ``value``, ``name`` and ``path`` of the returned entry.
    :param target_directory: working directory for the kraken2-build calls.
    :param data_table_name: key under ``'data_tables'`` in the returned dict.
    :returns: ``{'data_tables': {data_table_name: [entry]}}``.
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status.
    """
    def run_kraken2_build(options):
        # Every step shares the executable, the thread count and the cwd.
        command = ['kraken2-build', '--threads', str(kraken2_args["threads"])]
        subprocess.check_call(command + options, cwd=target_directory)

    # Step 1: fetch the taxonomy, optionally skipping the accession maps.
    taxonomy_options = ['--download-taxonomy', '--db', custom_database_name]
    if kraken2_args['skip_maps']:
        taxonomy_options.append('--skip-maps')
    run_kraken2_build(taxonomy_options)

    # Step 2: add the user-supplied sequences to the library.
    run_kraken2_build([
        '--add-to-library', kraken2_args["custom_fasta"],
        '--db', custom_database_name,
    ])

    # Step 3: build the database with the requested parameters.
    run_kraken2_build([
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--db', custom_database_name,
    ])

    # Step 4: let kraken2-build clean up inside the database directory.
    run_kraken2_build(['--clean', '--db', custom_database_name])

    entry = {
        "value": custom_database_name,
        "name": custom_database_name,
        "path": custom_database_name,
    }
    return {'data_tables': {data_table_name: [entry]}}
274 | |
275 | |
def main():
    """Command-line entry point: build the requested Kraken2 database.

    Reads the data manager JSON file given as the positional argument and
    takes the target directory from ``output_data[0].extra_files_path``.
    It creates that directory if needed, builds the database selected by
    ``--database-type``, and overwrites the same JSON file with the
    resulting data table entry.

    Options that only apply to one database type are validated up front,
    so a missing value is reported as a usage error rather than failing
    part-way through a build.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    # type=int keeps these two consistent with the other numeric options and
    # rejects non-numeric values before kraken2-build is ever invoked.
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Without these checks a missing option would be stringified to 'None'
    # (minikraken URL) or crash later with a KeyError/TypeError.
    if database_type == 'minikraken' and args.minikraken2_version is None:
        parser.error('--minikraken2-version is required with --database-type minikraken')
    if database_type == 'special' and args.special_database_type is None:
        parser.error('--special-database-type is required with --database-type special')
    if database_type == 'custom' and not (args.custom_fasta and args.custom_database_name):
        parser.error('--custom-fasta and --custom-database-name are required with --database-type custom')

    with open(args.data_manager_json) as json_in:
        data_manager_input = json.load(json_in)

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is a real error.
        if not (exc.errno == errno.EEXIST and os.path.isdir(target_directory)):
            raise

    data_manager_output = {}

    if database_type == 'standard':
        kraken2_args = {
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args = {
            "special_database_type": str(args.special_database_type),
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args = {
            "custom_fasta": args.custom_fasta,
            "skip_maps": args.skip_maps,
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        # Defensive: argparse's choices should already have rejected this.
        sys.exit("Invalid database type")

    # Close the file explicitly so the result is flushed to disk before exit.
    with open(args.data_manager_json, 'w') as json_out:
        json_out.write(json.dumps(data_manager_output))
351 | |
352 | |
# Run the data manager only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()