comparison data_manager/kraken2_build_database.py @ 0:e4cdf82de430 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 30d1a86d5c8a3fa434e24ff915f85f51e514ceb2
author iuc
date Thu, 30 May 2019 03:03:22 -0400
parents
children 6ba1ec5f86fc
comparison
equal deleted inserted replaced
-1:000000000000 0:e4cdf82de430
1 #!/usr/bin/env python
2
3 from __future__ import print_function
4
5 import argparse
6 import datetime
7 import errno
8 import json
9 import os
10 import shutil
11 import subprocess
12 import sys
13 import tarfile
14 from enum import Enum
15
16 try:
17 # Python3
18 from urllib.request import urlopen
19 except ImportError:
20 from urllib2 import urlopen
21
22
23 DATA_TABLE_NAME = "kraken2_databases"
24
25
class KrakenDatabaseTypes(Enum):
    """Kinds of Kraken2 database this script knows how to build."""

    standard = "standard"
    minikraken = "minikraken"
    special = "special"
    custom = "custom"

    def __str__(self):
        # Print as the bare value (not "KrakenDatabaseTypes.x") so the member
        # reads like the plain string it wraps, e.g. in argparse choices.
        return str(self.value)
34
35
class SpecialDatabaseTypes(Enum):
    """Names of the special (16S) databases accepted by this script."""

    rdp = "rdp"
    greengenes = "greengenes"
    silva = "silva"

    def __str__(self):
        # Print as the bare value rather than "SpecialDatabaseTypes.x".
        return str(self.value)
43
44
class Minikraken2Versions(Enum):
    """MiniKraken2 release identifiers accepted by this script."""

    v1 = "v1"
    v2 = "v2"

    def __str__(self):
        # Print as the bare value rather than "Minikraken2Versions.x".
        return str(self.value)
51
52
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and return its data table entry.

    Runs ``kraken2-build --standard`` followed by ``kraken2-build --clean``,
    both with ``target_directory`` as the working directory.  The ``--db``
    argument is a relative name made of a UTC timestamp and the build
    parameters.

    :param kraken2_args: mapping providing "kmer_len", "minimizer_len",
        "minimizer_spaces" and "threads"
    :param target_directory: working directory for the kraken2-build calls
    :param data_table_name: key used under 'data_tables' in the result
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys "value", "name" and "path"
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    threads = str(kraken2_args["threads"])

    # Machine-friendly identifier; it doubles as the relative database path.
    database_value = "_".join((
        timestamp,
        "standard",
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ))
    database_path = database_value

    # Human-readable label.
    database_name = "Standard (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )

    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]

    # Build first, then clean up in the same database directory.
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
107
108
def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a pre-built MiniKraken2 database and return its data table entry.

    The archive for ``minikraken2_version`` is downloaded to
    ``tmp_data.tar.gz`` in the current working directory, and its regular
    files are extracted (flattened, without their directory components) into
    ``<target_directory>/<database_path>``.  The downloaded archive is removed
    afterwards, also when the download or extraction fails.

    :param minikraken2_version: version string inserted into the download
        URL and into the database value/name (e.g. "v1" or "v2")
    :param target_directory: directory under which the database directory is
        created
    :param data_table_name: key used under 'data_tables' in the result
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys "value", "name" and "path"
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    database_path = database_value
    destination = os.path.join(target_directory, database_path)

    archive_path = 'tmp_data.tar.gz'
    try:
        # download the minikraken2 data
        src = urlopen(
            'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz'
            % minikraken2_version
        )
        # The Python 2 urllib2 response is not a context manager, so close it
        # explicitly instead of using ``with``.
        try:
            with open(archive_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        finally:
            src.close()

        # unpack the downloaded archive to the target directory
        with tarfile.open(archive_path, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    # Drop any directory prefix so every file lands directly
                    # in the database directory.
                    member.name = os.path.basename(member.name)
                    fh.extract(member, destination)
    finally:
        # Do not leave the downloaded archive behind in the working directory.
        if os.path.exists(archive_path):
            os.remove(archive_path)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
156
157
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build one of the special Kraken2 databases and return its data table entry.

    Runs ``kraken2-build --special <type>`` followed by
    ``kraken2-build --clean``, both with ``target_directory`` as the working
    directory.  The ``--db`` argument is a relative name made of a UTC
    timestamp, the special database type and the build parameters.

    :param kraken2_args: mapping providing "special_database_type" (one of
        "rdp", "greengenes", "silva"), "kmer_len", "minimizer_len",
        "minimizer_spaces" and "threads"
    :param target_directory: working directory for the kraken2-build calls
    :param data_table_name: key used under 'data_tables' in the result
    :returns: ``{'data_tables': {data_table_name: [entry]}}`` where ``entry``
        has the keys "value", "name" and "path"
    :raises KeyError: if the special database type has no display name
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Display names used in the human-readable label.
    display_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    special_type = kraken2_args["special_database_type"]
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])

    # Machine-friendly identifier; it doubles as the relative database path.
    database_value = "_".join((
        timestamp,
        special_type,
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ))
    database_path = database_value

    # Human-readable label.
    database_name = "%s (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        display_names[special_type],
        timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )

    threads = str(kraken2_args["threads"])

    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--special', special_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]

    # Build first, then clean up in the same database directory.
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
219
220
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a FASTA file and return its data table entry.

    Runs four ``kraken2-build`` steps in order, each with
    ``target_directory`` as the working directory and ``custom_database_name``
    as ``--db``: ``--download-taxonomy`` (plus ``--skip-maps`` when requested),
    ``--add-to-library``, ``--build`` and ``--clean``.

    :param kraken2_args: mapping providing "threads", "skip_maps",
        "custom_fasta", "kmer_len", "minimizer_len" and "minimizer_spaces"
    :param custom_database_name: used as the database directory name and as
        the "value", "name" and "path" of the returned entry
    :param target_directory: working directory for the kraken2-build calls
    :param data_table_name: key used under 'data_tables' in the result
    :returns: ``{'data_tables': {data_table_name: [entry]}}``
    :raises subprocess.CalledProcessError: if a kraken2-build call exits
        with a non-zero status
    """
    threads = str(kraken2_args["threads"])

    def run_kraken2_build(options, trailing=()):
        # Every step shares the thread count and the database name; only the
        # options in between (and the optional trailing flags) differ.
        command = ['kraken2-build', '--threads', threads]
        command.extend(options)
        command.extend(['--db', custom_database_name])
        command.extend(trailing)
        subprocess.check_call(command, cwd=target_directory)

    # Step 1: taxonomy download, optionally without the accession maps.
    extra_flags = ['--skip-maps'] if kraken2_args['skip_maps'] else []
    run_kraken2_build(['--download-taxonomy'], extra_flags)

    # Step 2: add the user-supplied sequences to the library.
    run_kraken2_build(['--add-to-library', kraken2_args["custom_fasta"]])

    # Step 3: build the database with the requested parameters.
    run_kraken2_build([
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
    ])

    # Step 4: clean up the database directory.
    run_kraken2_build(['--clean'])

    entry = {
        "value": custom_database_name,
        "name": custom_database_name,
        "path": custom_database_name,
    }
    return {'data_tables': {data_table_name: [entry]}}
274
275
def main():
    """Command line entry point: build a Kraken2 database and record it.

    Reads the data manager JSON file given as the positional argument to find
    the target directory (``output_data[0].extra_files_path``), creates that
    directory if needed, builds the database selected by ``--database-type``
    and finally overwrites the same JSON file with the resulting data table
    entry.

    Options that a given database type depends on
    (``--minikraken2-version``, ``--special-database-type``,
    ``--custom-fasta``/``--custom-database-name``) are checked up front;
    a missing one makes argparse print a usage error and exit with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Fail early with a usage error instead of passing None (or the string
    # 'None') on to the download URL or the kraken2-build command line.
    if database_type == 'minikraken' and args.minikraken2_version is None:
        parser.error('--minikraken2-version is required with --database-type minikraken')
    if database_type == 'special' and args.special_database_type is None:
        parser.error('--special-database-type is required with --database-type special')
    if database_type == 'custom':
        if args.custom_fasta is None:
            parser.error('--custom-fasta is required with --database-type custom')
        if args.custom_database_name is None:
            parser.error('--custom-database-name is required with --database-type custom')

    with open(args.data_manager_json) as json_in:
        data_manager_input = json.loads(json_in.read())

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is an error.
        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
            pass
        else:
            raise

    data_manager_output = {}

    # Build parameters shared by every kraken2-build based database type.
    kraken2_args = {
        "kmer_len": args.kmer_len,
        "minimizer_len": args.minimizer_len,
        "minimizer_spaces": args.minimizer_spaces,
        "threads": args.threads,
    }

    if database_type == 'standard':
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args["special_database_type"] = str(args.special_database_type)
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args["custom_fasta"] = args.custom_fasta
        kraken2_args["skip_maps"] = args.skip_maps
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        sys.exit("Invalid database type")

    # Overwrite the input JSON file with the new data table entry.
    with open(args.data_manager_json, 'w') as json_out:
        json_out.write(json.dumps(data_manager_output))


if __name__ == "__main__":
    main()