Mercurial > repos > iuc > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 0:e4cdf82de430 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 30d1a86d5c8a3fa434e24ff915f85f51e514ceb2
| author | iuc |
|---|---|
| date | Thu, 30 May 2019 03:03:22 -0400 |
| parents | |
| children | 6ba1ec5f86fc |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e4cdf82de430 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 from __future__ import print_function | |
| 4 | |
| 5 import argparse | |
| 6 import datetime | |
| 7 import errno | |
| 8 import json | |
| 9 import os | |
| 10 import shutil | |
| 11 import subprocess | |
| 12 import sys | |
| 13 import tarfile | |
| 14 from enum import Enum | |
| 15 | |
| 16 try: | |
| 17 # Python3 | |
| 18 from urllib.request import urlopen | |
| 19 except ImportError: | |
| 20 from urllib2 import urlopen | |
| 21 | |
| 22 | |
# Default key used under 'data_tables' in the JSON emitted by the builder
# functions below; each builder accepts an override via `data_table_name`.
DATA_TABLE_NAME = "kraken2_databases"
| 24 | |
| 25 | |
class KrakenDatabaseTypes(Enum):
    """Values accepted by the ``--database-type`` command-line option."""

    standard = 'standard'
    minikraken = 'minikraken'
    special = 'special'
    custom = 'custom'

    def __str__(self):
        # Render as the bare value (e.g. 'standard') so that argparse choices
        # and the str() comparisons in main() see the plain option string.
        return self.value
| 34 | |
| 35 | |
class SpecialDatabaseTypes(Enum):
    """Values accepted by the ``--special-database-type`` command-line option."""

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self):
        # Render as the bare value so str(member) can be passed straight to
        # `kraken2-build --special`.
        return self.value
| 43 | |
| 44 | |
class Minikraken2Versions(Enum):
    """Values accepted by the ``--minikraken2-version`` command-line option."""

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Render as the bare value; main() passes str(member) on to the
        # MiniKraken2 builder, which interpolates it into the download URL.
        return self.value
| 51 | |
| 52 | |
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database and return its data table entry.

    Runs ``kraken2-build --standard`` followed by ``kraken2-build --clean``,
    both with ``target_directory`` as the working directory.

    Args:
        kraken2_args: mapping with the keys ``kmer_len``, ``minimizer_len``,
            ``minimizer_spaces`` and ``threads``.
        target_directory: directory in which ``kraken2-build`` is executed;
            the database directory is created relative to it.
        data_table_name: key under ``'data_tables'`` in the returned dict.

    Returns:
        ``{'data_tables': {data_table_name: [{'value', 'name', 'path'}]}}``
        where ``value`` and ``path`` are the timestamped database directory
        name and ``name`` is a human-readable label.

    Raises:
        subprocess.CalledProcessError: if either ``kraken2-build`` call exits
            with a non-zero status.
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])
    threads = str(kraken2_args["threads"])

    database_value = "_".join((
        timestamp,
        "standard",
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ))
    database_name = "Standard (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )
    # The database directory is named after its unique value; it is relative
    # to target_directory because kraken2-build runs with that as its cwd.
    database_path = database_value

    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--standard',
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
| 107 | |
| 108 | |
def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download a prebuilt MiniKraken2 database and return its data table entry.

    The 8GB ``201904_UPDATE`` archive for the requested version is fetched
    from ``ftp.ccb.jhu.edu`` into a temporary ``tmp_data.tar.gz`` in the
    current working directory, its regular files are unpacked (flattened)
    into ``<target_directory>/<database value>``, and the temporary archive
    is removed afterwards.

    Args:
        minikraken2_version: version string interpolated into the download
            URL and the database name (the command line offers ``v1``/``v2``).
        target_directory: directory under which the database directory is
            created.
        data_table_name: key under ``'data_tables'`` in the returned dict.

    Returns:
        ``{'data_tables': {data_table_name: [{'value', 'name', 'path'}]}}``
        where ``value`` and ``path`` are the timestamped database directory
        name and ``name`` is a human-readable label.
    """
    # Local import keeps this block self-contained; closing() also works on
    # Python 2, where the urlopen() result is not a context manager.
    from contextlib import closing

    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    database_path = database_value

    archive_path = 'tmp_data.tar.gz'
    url = (
        'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken2_%s_8GB_201904_UPDATE.tgz'
        % minikraken2_version
    )

    try:
        # download the minikraken2 data, releasing the connection when done
        with closing(urlopen(url)) as src, open(archive_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        # unpack the downloaded archive to the target directory
        with tarfile.open(archive_path, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    # Drop any leading directories so every file lands
                    # directly inside the database directory.
                    member.name = os.path.basename(member.name)
                    fh.extract(member, os.path.join(target_directory, database_path))
    finally:
        # The archive is only a staging file; do not leave it behind,
        # whether or not the download/extraction succeeded.
        if os.path.exists(archive_path):
            os.remove(archive_path)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 156 | |
| 157 | |
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a "special" Kraken2 database and return its data table entry.

    Runs ``kraken2-build --special <type>`` followed by
    ``kraken2-build --clean``, both with ``target_directory`` as the working
    directory.

    Args:
        kraken2_args: mapping with the keys ``special_database_type`` (one of
            ``rdp``, ``greengenes``, ``silva``), ``kmer_len``,
            ``minimizer_len``, ``minimizer_spaces`` and ``threads``.
        target_directory: directory in which ``kraken2-build`` is executed;
            the database directory is created relative to it.
        data_table_name: key under ``'data_tables'`` in the returned dict.

    Returns:
        ``{'data_tables': {data_table_name: [{'value', 'name', 'path'}]}}``
        where ``value`` and ``path`` are the timestamped database directory
        name and ``name`` is a human-readable label.

    Raises:
        KeyError: if ``special_database_type`` has no display name.
        subprocess.CalledProcessError: if either ``kraken2-build`` call exits
            with a non-zero status.
    """
    # Display names used in the human-readable label.
    display_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    special_type = kraken2_args["special_database_type"]
    kmer_len = str(kraken2_args["kmer_len"])
    minimizer_len = str(kraken2_args["minimizer_len"])
    minimizer_spaces = str(kraken2_args["minimizer_spaces"])

    database_value = "_".join((
        timestamp,
        special_type,
        "kmer-len", kmer_len,
        "minimizer-len", minimizer_len,
        "minimizer-spaces", minimizer_spaces,
    ))
    database_name = "%s (Created: %s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s)" % (
        display_names[special_type], timestamp, kmer_len, minimizer_len, minimizer_spaces,
    )
    # The database directory is named after its unique value; it is relative
    # to target_directory because kraken2-build runs with that as its cwd.
    database_path = database_value

    threads = str(kraken2_args["threads"])

    build_command = [
        'kraken2-build',
        '--threads', threads,
        '--special', special_type,
        '--kmer-len', kmer_len,
        '--minimizer-len', minimizer_len,
        '--minimizer-spaces', minimizer_spaces,
        '--db', database_path,
    ]
    clean_command = [
        'kraken2-build',
        '--threads', threads,
        '--clean',
        '--db', database_path,
    ]
    for command in (build_command, clean_command):
        subprocess.check_call(command, cwd=target_directory)

    entry = {
        "value": database_value,
        "name": database_name,
        "path": database_path,
    }
    return {'data_tables': {data_table_name: [entry]}}
| 219 | |
| 220 | |
def kraken2_build_custom(kraken2_args, custom_database_name, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a FASTA file and return its entry.

    Runs four ``kraken2-build`` steps in order, each with
    ``target_directory`` as the working directory: ``--download-taxonomy``
    (optionally with ``--skip-maps``), ``--add-to-library``, ``--build`` and
    ``--clean``.

    Args:
        kraken2_args: mapping with the keys ``custom_fasta``, ``skip_maps``,
            ``kmer_len``, ``minimizer_len``, ``minimizer_spaces`` and
            ``threads``.
        custom_database_name: used as the ``--db`` directory name and as the
            ``value``, ``name`` and ``path`` of the returned entry.
        target_directory: directory in which ``kraken2-build`` is executed.
        data_table_name: key under ``'data_tables'`` in the returned dict.

    Returns:
        ``{'data_tables': {data_table_name: [{'value', 'name', 'path'}]}}``.

    Raises:
        ValueError: if ``custom_fasta`` is missing or empty.
        subprocess.CalledProcessError: if any ``kraken2-build`` call exits
            with a non-zero status.
    """
    custom_fasta = kraken2_args["custom_fasta"]
    if not custom_fasta:
        # Fail before the taxonomy download rather than after it.
        raise ValueError("custom_fasta is required to build a custom database")
    # kraken2-build runs with cwd=target_directory, so a relative path must be
    # anchored to the caller's working directory before it is handed over.
    custom_fasta = os.path.abspath(custom_fasta)

    threads = str(kraken2_args["threads"])

    download_taxonomy = [
        '--threads', threads,
        '--download-taxonomy',
        '--db', custom_database_name,
    ]
    if kraken2_args['skip_maps']:
        download_taxonomy.append('--skip-maps')

    add_to_library = [
        '--threads', threads,
        '--add-to-library', custom_fasta,
        '--db', custom_database_name,
    ]

    build = [
        '--threads', threads,
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--db', custom_database_name,
    ]

    clean = [
        '--threads', threads,
        '--clean',
        '--db', custom_database_name,
    ]

    # Order matters: taxonomy and library must exist before the build step.
    for step_args in (download_taxonomy, add_to_library, build, clean):
        subprocess.check_call(['kraken2-build'] + step_args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": custom_database_name,
                    "name": custom_database_name,
                    "path": custom_database_name
                }
            ]
        }
    }

    return data_table_entry
| 274 | |
| 275 | |
def main():
    """Command-line entry point: build a Kraken2 database and report it.

    Reads the JSON file named by the positional ``data_manager_json``
    argument, takes the target directory from
    ``output_data[0].extra_files_path``, creates that directory if needed,
    builds the database selected by ``--database-type`` and finally
    overwrites the same JSON file with the resulting data table entry.

    Options that only apply to one database type are validated up front;
    a missing one ends the program through ``parser.error`` (usage message,
    exit status 2) before any download or build is started.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    args = parser.parse_args()

    database_type = str(args.database_type)

    # Without these checks an omitted option would be stringified to 'None'
    # (or passed as None) and only fail deep inside the selected builder.
    if database_type == 'minikraken' and args.minikraken2_version is None:
        parser.error('--minikraken2-version is required with --database-type minikraken')
    if database_type == 'special' and args.special_database_type is None:
        parser.error('--special-database-type is required with --database-type special')
    if database_type == 'custom' and not (args.custom_fasta and args.custom_database_name):
        parser.error('--custom-fasta and --custom-database-name are required with --database-type custom')

    with open(args.data_manager_json) as json_in:
        data_manager_input = json.loads(json_in.read())

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is an error.
        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
            pass
        else:
            raise

    data_manager_output = {}

    if database_type == 'standard':
        kraken2_args = {
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args = {
            "special_database_type": str(args.special_database_type),
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args = {
            "custom_fasta": args.custom_fasta,
            "skip_maps": args.skip_maps,
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "threads": args.threads,
        }
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            target_directory,
        )
    else:
        # Not reachable through argparse (choices are enforced); kept as a
        # safeguard should a new enum member be added without a branch here.
        sys.exit("Invalid database type")

    # The input JSON file doubles as the output file: it is overwritten with
    # the data table entry describing the new database.
    with open(args.data_manager_json, 'w') as json_out:
        json_out.write(json.dumps(data_manager_output))
| 351 | |
| 352 | |
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
