Mercurial > repos > dfornika > data_manager_build_bracken_database
changeset 0:911bb6c95bf8 draft default tip
"planemo upload for repository https://github.com/dfornika/galaxytools/tree/master/data_managers/data_manager_build_bracken_database/ commit d0b060e7d9cc9fd89926dbc07fc93c8a1471b3fe-dirty"
line wrap: on
line diff
#!/usr/bin/env python
"""Galaxy data manager script that builds a Bracken k-mer distribution.

Repository path: data_manager/bracken_build_database.py

The script runs ``bracken-build`` against an existing Kraken database
directory and then overwrites the data manager JSON file it was given with
a single new entry for the ``bracken_databases`` data table.
"""

from __future__ import print_function

import argparse
import errno
import json
import os
import subprocess
import uuid


DATA_TABLE_NAME = "bracken_databases"


def bracken_build_command(bracken_build_args):
    """Return the ``bracken-build`` argv list for the given settings.

    Args:
        bracken_build_args: dict with the keys ``threads``, ``kmer_len``,
            ``read_len`` and ``kraken_database``.

    Returns:
        A list of strings, starting with ``'bracken-build'``.

    Raises:
        KeyError: if one of the keys is absent.
        ValueError: if one of the values is ``None``.
    """
    options = (
        ('-t', 'threads'),
        ('-k', 'kmer_len'),
        ('-l', 'read_len'),
        ('-d', 'kraken_database'),
    )
    command = ['bracken-build']
    for flag, key in options:
        value = bracken_build_args[key]
        if value is None:
            raise ValueError(
                "bracken-build option %s (%s) must be provided" % (flag, key)
            )
        # subprocess only accepts string arguments; ``threads`` is the
        # integer 1 when --threads is not given on the command line.
        command.extend([flag, str(value)])
    return command


def kmer_distrib_path(kraken_database, read_len):
    """Return the path recorded in the data table for this build.

    The path is ``database<read_len>mers.kmer_distrib`` inside the Kraken
    database directory.
    """
    file_name = 'database' + str(read_len) + 'mers.kmer_distrib'
    return os.path.join(kraken_database, file_name)


def bracken_build_database(target_directory, bracken_build_args, database_name, data_table_name=DATA_TABLE_NAME):
    """Run ``bracken-build`` and describe the result as a data table entry.

    Args:
        target_directory: accepted for interface compatibility; it is not
            used here because the recorded path lives inside the Kraken
            database directory.
        bracken_build_args: dict with the keys ``threads``, ``kmer_len``,
            ``read_len`` and ``kraken_database``.
        database_name: display name stored in the ``name`` column.
        data_table_name: name of the data table receiving the entry.

    Returns:
        A dict of the form ``{"data_tables": {<table>: [<entry>]}}`` where
        the entry has a freshly generated UUID as ``value``.

    Raises:
        subprocess.CalledProcessError: if ``bracken-build`` exits non-zero.
    """
    database_value = str(uuid.uuid4())
    database_path = kmer_distrib_path(
        bracken_build_args['kraken_database'],
        bracken_build_args['read_len'],
    )

    subprocess.check_call(bracken_build_command(bracken_build_args))

    return {
        "data_tables": {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }


def main():
    """Parse the command line, build the database and write the JSON result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--threads', dest='threads', default=1, help='threads')
    # These three were always needed in practice: leaving one out used to
    # end in a TypeError further down instead of a usage message.
    parser.add_argument('--kmer-len', dest='kmer_len', required=True, help='K-mer length')
    parser.add_argument('--read-len', dest='read_len', required=True, help='Read length')
    parser.add_argument('--kraken-db', dest='kraken_database', required=True, help='Kraken Database')
    parser.add_argument('--database-name', dest='database_name', help='Database Name')
    args = parser.parse_args()

    # Close the input handle before the same file is reopened for writing.
    with open(args.data_manager_json) as handle:
        data_manager_input = json.load(handle)

    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    bracken_build_args = {
        'threads': args.threads,
        'kmer_len': args.kmer_len,
        'read_len': args.read_len,
        'kraken_database': args.kraken_database,
    }

    try:
        os.mkdir(target_directory)
    except OSError as exc:
        # An already existing directory is fine; anything else is not.
        if not (exc.errno == errno.EEXIST and os.path.isdir(target_directory)):
            raise

    data_manager_output = bracken_build_database(
        target_directory,
        bracken_build_args,
        args.database_name,
    )

    # The same file serves as both input and output of the data manager.
    with open(args.data_manager_json, 'w') as out:
        out.write(json.dumps(data_manager_output, sort_keys=True))


if __name__ == "__main__":
    main()
<?xml version="1.0"?>
<!-- data_manager/bracken_build_database.xml
     Galaxy data manager tool that calls bracken_build_database.py on a
     Kraken2 database chosen from the kraken2_databases data table. -->
<tool id="bracken_build_database" name="Bracken Database Builder" tool_type="manage_data" version="2.5_galaxy0" profile="19.01">
    <description>bracken database builder</description>
    <requirements>
        <requirement type="package" version="2.5">bracken</requirement>
        <requirement type="package" version="2.0.8_beta">kraken2</requirement>
    </requirements>
    <command>
    <![CDATA[
        python '$__tool_directory__/bracken_build_database.py'
          '${out_file}'
          --kraken-db '${kraken_db.fields.path}'
          --threads \${GALAXY_SLOTS:-1}
          --kmer-len ${kmer_len}
          --read-len ${read_len}
          --database-name '${database_name}'
    ]]>
    </command>
    <inputs>
        <!-- The select is populated from the kraken2_databases data table;
             the script receives the entry's path column. -->
        <param name="kraken_db" type="select" label="Kraken2 database">
            <options from_data_table="kraken2_databases">
                <validator type="no_options" message="No Kraken2 databases are available" />
            </options>
        </param>
        <param name="kmer_len" type="integer" min="8" max="256" value="35" label="K-mer length" />
        <param name="read_len" type="integer" min="8" max="1000" value="100" label="Read length" />
        <param name="database_name" type="text" label="Database Name" />
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <tests>
        <test>
            <param name="kraken_db" value="test_entry" />
            <param name="database_name" value="database" />
            <output name="out_file">
                <assert_contents>
                    <has_text text="test_db/database100mers.kmer_distrib" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Runs ``bracken-build`` on the selected Kraken2 database with the given
k-mer length and read length, then adds an entry to the
``bracken_databases`` data table.

**Inputs**

- *Kraken2 database*: an entry of the ``kraken2_databases`` data table.
- *K-mer length*: passed to ``bracken-build`` as ``-k``.
- *Read length*: passed to ``bracken-build`` as ``-l``.
- *Database Name*: the name shown for the new data table entry.

**Output**

The new entry's path is ``database<read length>mers.kmer_distrib`` inside
the directory of the selected Kraken2 database.
]]>
    </help>
    <citations>
        <citation type="doi">10.7717/peerj-cs.104</citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,11 @@ +<data_managers> + <data_manager tool_file="data_manager/bracken_build_database.xml" id="bracken_build_database" version="2.5_galaxy0"> + <data_table name="bracken_databases"> + <output> + <column name="value"/> + <column name="name"/> + <column name="path" output_ref="out_file"/> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/kraken2_databases.loc Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,6 @@ +# Tab separated with three columns: +# - value (Galaxy records this in the Galaxy DB) +# - name (Galaxy shows this in the UI) +# - path (folder name containing the Kraken DB) +# +test_entry "Test Database" ${__HERE__}/test_db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/nodes_patterns.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,15 @@ +^220341\s +^90370\s +^59201\s +^28901\s +^590\s +^543\s +^91347\s +^1236\s +^1224\s +^2\s +^131567\s +^1\s +^585057\s +^562\s +^561\s
#!/bin/bash
# test-data/reproduce_test_dataset.sh
#
# This script produces a small kraken2 database containing only a ~1kb portion
# each of a salmonella and ecoli genome.
# It requires kraken2, and entrez-direct (available on bioconda).
# Run it from the directory that holds nodes_patterns.txt.

# Stop at the first failing step (including a failure inside a pipeline or a
# grep that matches nothing) instead of building a database from partial input.
set -euo pipefail

# Download the full NCBI taxonomy; it is trimmed down below.
kraken2-build --db test_db --download-taxonomy

# Keep only the accession-to-taxid rows of the two genomes used in the test.
mv test_db/taxonomy/nucl_gb.accession2taxid test_db/taxonomy/nucl_gb.accession2taxid_full
grep -e 'NC_003198.1' -e 'NC_011750.1' test_db/taxonomy/nucl_gb.accession2taxid_full > test_db/taxonomy/nucl_gb.accession2taxid

# Keep only the taxonomy nodes whose ids are listed in nodes_patterns.txt.
mv test_db/taxonomy/nodes.dmp test_db/taxonomy/nodes.dmp_full
grep -f nodes_patterns.txt test_db/taxonomy/nodes.dmp_full > test_db/taxonomy/nodes.dmp

# Keep only the names of the two taxa the sequences are assigned to.
mv test_db/taxonomy/names.dmp test_db/taxonomy/names.dmp_full
grep -e '^220341\s' -e '^585057\s' test_db/taxonomy/names.dmp_full > test_db/taxonomy/names.dmp

# Fetch both genomes and keep the first 14 lines of each FASTA file.
esearch -db nucleotide -query "NC_003198.1" | efetch -format fasta > NC_003198.1.fasta
esearch -db nucleotide -query "NC_011750.1" | efetch -format fasta > NC_011750.1.fasta
head -n 14 NC_003198.1.fasta > NC_003198.1_1kb.fasta
head -n 14 NC_011750.1.fasta > NC_011750.1_1kb.fasta

# Add the truncated sequences to the library and build the database.
kraken2-build --db test_db --add-to-library NC_003198.1_1kb.fasta
kraken2-build --db test_db --add-to-library NC_011750.1_1kb.fasta
kraken2-build --db test_db --build
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/9C7DdW7GAD.fna Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,17 @@ +>NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome +AGAGATTACGTCTGGTTGCAAGAGATCATAACAGGGGAAATTGATTGAAAATAAATATAT +CGCCAGCAGCACATGAACAAGTTTCGGAATGTGATCAATTTAAAAATTTATTGACTTAGG +CGGGCAGATACTTTAACCAATATAGGAATACAAGACAGACAAATAAAAATGACAGAGTAC +ACAACATCCATGAACCGCATCAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGT +AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGAACAGTGCGG +GCxxxxxxxxCGACCAGAGATCACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT +ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATTCC +AGGCAAGGGCAGGTAGCGACCGTACTTTCCGCCCCCGCGAAAATTACCAACCATCTGGTG +GCGATGATTGAAAAAACTATCGGCGGCCAGGATGCTTTGCCGAATATCAGCGATGCCGAA +CGTATTTTTTCTGACCTGCTCGCAGGACTTGCCAGCGCGCAGCCGGGATTCCCGCTTGCA +CGGTTGAAAATGGTTGTCGAACAAGAATTCGCTCAGATCAAACATGTTTTGCATGGTATC +AGCCTGCTGGGTCAGTGCCCGGATAGCATCAACGCCGCGCTGATTTGCCGTGGCGAAAAA +ATGTCGATCGCGATTATGGCGGGACTCCTGGAGGCGCGTGGACATCGCGTCACGGTGATC +GATCCGGTAGAAAAACTGCTGGCGGTGGGCCATTACCTTGAATCTACCGTCGATATCGCG +GAATCGACTCGCCGTATCGCCGCCAGCCAGATCCCGGCCGATCACATGATCCTGATGGCG +GGCTTTACTG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/cWk1IBlK73.fna Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,17 @@ +>NC_011750.1 Escherichia coli IAI39 chromosome, complete genome +GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTxxxxxxxGAGTGTCT +GATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGT +CACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACA +CAACATCCATGAAACGCATTAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGTA +ACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGC +xxxxxxxxCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTAC +ATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAG +GCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGC +GATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACG +TATTTTTGCCGAACTTCTGACGGGACTCGCCGCTGCCCAACCGGGATTCCCGCTGGCGCA +ACTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAG +TTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAAT +GTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACCGTTATCGA +TCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGA +GTCCACCCGCCGTATTGCGGCAAGTCGTATTCCGGCTGATCACATGGTGCTGATGGCAGG +TTTCACCGCC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,2 @@ +ACCNUM NC_011750.1 NC_011750 +ACCNUM NC_003198.1 NC_003198
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,1 @@ +ACCNUM NC_003198.1 NC_003198
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,1 @@ +ACCNUM NC_011750.1 NC_011750
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/seqid2taxid.map Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,1 @@ +NC_011750.1 585057
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/names.dmp Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,5 @@ +220341 | Salmonella enterica subsp. enterica serovar Typhi CT18 | | equivalent name | +220341 | Salmonella enterica subsp. enterica serovar Typhi str. CT18 | | scientific name | +220341 | Salmonella enterica subsp. enterica serovar Typhi strain CT18 | | equivalent name | +220341 | Salmonella typhi CT18 | | equivalent name | +585057 | Escherichia coli IAI39 | | scientific name |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/nodes.dmp Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,15 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +562 | 561 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +590 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +28901 | 590 | species | SE | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +59201 | 28901 | subspecies | SE | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +90370 | 59201 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +220341 | 90370 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +585057 | 562 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/nucl_gb.accession2taxid Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,2 @@ +NC_003198 NC_003198.1 220341 16758993 +NC_011750 NC_011750.1 585057 218698419
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/prelim_map.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,2 @@ +ACCNUM NC_011750.1 NC_011750 +ACCNUM NC_003198.1 NC_003198
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/unmapped.txt Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,1 @@ +NC_003198
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of bracken databases in the required format --> + <table name="bracken_databases" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/bracken_databases.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Oct 24 17:44:45 2019 -0400 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of Kraken database in the required format --> + <table name="kraken2_databases" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/kraken2_databases.loc" /> + </table> +</tables>