Mercurial > repos > iuc > data_manager_build_bracken_database
changeset 7:174a754bd3b6 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_bracken_database commit a108f20aebc04574a8bd0a90b955064439a50852
| author | iuc |
|---|---|
| date | Wed, 05 Nov 2025 13:32:09 +0000 |
| parents | 84cc0dc92b0c |
| children | |
| files | data_manager/bracken_build_database.py data_manager/bracken_build_database.xml data_manager_conf.xml test-data/kraken2_databases.loc test-data/test_db/database.kraken test-data/test_db/database100mers.kmer_distrib test-data/test_db/database100mers.kraken |
| diffstat | 7 files changed, 111 insertions(+), 130 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/bracken_build_database.py Wed Mar 06 14:09:08 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function - -import argparse -import errno -import json -import os -import subprocess -import uuid - - -DATA_TABLE_NAME = "bracken_databases" - - -def bracken_build_database(target_directory, bracken_build_args, database_name, prebuilt=False, data_table_name=DATA_TABLE_NAME): - - database_value = str(uuid.uuid4()) - - database_name = database_name - - database_path = os.path.join(bracken_build_args['kraken_database'], 'database' + str(bracken_build_args['read_len']) + 'mers.kmer_distrib') - - if not prebuilt: - bracken_build_args_list = [ - '-t', bracken_build_args['threads'], - '-k', bracken_build_args['kmer_len'], - '-l', bracken_build_args['read_len'], - '-d', bracken_build_args['kraken_database'], - ] - - subprocess.check_call(['bracken-build'] + bracken_build_args_list) - - data_table_entry = { - "data_tables": { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('data_manager_json') - parser.add_argument('--threads', dest='threads', default=1, help='threads') - parser.add_argument('--kmer-len', dest='kmer_len', help='K-mer length') - parser.add_argument('--read-len', dest='read_len', help='Read length') - parser.add_argument('--kraken-db', dest='kraken_database', help='Kraken Database') - parser.add_argument('--database-name', dest='database_name', help='Database Name') - parser.add_argument('--prebuilt', action='store_true', dest='prebuilt', help='Use pre-built DB') - args = parser.parse_args() - - with open(args.data_manager_json) as fh: - data_manager_input = json.load(fh) - - target_directory = data_manager_input['output_data'][0]['extra_files_path'] - - if args.prebuilt: - bracken_build_args = { - 
'threads': args.threads, - 'read_len': args.read_len, - 'kraken_database': args.kraken_database, - } - else: - bracken_build_args = { - 'threads': args.threads, - 'kmer_len': args.kmer_len, - 'read_len': args.read_len, - 'kraken_database': args.kraken_database, - } - - try: - os.mkdir(target_directory) - except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(target_directory): - pass - else: - raise - - data_manager_output = {} - - data_manager_output = bracken_build_database( - target_directory, - bracken_build_args, - args.database_name, - args.prebuilt, - ) - - with open(args.data_manager_json, 'w') as fh: - json.dump(data_manager_output, fh, sort_keys=True) - - -if __name__ == "__main__": - main()
--- a/data_manager/bracken_build_database.xml Wed Mar 06 14:09:08 2024 +0000 +++ b/data_manager/bracken_build_database.xml Wed Nov 05 13:32:09 2025 +0000 @@ -1,37 +1,77 @@ -<?xml version="1.0"?> <tool id="bracken_build_database" name="Bracken Database Builder" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description>bracken database builder</description> <macros> - <token name="@TOOL_VERSION@">2.8</token> + <token name="@TOOL_VERSION@">3.1</token> <token name="@VERSION_SUFFIX@">0</token> - <token name="@PROFILE@">22.01</token> + <token name="@PROFILE@">24.0</token> </macros> <requirements> <requirement type="package" version="@TOOL_VERSION@">bracken</requirement> </requirements> <command> <![CDATA[ - #import os - #set db_dir = os.path.basename($kraken_db.fields.path) - + #set $db_dir = $out_file.extra_files_path mkdir '$db_dir' && - ln -s '${kraken_db.fields.path}/*' '$db_dir/' && - python '$__tool_directory__/bracken_build_database.py' - '${out_file}' - - --threads \${GALAXY_SLOTS:-1} - #if $check_prebuilt.prebuilt == "no" - --kraken-db '$db_dir' - --kmer-len ${check_prebuilt.kmer_len} - #end if - #if $check_prebuilt.prebuilt == "yes" - --kraken-db '${kraken_db.fields.path}' - --prebuilt - #end if - --read-len ${check_prebuilt.read_len} - --database-name '${database_name}' + + ## Create symlinks to the contents of the kraken database, since otherwise + ## the DM tool (ie bracken-build) would write to the kraken DB folder. + ## + ## These symlinks will be part of the final db_dir which will be moved + ## to the tool-data (except for some of the pre-exsting *mers.kraken + ## and *mers.kmer_distrib which may be deleted -- depending on the + ## choice of prebuilt). 
+ ## Hence the final db_dir will have + ## - either a created mers.kmer_distrib file or a symlink pointing to the kraken2 db source + ## - either a created mers.kraken file or a symlink pointing to the kraken2 db source + ## - and symlinks to the krakenDB + ## Note that in the data table entry only the path to mers.kmer_distrib file is used + ln -s '${kraken_db.fields.path}'/* '$db_dir/' && + + #if $check_prebuilt.prebuilt == "no" + ## remove any other pre-existing bracken-build results + find '$db_dir' -maxdepth 1 -name "*mers*" -delete && + bracken-build + -t "\${GALAXY_SLOTS:-1}" + -k $check_prebuilt.kmer_len + -l $check_prebuilt.read_len + -d '$db_dir/' + #else + if [ ! -e '$db_dir/database${check_prebuilt.read_len}mers.kmer_distrib' ]; then + >&2 echo "Have you chosen the correct k-mer length? file $db_dir/database${check_prebuilt.read_len}mers.kmer_distrib does not exist."; + exit 1; + fi + ## delete any other (links to) prebuilt bracken databases (except the one of the chosen read_len) + && find '$db_dir' -maxdepth 1 -name "*mers.kraken" -a \! -name "*${check_prebuilt.read_len}mers.kraken" -delete + && find '$db_dir' -maxdepth 1 -name "*mers.kmer_distrib" -a \! 
-name "*${check_prebuilt.read_len}mers.kmer_distrib" -delete + #end if + && cp '$dmjson' '$out_file' ]]> </command> + <configfiles> + <configfile name="dmjson"><![CDATA[#slurp +#import uuid +#set $database_value = str(uuid.uuid4()) +#set $db_dir = $out_file.extra_files_path + +#if $database_name == "" + #if $check_prebuilt.prebuilt == "no" + #set $database_name = $kraken_db.fields.name + " (k-mer length = " + str($check_prebuilt.kmer_len) +", read length = " + str($check_prebuilt.read_len) + ")" + #else + #set $database_name = $kraken_db.fields.name + " (Prebuilt bracken DB with read length = " + str($check_prebuilt.read_len) + ")" + #end if +#end if +{ + "data_tables":{ + "bracken_databases":[ + { + "value": "$database_value", + "name": "$database_name", + "path": "$db_dir/database${check_prebuilt.read_len}mers.kmer_distrib" + } + ] + } +}]]></configfile> + </configfiles> <inputs> <param name="kraken_db" type="select"> <options from_data_table="kraken2_databases"> @@ -58,8 +98,8 @@ <option value="300">300</option> </param> </when> - </conditional> - <param name="database_name" type="text" label="Database Name" help="Please add a clear reference to the corresponding kraken2 DB the read length to the name." /> + </conditional> + <param name="database_name" type="text" label="Database Name" help="If left empty a name will be constructed from the name of the Kraken2 database suffixed by info on read and k-mer length. Otherwise please add a clear reference to the corresponding kraken2 DB the read length and kmer length to the name." 
/> </inputs> <outputs> <data name="out_file" format="data_manager_json" /> @@ -67,10 +107,29 @@ <tests> <test> <param name="kraken_db" value="test_entry" /> - <param name="database_name" value="database" /> + <param name="database_name" value="" /> + <conditional name="check_prebuilt"> + <param name="prebuilt" value="yes" /> + <param name="read_len" value="100" /> + </conditional> <output name="out_file"> <assert_contents> - <has_text text="test_db/database100mers.kmer_distrib" /> + <has_text text="Prebuilt bracken DB with read length = 100" /> + <has_text text="database100mers.kmer_distrib" /> + </assert_contents> + </output> + </test> + <test> + <param name="kraken_db" value="test_entry" /> + <param name="database_name" value="custom database name" /> + <conditional name="check_prebuilt"> + <param name="prebuilt" value="no" /> + <param name="read_len" value="150" /> + </conditional> + <output name="out_file"> + <assert_contents> + <has_text text="custom database name" /> + <has_text text="database150mers.kmer_distrib" /> </assert_contents> </output> </test> @@ -85,15 +144,16 @@ ====================================== The prebuilt option does use existing bracken DBs, that are shipped with kraken2 DBs. This is the case for DBs downloaded from https://benlangmead.github.io/aws-indexes/k2. -All prebuilt databases contain a Kraken 2 database along with Bracken databases built for 50, 75, 100, 150, 200, 250 and 300 read lengths (refering to the lenght of the sequenced reads to be analysis using that database (one read in case of paired reads)). +All prebuilt databases contain a Kraken 2 database along with Bracken databases built for 50, 75, 100, 150, 200, 250 and 300 read lengths (referring to the length of the sequenced reads to be analysis using that database (one read in case of paired reads)). In this case the data manager points to the same DB as the kracken2 DB. 
+Note the Kraken 2 / Bracken 16s DBs only contain the Bracken databases built for 100mers, 150mers, and 200mers. **The prebuilt option must not be used for custom kraken2 DBs ! For this the bracken DB needs to be built.** ====================================== Building new DBs ====================================== -Use the same K-mer length as the kraken2 DB and choose read lengths that are close to the read length of analysis you want to perform (one read in case of paired reads). +Use the same K-mer length as the kraken2 DB and choose read lengths that are close to the read length of the analysis you want to perform (one read in case of paired reads). </help> <citations> <citation type="doi">10.7717/peerj-cs.104</citation>
--- a/data_manager_conf.xml Wed Mar 06 14:09:08 2024 +0000 +++ b/data_manager_conf.xml Wed Nov 05 13:32:09 2025 +0000 @@ -4,8 +4,25 @@ <output> <column name="value"/> <column name="name"/> - <column name="path" output_ref="out_file"/> + <column name="path" output_ref="out_file"> + <move type="directory" relativize_symlinks="True"> + <!-- we move the top level folder, since the path points to the bracken DB file --> + <source> + #import os + #set base_dir = os.path.dirname($path) + ${base_dir} + </source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">bracken_databases/${value}</target> + </move> + <!-- we store the path to the bracken DB file, since this is what bracken uses --> + <value_translation> + #import os + ${GALAXY_DATA_MANAGER_DATA_PATH}/bracken_databases/${value}/${os.path.basename($path)} + </value_translation> + <value_translation type="function">abspath</value_translation> + </column> </output> + </data_table> </data_manager> </data_managers>
--- a/test-data/kraken2_databases.loc Wed Mar 06 14:09:08 2024 +0000 +++ b/test-data/kraken2_databases.loc Wed Nov 05 13:32:09 2025 +0000 @@ -3,4 +3,4 @@ # - name (Galaxy shows this in the UI) # - path (folder name containing the Kraken DB) # -test_entry "Test Database" ${__HERE__}/test_db +test_entry Test Database ${__HERE__}/test_db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/database.kraken Wed Nov 05 13:32:09 2025 +0000 @@ -0,0 +1,2 @@ +C NC_011750.1 585057 910 585057:11 A:40 585057:117 A:66 585057:32 A:41 585057:569 +C NC_003198.1 585057 910 0:169 A:66 585057:24 0:9 A:41 0:10 585057:65 0:134 585057:2 0:356
