changeset 0:c2e4127fb5bf draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bracken_database/ commit 74e81c69c8806d98beb15a889741bcd702866ce3"
author iuc
date Sun, 20 Oct 2019 09:37:21 -0400
parents
children 9059edc96a85
files data_manager/bracken_build_database.py data_manager/bracken_build_database.xml data_manager_conf.xml test-data/kraken2_databases.loc test-data/nodes_patterns.txt test-data/reproduce_test_dataset.sh test-data/test_db/hash.k2d test-data/test_db/library/added/9C7DdW7GAD.fna test-data/test_db/library/added/9C7DdW7GAD.fna.masked test-data/test_db/library/added/cWk1IBlK73.fna test-data/test_db/library/added/cWk1IBlK73.fna.masked test-data/test_db/library/added/prelim_map.txt test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt test-data/test_db/opts.k2d test-data/test_db/seqid2taxid.map test-data/test_db/taxo.k2d test-data/test_db/taxonomy/names.dmp test-data/test_db/taxonomy/nodes.dmp test-data/test_db/taxonomy/nucl_gb.accession2taxid test-data/test_db/taxonomy/prelim_map.txt test-data/test_db/unmapped.txt tool-data/bracken_databases.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 22 files changed, 268 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/bracken_build_database.py	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import argparse
+import errno
+import json
+import os
+import subprocess
+import uuid
+
+
+DATA_TABLE_NAME = "bracken_databases"
+
+
+def bracken_build_database(target_directory, bracken_build_args, database_name, data_table_name=DATA_TABLE_NAME):
+    """Run ``bracken-build`` and return the Galaxy data table entry for the result.
+
+    ``bracken_build_args`` must provide ``threads``, ``kmer_len``, ``read_len`` and
+    ``kraken_database``. ``target_directory`` is accepted for interface
+    compatibility but is not used here: the reported path points inside the
+    Kraken database directory. Raises ``subprocess.CalledProcessError`` if
+    ``bracken-build`` exits with a non-zero status.
+    """
+    database_value = str(uuid.uuid4())
+    kraken_database = bracken_build_args['kraken_database']
+    read_len = str(bracken_build_args['read_len'])
+    database_path = os.path.join(kraken_database, 'database' + read_len + 'mers.kmer_distrib')
+
+    # subprocess only accepts strings; argparse defaults (e.g. threads=1) may be ints.
+    bracken_build_args_list = [
+        '-t', str(bracken_build_args['threads']),
+        '-k', str(bracken_build_args['kmer_len']),
+        '-l', read_len,
+        '-d', str(kraken_database),
+    ]
+    subprocess.check_call(['bracken-build'] + bracken_build_args_list)
+
+    return {
+        "data_tables": {
+            data_table_name: [
+                {"value": database_value, "name": database_name, "path": database_path},
+            ]
+        }
+    }
+
+
+def main():
+    """Parse the command line, run bracken-build and write the data manager JSON.
+
+    Reads Galaxy's parameter JSON from ``data_manager_json`` and overwrites that
+    same file with the resulting data table entry.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('data_manager_json')
+    parser.add_argument('--threads', dest='threads', default=1, help='threads')
+    parser.add_argument('--kmer-len', dest='kmer_len', help='K-mer length')
+    parser.add_argument('--read-len', dest='read_len', help='Read length')
+    parser.add_argument('--kraken-db', dest='kraken_database', help='Kraken Database')
+    parser.add_argument('--database-name', dest='database_name', help='Database Name')
+    args = parser.parse_args()
+
+    # Close the input handle before the same path is reopened for writing below.
+    with open(args.data_manager_json) as fh:
+        data_manager_input = json.load(fh)
+    target_directory = data_manager_input['output_data'][0]['extra_files_path']
+
+    bracken_build_args = {
+        'threads': args.threads,
+        'kmer_len': args.kmer_len,
+        'read_len': args.read_len,
+        'kraken_database': args.kraken_database,
+    }
+
+    try:
+        os.mkdir(target_directory)
+    except OSError as exc:
+        # An already-existing directory is fine; anything else is a real error.
+        if not (exc.errno == errno.EEXIST and os.path.isdir(target_directory)):
+            raise
+
+    data_manager_output = bracken_build_database(target_directory, bracken_build_args, args.database_name)
+
+    # Overwrite the JSON file that was read above with the data table entry.
+    with open(args.data_manager_json, 'w') as out:
+        out.write(json.dumps(data_manager_output, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/bracken_build_database.xml	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<tool id="bracken_build_database" name="Bracken Database Builder" tool_type="manage_data" version="2.5+galaxy0" profile="19.01">
+    <description>bracken database builder</description>
+    <requirements>
+        <requirement type="package" version="2.5">bracken</requirement>
+        <requirement type="package" version="2.0.8_beta">kraken2</requirement>
+    </requirements>
+    <command><![CDATA[
+        python '$__tool_directory__/bracken_build_database.py'
+          '${out_file}'
+          --kraken-db '${kraken_db.fields.path}'
+          --threads \${GALAXY_SLOTS:-1}
+          --kmer-len ${kmer_len}
+          --read-len ${read_len}
+          --database-name '${database_name}'
+    ]]></command>
+    <inputs>
+        <param name="kraken_db" type="select" label="Kraken2 database" help="Kraken2 database to build the Bracken k-mer distribution for">
+            <options from_data_table="kraken2_databases">
+                <validator type="no_options" message="No Kraken2 databases are available" />
+            </options>
+        </param>
+        <param name="kmer_len" type="integer" min="8" max="256" value="35" label="K-mer length" help="Should match the k-mer length used when the Kraken2 database was built" />
+        <param name="read_len" type="integer" min="8" max="1000" value="100" label="Read length" />
+        <param name="database_name" type="text" label="Database Name" />
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="kraken_db" value="test_entry" />
+            <param name="database_name" value="database" />
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="test_db/database100mers.kmer_distrib" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+Runs ``bracken-build`` on the selected Kraken2 database to generate the k-mer distribution file for the given k-mer and read length.
+The file ``database<read length>mers.kmer_distrib`` in the Kraken2 database directory is registered in the ``bracken_databases`` data table.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.7717/peerj-cs.104</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,11 @@
+<data_managers>
+    <data_manager tool_file="data_manager/bracken_build_database.xml" id="bracken_build_database" version="2.5+galaxy0">
+        <data_table name="bracken_databases">
+            <output>
+                <column name="value"/>
+                <column name="name"/>
+                <column name="path" output_ref="out_file"/>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/kraken2_databases.loc	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,6 @@
+# Tab separated with three columns:
+# - value (Galaxy records this in the Galaxy DB)
+# - name (Galaxy shows this in the UI)
+# - path (folder name containing the Kraken DB)
+#
+test_entry	"Test Database"	${__HERE__}/test_db
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nodes_patterns.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,15 @@
+^220341\s
+^90370\s
+^59201\s
+^28901\s
+^590\s
+^543\s
+^91347\s
+^1236\s
+^1224\s
+^2\s
+^131567\s
+^1\s
+^585057\s
+^562\s
+^561\s
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reproduce_test_dataset.sh	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# This script produces a small kraken2 database containing only a ~1kb portion each of a Salmonella and an E. coli genome.
+# It requires kraken2 and entrez-direct (available on bioconda); run it from the directory containing nodes_patterns.txt.
+kraken2-build --db test_db --download-taxonomy
+mv test_db/taxonomy/nucl_gb.accession2taxid test_db/taxonomy/nucl_gb.accession2taxid_full
+grep -e 'NC_003198.1' -e 'NC_011750.1' test_db/taxonomy/nucl_gb.accession2taxid_full > test_db/taxonomy/nucl_gb.accession2taxid
+mv test_db/taxonomy/nodes.dmp test_db/taxonomy/nodes.dmp_full
+grep -f nodes_patterns.txt test_db/taxonomy/nodes.dmp_full > test_db/taxonomy/nodes.dmp
+mv test_db/taxonomy/names.dmp test_db/taxonomy/names.dmp_full
+grep -e '^220341\s' -e '^585057\s' test_db/taxonomy/names.dmp_full > test_db/taxonomy/names.dmp
+esearch -db nucleotide -query "NC_003198.1" | efetch -format fasta > NC_003198.1.fasta
+esearch -db nucleotide -query "NC_011750.1" | efetch -format fasta > NC_011750.1.fasta
+head -n 14 NC_003198.1.fasta > NC_003198.1_1kb.fasta
+head -n 14 NC_011750.1.fasta > NC_011750.1_1kb.fasta
+kraken2-build --db test_db --add-to-library NC_003198.1_1kb.fasta
+kraken2-build --db test_db --add-to-library NC_011750.1_1kb.fasta
+kraken2-build --db test_db --build
Binary file test-data/test_db/hash.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/9C7DdW7GAD.fna	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,17 @@
+>NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome
+AGAGATTACGTCTGGTTGCAAGAGATCATAACAGGGGAAATTGATTGAAAATAAATATAT
+CGCCAGCAGCACATGAACAAGTTTCGGAATGTGATCAATTTAAAAATTTATTGACTTAGG
+CGGGCAGATACTTTAACCAATATAGGAATACAAGACAGACAAATAAAAATGACAGAGTAC
+ACAACATCCATGAACCGCATCAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGT
+AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGAACAGTGCGG
+GCxxxxxxxxCGACCAGAGATCACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
+ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATTCC
+AGGCAAGGGCAGGTAGCGACCGTACTTTCCGCCCCCGCGAAAATTACCAACCATCTGGTG
+GCGATGATTGAAAAAACTATCGGCGGCCAGGATGCTTTGCCGAATATCAGCGATGCCGAA
+CGTATTTTTTCTGACCTGCTCGCAGGACTTGCCAGCGCGCAGCCGGGATTCCCGCTTGCA
+CGGTTGAAAATGGTTGTCGAACAAGAATTCGCTCAGATCAAACATGTTTTGCATGGTATC
+AGCCTGCTGGGTCAGTGCCCGGATAGCATCAACGCCGCGCTGATTTGCCGTGGCGAAAAA
+ATGTCGATCGCGATTATGGCGGGACTCCTGGAGGCGCGTGGACATCGCGTCACGGTGATC
+GATCCGGTAGAAAAACTGCTGGCGGTGGGCCATTACCTTGAATCTACCGTCGATATCGCG
+GAATCGACTCGCCGTATCGCCGCCAGCCAGATCCCGGCCGATCACATGATCCTGATGGCG
+GGCTTTACTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/cWk1IBlK73.fna	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,17 @@
+>NC_011750.1 Escherichia coli IAI39 chromosome, complete genome
+GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTxxxxxxxGAGTGTCT
+GATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGT
+CACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACA
+CAACATCCATGAAACGCATTAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGTA
+ACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGC
+xxxxxxxxCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTAC
+ATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAG
+GCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGC
+GATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACG
+TATTTTTGCCGAACTTCTGACGGGACTCGCCGCTGCCCAACCGGGATTCCCGCTGGCGCA
+ACTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAG
+TTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAAT
+GTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACCGTTATCGA
+TCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGA
+GTCCACCCGCCGTATTGCGGCAAGTCGTATTCCGGCTGATCACATGGTGCTGATGGCAGG
+TTTCACCGCC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,2 @@
+ACCNUM	NC_011750.1	NC_011750
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,1 @@
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,1 @@
+ACCNUM	NC_011750.1	NC_011750
Binary file test-data/test_db/opts.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/seqid2taxid.map	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,1 @@
+NC_011750.1	585057
Binary file test-data/test_db/taxo.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/names.dmp	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,5 @@
+220341	|	Salmonella enterica subsp. enterica serovar Typhi CT18	|		|	equivalent name	|
+220341	|	Salmonella enterica subsp. enterica serovar Typhi str. CT18	|		|	scientific name	|
+220341	|	Salmonella enterica subsp. enterica serovar Typhi strain CT18	|		|	equivalent name	|
+220341	|	Salmonella typhi CT18	|		|	equivalent name	|
+585057	|	Escherichia coli IAI39	|		|	scientific name	|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/nodes.dmp	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,15 @@
+1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
+2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
+543	|	91347	|	family	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+561	|	543	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+562	|	561	|	species	|	EC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+590	|	543	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1224	|	2	|	phylum	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1236	|	1224	|	class	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+28901	|	590	|	species	|	SE	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+59201	|	28901	|	subspecies	|	SE	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+90370	|	59201	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+91347	|	1236	|	order	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+131567	|	1	|	no rank	|		|	8	|	1	|	1	|	1	|	0	|	1	|	1	|	0	|		|
+220341	|	90370	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+585057	|	562	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/nucl_gb.accession2taxid	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,2 @@
+NC_003198	NC_003198.1	220341	16758993
+NC_011750	NC_011750.1	585057	218698419
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/prelim_map.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,2 @@
+ACCNUM	NC_011750.1	NC_011750
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/unmapped.txt	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,1 @@
+NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of bracken databases in the required format -->
+    <table name="bracken_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/bracken_databases.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Sun Oct 20 09:37:21 2019 -0400
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of Kraken2 databases in the required format -->
+    <table name="kraken2_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/kraken2_databases.loc" />
+    </table>
+</tables>