changeset 0:ebc08e5ce646 draft

Uploaded
author greg
date Tue, 21 Apr 2020 10:08:28 -0400
parents
children bca267738b33
files .shed.yml vsnp_determine_ref_from_data.py vsnp_determine_ref_from_data.xml
diffstat 3 files changed, 544 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,11 @@
+name: vsnp_determine_ref_from_data
+owner: greg
+description: |
+  Contains a tool that sniffs bacterial genome samples to discover the primary species within the sample.
+homepage_url: https://github.com/USDA-VS/vSNP
+long_description: |
+  Contains a tool that sniffs bacterial genome samples to discover the primary species within the sample.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_determine_ref_from_data
+type: unrestricted
+categories:
+  - Sequence Analysis
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.py	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,367 @@
+#!/usr/bin/env python
+
+import argparse
+import gzip
+import multiprocessing
+import os
+import queue
+from collections import OrderedDict
+
+import yaml
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+INPUT_READS_DIR = 'input_reads'
+OUTPUT_DBKEY_DIR = 'output_dbkey'
+OUTPUT_METRICS_DIR = 'output_metrics'
+
+
+def get_base_file_name(file_path):
+    base_file_name = os.path.basename(file_path)
+    if base_file_name.find(".") > 0:
+        # Eliminate the extension.
+        return os.path.splitext(base_file_name)[0]
+    elif base_file_name.find("_") > 0:
+        # The dot extension was likely changed to
+        # the " character.
+        items = base_file_name.split("_")
+        no_ext = "_".join(items[0:-2])
+        if len(no_ext) > 0:
+            return no_ext
+    return base_file_name
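+
+
+# Hypothetical examples (illustrative file names, not part of the tool):
+#   get_base_file_name('/tmp/sample.fastq.gz') -> 'sample.fastq'
+#   get_base_file_name('sample_R1_fastq') -> 'sample'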
+
+
+def get_dbkey(dnaprints_dict, key, s):
+    # dnaprints_dict looks something like this:
+    # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']},
+    #  'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
+    d = dnaprints_dict.get(key, {})
+    for data_table_value, v_list in d.items():
+        if s in v_list:
+            return data_table_value
+    return ""
+
+
+def get_dnaprints_dict(dnaprint_fields):
+    # A dnaprint_fields entry looks something like this:
+    # [['AF2122', '/galaxy/tool-data/vsnp/AF2122/dnaprints/NC_002945v4.yml']]
+    dnaprints_dict = {}
+    for item in dnaprint_fields:
+        # Here item is a 2-element list of data
+        # table components, value and path.
+        value = item[0]
+        path = item[1].strip()
+        with open(path, "rt") as fh:
+            # The format of all dnaprints yaml
+            # files is something like this:
+            # brucella:
+            #  - 0111111111111111
+            print_dict = yaml.load(fh, Loader=yaml.Loader)
+        for print_dict_k, print_dict_v in print_dict.items():
+            dnaprints_v_dict = dnaprints_dict.get(print_dict_k, {})
+            if len(dnaprints_v_dict) > 0:
+                # dnaprints_dict already contains k (e.g., 'brucella'),
+                # and dnaprints_v_dict will be a dictionary that
+                # looks something like this:
+                # {'NC_002945v4': ['11001110', '11011110', '11001100']}
+                value_list = dnaprints_v_dict.get(value, [])
+                value_list = value_list + print_dict_v
+                dnaprints_v_dict[value] = value_list
+            else:
+                # dnaprints_v_dict is an empty dictionary.
+                dnaprints_v_dict[value] = print_dict_v
+            dnaprints_dict[print_dict_k] = dnaprints_v_dict
+    # dnaprints_dict looks something like this:
+    # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']},
+    #  'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
+    return dnaprints_dict
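+
+
+# Hypothetical illustration (not part of the tool): if two yaml files are
+# installed for the same data table value (e.g., 'AF2122') and both contain
+# the 'brucella' key, their print lists are concatenated under
+# dnaprints_dict['brucella']['AF2122'].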
+
+
+def get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum):
+    if brucella_sum > 3:
+        group = "Brucella"
+        dbkey = get_dbkey(dnaprints_dict, "brucella", brucella_string)
+    elif bovis_sum > 3:
+        group = "TB"
+        dbkey = get_dbkey(dnaprints_dict, "bovis", bovis_string)
+    elif para_sum >= 1:
+        group = "paraTB"
+        dbkey = get_dbkey(dnaprints_dict, "para", para_string)
+    else:
+        group = ""
+        dbkey = ""
+    return group, dbkey
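+
+
+# A hypothetical call (values are illustrative only): with bovis_sum == 37
+# and a bovis_string that appears in the 'bovis' dnaprints, this returns
+# ('TB', <matching dbkey>); if no sum clears its threshold, it returns
+# ('', '').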
+
+
+def get_group_and_dbkey_for_collection(task_queue, finished_queue, dnaprints_dict, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum = tup
+        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
+        finished_queue.put((fastq_file, count_list, group, dbkey))
+        task_queue.task_done()
+
+
+def get_oligo_dict():
+    oligo_dict = {}
+    oligo_dict["01_ab1"] = "AATTGTCGGATAGCCTGGCGATAACGACGC"
+    oligo_dict["02_ab3"] = "CACACGCGGGCCGGAACTGCCGCAAATGAC"
+    oligo_dict["03_ab5"] = "GCTGAAGCGGCAGACCGGCAGAACGAATAT"
+    oligo_dict["04_mel"] = "TGTCGCGCGTCAAGCGGCGTGAAATCTCTG"
+    oligo_dict["05_suis1"] = "TGCGTTGCCGTGAAGCTTAATTCGGCTGAT"
+    oligo_dict["06_suis2"] = "GGCAATCATGCGCAGGGCTTTGCATTCGTC"
+    oligo_dict["07_suis3"] = "CAAGGCAGATGCACATAATCCGGCGACCCG"
+    oligo_dict["08_ceti1"] = "GTGAATATAGGGTGAATTGATCTTCAGCCG"
+    oligo_dict["09_ceti2"] = "TTACAAGCAGGCCTATGAGCGCGGCGTGAA"
+    oligo_dict["10_canis4"] = "CTGCTACATAAAGCACCCGGCGACCGAGTT"
+    oligo_dict["11_canis"] = "ATCGTTTTGCGGCATATCGCTGACCACAGC"
+    oligo_dict["12_ovis"] = "CACTCAATCTTCTCTACGGGCGTGGTATCC"
+    oligo_dict["13_ether2"] = "CGAAATCGTGGTGAAGGACGGGACCGAACC"
+    oligo_dict["14_63B1"] = "CCTGTTTAAAAGAATCGTCGGAACCGCTCT"
+    oligo_dict["15_16M0"] = "TCCCGCCGCCATGCCGCCGAAAGTCGCCGT"
+    oligo_dict["16_mel1b"] = "TCTGTCCAAACCCCGTGACCGAACAATAGA"
+    oligo_dict["17_tb157"] = "CTCTTCGTATACCGTTCCGTCGTCACCATGGTCCT"
+    oligo_dict["18_tb7"] = "TCACGCAGCCAACGATATTCGTGTACCGCGACGGT"
+    oligo_dict["19_tbbov"] = "CTGGGCGACCCGGCCGACCTGCACACCGCGCATCA"
+    oligo_dict["20_tb5"] = "CCGTGGTGGCGTATCGGGCCCCTGGATCGCGCCCT"
+    oligo_dict["21_tb2"] = "ATGTCTGCGTAAAGAAGTTCCATGTCCGGGAAGTA"
+    oligo_dict["22_tb3"] = "GAAGACCTTGATGCCGATCTGGGTGTCGATCTTGA"
+    oligo_dict["23_tb4"] = "CGGTGTTGAAGGGTCCCCCGTTCCAGAAGCCGGTG"
+    oligo_dict["24_tb6"] = "ACGGTGATTCGGGTGGTCGACACCGATGGTTCAGA"
+    oligo_dict["25_para"] = "CCTTTCTTGAAGGGTGTTCG"
+    oligo_dict["26_para_sheep"] = "CGTGGTGGCGACGGCGGCGGGCCTGTCTAT"
+    oligo_dict["27_para_cattle"] = "TCTCCTCGGTCGGTGATTCGGGGGCGCGGT"
+    return oligo_dict
+
+
+def get_seq_counts(value, fastq_list, gzipped):
+    count = 0
+    for fastq_file in fastq_list:
+        if gzipped == "true":
+            with gzip.open(fastq_file, 'rt') as fh:
+                for title, seq, qual in FastqGeneralIterator(fh):
+                    count += seq.count(value)
+        else:
+            with open(fastq_file, 'r') as fh:
+                for title, seq, qual in FastqGeneralIterator(fh):
+                    count += seq.count(value)
+    return value, count
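+
+
+# Hypothetical example (illustrative only): counting one oligo in a single
+# uncompressed fastq file:
+#   get_seq_counts('CCTTTCTTGAAGGGTGTTCG', ['sample.fastq'], 'false')
+# returns a tuple like ('CCTTTCTTGAAGGGTGTTCG', 12).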
+
+
+def get_species_counts(fastq_list, gzipped):
+    count_summary = {}
+    oligo_dict = get_oligo_dict()
+    for key, oligo in oligo_dict.items():
+        _, count = get_seq_counts(oligo, fastq_list, gzipped)
+        count_summary[key] = count
+    count_list = list(count_summary.values())
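+    # The oligos are ordered so that the first 16 counts target Brucella,
+    # the next 8 target TB (M. bovis) and the last 3 target paraTB.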
+    brucella_sum = sum(count_list[:16])
+    bovis_sum = sum(count_list[16:24])
+    para_sum = sum(count_list[24:])
+    return count_summary, count_list, brucella_sum, bovis_sum, para_sum
+
+
+def get_species_counts_for_collection(task_queue, finished_queue, gzipped, timeout):
+    while True:
+        try:
+            fastq_file = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts([fastq_file], gzipped)
+        finished_queue.put((fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum))
+        task_queue.task_done()
+
+
+def get_species_strings(count_summary):
+    binary_dictionary = {}
+    for k, v in count_summary.items():
+        if v > 1:
+            binary_dictionary.update({k: 1})
+        else:
+            binary_dictionary.update({k: 0})
+    binary_dictionary = OrderedDict(sorted(binary_dictionary.items()))
+    binary_list = list(binary_dictionary.values())
+    brucella_binary = binary_list[:16]
+    brucella_string = ''.join(str(e) for e in brucella_binary)
+    bovis_binary = binary_list[16:24]
+    bovis_string = ''.join(str(e) for e in bovis_binary)
+    para_binary = binary_list[24:]
+    para_string = ''.join(str(e) for e in para_binary)
+    return brucella_string, bovis_string, para_string
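+
+
+# A hypothetical example (illustrative only): if only the TB oligos
+# '17_tb157' and '19_tbbov' have counts above 1, bovis_string is
+# '10100000' and the Brucella and para strings are all zeros.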
+
+
+def get_species_strings_for_collection(task_queue, finished_queue, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum = tup
+        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
+        finished_queue.put((fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum))
+        task_queue.task_done()
+
+
+def output_dbkey(file_name, dbkey, output_file=None):
+    # Output the dbkey.
+    if output_file is None:
+        # We're producing a dataset collection.
+        output_file = os.path.join(OUTPUT_DBKEY_DIR, "%s.txt" % file_name)
+    with open(output_file, "w") as fh:
+        fh.write("%s" % dbkey)
+
+
+def output_files(fastq_file, count_list, group, dbkey, dbkey_file=None, metrics_file=None):
+    base_file_name = get_base_file_name(fastq_file)
+    if dbkey_file is not None:
+        # We're dealing with a single read or
+        # a set of paired reads.  If the latter,
+        # the following will hopefully produce a
+        # good sample string.
+        if base_file_name.find("_") > 0:
+            base_file_name = base_file_name.split("_")[0]
+    output_dbkey(base_file_name, dbkey, dbkey_file)
+    output_metrics(base_file_name, count_list, group, dbkey, metrics_file)
+
+
+def output_files_for_collection(task_queue, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_list, group, dbkey = tup
+        output_files(fastq_file, count_list, group, dbkey)
+        task_queue.task_done()
+
+
+def output_metrics(file_name, count_list, group, dbkey, output_file=None):
+    # Output the metrics.
+    if output_file is None:
+        # We're producing a dataset collection.
+        output_file = os.path.join(OUTPUT_METRICS_DIR, "%s.txt" % file_name)
+    with open(output_file, "w") as fh:
+        fh.write("Sample: %s\n" % file_name)
+        fh.write("Brucella counts: ")
+        for i in count_list[:16]:
+            fh.write("%d," % i)
+        fh.write("\nTB counts: ")
+        for i in count_list[16:24]:
+            fh.write("%d," % i)
+        fh.write("\nPara counts: ")
+        for i in count_list[24:]:
+            fh.write("%d," % i)
+        fh.write("\nGroup: %s" % group)
+        fh.write("\ndbkey: %s\n" % dbkey)
+
+
+def set_num_cpus(num_files, processes):
+    num_cpus = int(multiprocessing.cpu_count())
+    if num_files < num_cpus and num_files < processes:
+        return num_files
+    if num_cpus < processes:
+        half_cpus = int(num_cpus / 2)
+        if num_files < half_cpus:
+            return num_files
+        return half_cpus
+    return processes
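+
+
+# Hypothetical examples (illustrative only): with 16 CPUs, 4 files and
+# --processes 8, set_num_cpus returns 4; with 4 CPUs, 10 files and
+# --processes 8, it returns half the CPUs, i.e. 2.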
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, help="List of dnaprints data table value and path fields")
+    parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Forward read (required for single or paired input)')
+    parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: reverse read for paired input')
+    parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped')
+    parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', required=False, default=None, help='Output reference file')
+    parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=False, default=None, help='Output metrics file')
+    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
+
+    args = parser.parse_args()
+
+    collection = False
+    fastq_list = []
+    if args.read1 is not None:
+        fastq_list.append(args.read1)
+        if args.read2 is not None:
+            fastq_list.append(args.read2)
+    else:
+        collection = True
+        for file_name in sorted(os.listdir(INPUT_READS_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name))
+            fastq_list.append(file_path)
+
+    # The value of dnaprint_fields is a list of lists, where each list is
+    # the [value, name, path] components of the vsnp_dnaprints data table.
+    # The data_manager_vsnp_dnaprints tool assigns the dbkey column from the
+    # all_fasta data table to the value column in the vsnp_dnaprints data
+    # table to ensure a proper mapping for discovering the dbkey.
+    dnaprints_dict = get_dnaprints_dict(args.dnaprint_fields)
+
+    if collection:
+        # Here fastq_list consists of any number of
+        # reads, so each file will be processed and
+        # dataset collections will be produced as outputs.
+        multiprocessing.set_start_method('spawn')
+        queue1 = multiprocessing.JoinableQueue()
+        queue2 = multiprocessing.JoinableQueue()
+        num_files = len(fastq_list)
+        cpus = set_num_cpus(num_files, args.processes)
+        # Set a timeout for get()s in the queue.
+        timeout = 0.05
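+        # The four tasks below form a pipeline that ping-pongs items
+        # between queue1 and queue2: counts -> strings -> group/dbkey ->
+        # output files.  Each stage's workers exit when their queue get()
+        # times out.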
+
+        for fastq_file in fastq_list:
+            queue1.put(fastq_file)
+
+        # Complete the get_species_counts task.
+        processes = [multiprocessing.Process(target=get_species_counts_for_collection, args=(queue1, queue2, args.gzipped, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue1.join()
+
+        # Complete the get_species_strings task.
+        processes = [multiprocessing.Process(target=get_species_strings_for_collection, args=(queue2, queue1, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue2.join()
+
+        # Complete the get_group_and_dbkey task.
+        processes = [multiprocessing.Process(target=get_group_and_dbkey_for_collection, args=(queue1, queue2, dnaprints_dict, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue1.join()
+
+        # Complete the output_files task.
+        processes = [multiprocessing.Process(target=output_files_for_collection, args=(queue2, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue2.join()
+
+        if queue1.empty() and queue2.empty():
+            queue1.close()
+            queue1.join_thread()
+            queue2.close()
+            queue2.join_thread()
+    else:
+        # Here fastq_list consists of either a single read
+        # or a set of paired reads, producing single outputs.
+        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped)
+        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
+        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
+        output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.xml	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,166 @@
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+    <description>from input data</description>
+    <requirements>
+        <requirement type="package" version="1.76">biopython</requirement>
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+#import os
+#import re
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#set gzipped = 'false'
+#set input_type = $input_type_cond.input_type
+#set input_reads_dir = 'input_reads'
+#set output_dbkey_dir = 'output_dbkey'
+#set output_metrics_dir = 'output_metrics'
+mkdir -p $input_reads_dir &&
+mkdir -p $output_dbkey_dir &&
+mkdir -p $output_metrics_dir &&
+#if str($input_type) == "single":
+    #set read_type_cond = $input_type_cond.read_type_cond
+    #set read1 = $read_type_cond.read1
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+    #if str($read_type_cond.read_type) == "single":
+        ln -s '${read1}' '${read1_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #else:
+        #set read2 = $read_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read1}' '${read1_identifier}' &&
+        ln -s '${read2}' '${read2_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #end if
+#else:
+    #for $i in $input_type_cond.reads_collection:
+        #if $i.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+        #set filename = $i.file_name
+        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+        ln -s '$filename' '$input_reads_dir/$identifier' &&
+    #end for
+#end if
+python '$__tool_directory__/vsnp_determine_ref_from_data.py'
+#if str($input_type) == "single":
+    #if str($read_type_cond.read_type) == "single":
+        --read1 '${read1_identifier}'
+    #else:
+        --read1 '${read1_identifier}'
+        --read2 '${read2_identifier}'
+    #end if
+    --output_dbkey '$output_dbkey'
+    --output_metrics '$output_metrics'
+#end if
+--gzipped $gzipped
+--processes $processes
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
+]]></command>
+    <inputs>
+        <conditional name="input_type_cond">
+            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
+                <option value="single" selected="true">Single files</option>
+                <option value="collection">Collections of files</option>
+            </param>
+            <when value="single">
+                <conditional name="read_type_cond">
+                    <param name="read_type" type="select" label="Choose the read type">
+                        <option value="paired" selected="true">Paired</option>
+                        <option value="single">Single</option>
+                    </param>
+                    <when value="paired">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
+                    </when>
+                    <when value="single">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="collection">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
+            </when>
+        </conditional>
+        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
+    </inputs>
+    <outputs>
+        <data name="output_dbkey" format="txt"  label="${tool.name} (dbkey) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <data name="output_metrics" format="txt"  label="${tool.name} (metrics) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <collection name="output_dbkey_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+        <collection name="output_metrics_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_metrics" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <!-- Need to figure out how to test installed data tables -->
+            <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
+best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
+perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
+the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
+manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
+with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
+**Map with BWA-MEM**) to align the reads to the associated reference.
+
+The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+used to compile the "DNA print" string.
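+
+The "metrics" file looks something like this (the values shown are
+illustrative only)::
+
+ Sample: my_sample
+ Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ TB counts: 9,11,37,0,95,88,12,41,
+ Para counts: 0,0,0,
+ Group: TB
+ dbkey: AF2122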
+
+This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
+and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.
+
+ * Mycobacterium bovis AF2122/97
+ * Brucella abortus bv. 1 str. 9-941
+ * Brucella abortus strain BER
+ * Brucella canis ATCC 23365
+ * Brucella ceti TE10759-12
+ * Brucella melitensis bv. 1 str. 16M
+ * Brucella melitensis bv. 3 str. Ether
+ * Brucella melitensis BwIM_SOM_36b
+ * Brucella melitensis ATCC 23457
+ * Brucella ovis ATCC 25840
+ * Brucella suis 1330
+ * Mycobacterium tuberculosis H37Rv
+ * Mycobacterium avium subsp. paratuberculosis strain Telford
+ * Mycobacterium avium subsp. paratuberculosis K-10
+ * Brucella suis ATCC 23445
+ * Brucella suis bv. 3 str. 686
+
+**Required Options**
+
+ * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
+ * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {Stuber, T.},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/USDA-VS/vSNP},}
+        </citation>
+    </citations>
+</tool>
+