Mercurial > repos > greg > vsnp_determine_ref_from_data

diff vsnp_determine_ref_from_data.xml @ 4:36bdf8b439ed draft
Uploaded
author: greg
date: Sun, 03 Jan 2021 16:13:22 +0000
parents: 6116deacb2c7
children: d5e66f9fe086
--- a/vsnp_determine_ref_from_data.xml	Mon Nov 23 21:42:34 2020 +0000
+++ b/vsnp_determine_ref_from_data.xml	Sun Jan 03 16:13:22 2021 +0000
@@ -1,181 +1,132 @@
-<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
     <description>from input data</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
         <requirement type="package" version="1.76">biopython</requirement>
         <requirement type="package" version="5.3">pyyaml</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-#import os
 #import re
 #set gzipped = 'false'
 #set input_type = $input_type_cond.input_type
-#set input_reads_dir = 'input_reads'
-#set output_dbkey_dir = 'output_dbkey'
-#set output_metrics_dir = 'output_metrics'
-mkdir -p $input_reads_dir &&
-mkdir -p $output_dbkey_dir &&
-mkdir -p $output_metrics_dir &&
-#if str($input_type) == "single":
-    #set read_type_cond = $input_type_cond.read_type_cond
-    #set read1 = $read_type_cond.read1
+
+#if $input_type in ["single", "pair"]:
+    #set read1 = $input_type_cond.read1
     #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
-    #if str($read_type_cond.read_type) == "single":
-        ln -s '${read1}' '${read1_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+    ln -s '${read1}' '${read1_identifier}' &&
+    #if $input_type == "pair":
+        #set read2 = $input_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read2}' '${read2_identifier}' &&
     #else:
-        #set read2 = $read_type_cond.read2
-        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
-        ln -s '${read1}' '${read1_identifier}' &&
-        ln -s '${read2}' '${read2_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+        #set read2 = None 
     #end if
 #else:
-    #set collection_type = $input_type_cond.collection_type_cond.collection_type
-    #for $i in $input_type_cond.collection_type_cond.reads_collection:
-        #if $i.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
-        #set filename = $i.file_name
-        #if str($collection_type) == 'single_reads':
-            #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        #else:
-            ## Galaxy builds lists of pairs as nested lists with elements
-            ## named forward and reverse.  When flattened, these lists
-            ## will work as inputs to the Parse parameter value expression
-            ## tool in workflows.  However, the output list created by the
-            ## expression tool will not function correctly with the bwa_mem
-            ## mapper.  Naming the identifier as follows is a solution.
-            #set identifier = re.sub('[^\s\w\-]', '_', str($i.name))
-        #end if
-        ln -s '$filename' '$input_reads_dir/$identifier' &&
-    #end for
+    #set read1 = $input_type_cond.reads_collection['forward']
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
+    ln -s '${read1}' '${read1_identifier}' &&
+    #set read2 = $input_type_cond.reads_collection['reverse']
+    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
+    ln -s '${read2}' '${read2_identifier}' &&
 #end if
+
 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
-#if str($input_type) == "single":
-    #if str($read_type_cond.read_type) == "single":
-        --read1 '${read1_identifier}'
-    #else:
-        --read1 '${read1_identifier}'
-        --read2 '${read2_identifier}'
+    --read1 '${read1_identifier}'
+    #if $read2 is not None
+      --read2 '${read2_identifier}'
     #end if
     --output_dbkey '$output_dbkey'
     --output_metrics '$output_metrics'
+#if $read1.is_of_type('fastqsanger.gz'):
+    --gzipped
 #end if
---gzipped $gzipped
---processes $processes
-#if str($in_test_mode) == "false":
-    #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
-    #for $i in $dnaprint_fields:
-        --dnaprint_fields '${i[0]}' '${i[2]}'
-    #end for
-#else:
-    --in_test_mode '$in_test_mode'
-#end if
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
 ]]></command>
     <inputs>
         <conditional name="input_type_cond">
             <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
                 <option value="single" selected="true">Single files</option>
-                <option value="collection">Collection of files</option>
+                <option value="paired">Paired reads</option> <!-- one dataset collection of type "paired" -->
+                <option value="pair">Paired reads in separate data sets</option> <!-- two individual datasets: read1 and read2 -->
             </param>
             <when value="single">
-                <conditional name="read_type_cond">
-                    <param name="read_type" type="select" label="Choose the read type">
-                        <option value="paired" selected="true">Paired</option>
-                        <option value="single">Single</option>
-                    </param>
-                    <when value="paired">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
-                    </when>
-                    <when value="single">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                    </when>
-                </conditional>
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
             </when>
-            <when value="collection">
-                <conditional name="collection_type_cond">
-                    <param name="collection_type" type="select" label="Collection of single reads or paired reads?">
-                        <option value="single_reads" selected="true">Single reads</option>
-                        <option value="paired_reads">Paired reads</option>
-                    </param>
-                    <when value="single_reads">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
-                    </when>
-                    <when value="paired_reads">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
-                    </when>
-                </conditional>
+            <when value="paired">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
+            </when>
+            <when value="pair">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
             </when>
         </conditional>
-        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
-        <!-- Functional testing -->
-        <param name="in_test_mode" type="hidden" value="false"/>
     </inputs>
     <outputs>
-        <data name="output_dbkey" format="txt"  label="${tool.name} (dbkey) on ${on_string}">
-            <filter>input_type_cond['input_type'] == 'single'</filter>
-        </data>
-        <data name="output_metrics" format="txt"  label="${tool.name} (metrics) on ${on_string}">
-            <filter>input_type_cond['input_type'] == 'single'</filter>
-        </data>
-        <collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
-            <filter>input_type_cond['input_type'] == 'collection'</filter>
-        </collection>
-        <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
-            <filter>input_type_cond['input_type'] == 'collection'</filter>
-        </collection>
+        <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
+        <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
     </outputs>
     <tests>
-        <test>
-            <param name="in_test_mode" value="true"/>
-            <param name="read_type" value="single"/>
+        <!-- 1 single read -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="single"/>
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
             <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
             <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
         </test>
-        <test>
-            <param name="in_test_mode" value="true"/>
-            <param name="input_type" value="collection"/>
-            <param name="collection_type" value="paired_reads"/>
+        <!-- 1 set of paired reads -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="pair"/>
+            <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
+            <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
+        </test>
+        <!-- A collection of paired reads -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="paired"/>
             <param name="reads_collection">
                 <collection type="paired">
-                    <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
-                    <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
+                    <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
+                    <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
                 </collection>
             </param>
-            <output_collection name="output_dbkey_collection" type="list">
-                <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/>
-                <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
-            </output_collection>
-            <output_collection name="output_metrics_collection" type="list">
-                <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
-                <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
-            </output_collection>
+            <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
         </test>
     </tests>
     <help>
 **What it does**
 
-Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
-best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
-perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
-the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
-manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
-with a Galaxy "dbkey" (i.e., genome build), so when a metach is found, the associated "dbkey" is passed to a mapper (e.g.,
-**Map with BWA-MEM**) to align the reads to the associated reference.
+Accepts a single fastqsanger read, a set of paired reads, or a collection of paired reads (bacterial samples) and
+inspects the data to discover the best reference genome for aligning the reads.
+
+The information needed to discover the best reference is maintained by the USDA in this repository_.  References are currently
+limited to TB complex, paraTB, and Brucella, but information for additional references will be added.  The information for each
+reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print".  These strings
+are maintained in YAML files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
+
+.. _repository:  https://github.com/USDA-VS/vSNP_reference_options
+
 
-The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
-used to compile the "DNA print" string.
+This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
+input sample data.  During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
+consisting of zeros and ones based on the counts (i.e., a DNA print).  This string is then compared to the strings contained
+in the in-memory dictionary of DNA prints to find a match.
+
+The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
+the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
+to align the reads to the associated reference.
+
+This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
 
 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
-and discovering the primary species is critical.  DNA print matchig is currently supported for the following genomes.
+and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.
 
  * Mycobacterium bovis AF2122/97
  * Brucella abortus bv. 1 str. 9-941
@@ -197,17 +148,7 @@
 **Required Options**
 
  * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option.
- * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
     </help>
-    <citations>
-        <citation type="bibtex">
-            @misc{None,
-            journal = {None},
-            author = {1. Stuber T},
-            title = {Manuscript in preparation},
-            year = {None},
-            url = {https://github.com/USDA-VS/vSNP},}
-        </citation>
-    </citations>
+    <expand macro="citations"/>
 </tool>
author	greg
date	Sun, 03 Jan 2021 16:13:22 +0000
parents	6116deacb2c7
children	d5e66f9fe086