view vsnp_determine_ref_from_data.xml @ 1:bca267738b33 draft

Uploaded
author greg
date Thu, 19 Nov 2020 21:25:31 +0000
parents ebc08e5ce646
children ee6166a3edd8
line wrap: on
line source

<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
    <description>from input data</description>
    <!-- Package dependencies, with pinned versions, that must be available when the command runs.
         Presumably these are imported by vsnp_determine_ref_from_data.py (TODO confirm against the script). -->
    <requirements>
        <requirement type="package" version="1.76">biopython</requirement>
        <requirement type="package" version="5.3">pyyaml</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Cheetah template that renders the job command line.
## Single-file mode links the reads into the working directory and names them
## on the command line; collection mode links every element into the
## input_reads directory and passes no read or output arguments.
#import os
#import re
#set input_reads_dir = 'input_reads'
#set output_dbkey_dir = 'output_dbkey'
#set output_metrics_dir = 'output_metrics'
#set input_type = str($input_type_cond.input_type)
## Flipped to 'true' below when the selected inputs are fastqsanger.gz.
#set gzipped = 'false'
mkdir -p $input_reads_dir &&
mkdir -p $output_dbkey_dir &&
mkdir -p $output_metrics_dir &&
#if $input_type == "single":
    #set read_type_cond = $input_type_cond.read_type_cond
    #set is_paired = str($read_type_cond.read_type) != "single"
    #set read1 = $read_type_cond.read1
    ## Replace anything that is not whitespace, a word character or a hyphen.
    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
    ln -s '${read1}' '${read1_identifier}' &&
    #if $is_paired:
        #set read2 = $read_type_cond.read2
        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
        ln -s '${read2}' '${read2_identifier}' &&
        ## A pair only counts as gzipped when both mates are gzipped.
        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
            #set gzipped = 'true'
        #end if
    #elif $read1.is_of_type('fastqsanger.gz'):
        #set gzipped = 'true'
    #end if
#else:
    #for $reads_file in $input_type_cond.collection_type_cond.reads_collection:
        ## One gzipped element is enough to mark the whole collection as gzipped.
        #if $reads_file.is_of_type('fastqsanger.gz'):
            #set gzipped = 'true'
        #end if
        #set filename = $reads_file.file_name
        #set identifier = re.sub('[^\s\w\-]', '_', str($reads_file.element_identifier))
        ln -s '$filename' '$input_reads_dir/$identifier' &&
    #end for
#end if
python '$__tool_directory__/vsnp_determine_ref_from_data.py'
#if $input_type == "single":
    --read1 '${read1_identifier}'
    #if $is_paired:
        --read2 '${read2_identifier}'
    #end if
    --output_dbkey '$output_dbkey'
    --output_metrics '$output_metrics'
#end if
--gzipped $gzipped
--processes $processes
#if str($in_test_mode) == "false":
    ## Pass columns 0 and 2 of every row in the vsnp_dnaprints data table.
    #for $dnaprint_row in $__app__.tool_data_tables['vsnp_dnaprints'].get_fields():
        --dnaprint_fields '${dnaprint_row[0]}' '${dnaprint_row[2]}'
    #end for
#else:
    --in_test_mode '$in_test_mode'
#end if
]]></command>
    <inputs>
        <!-- Outer switch: individual history datasets versus a dataset collection. -->
        <conditional name="input_type_cond">
            <param name="input_type"
                   type="select"
                   label="Choose the category of the files to be analyzed">
                <option value="single" selected="true">Single files</option>
                <option value="collection">Collection of files</option>
            </param>
            <when value="single">
                <!-- Inner switch: one read file or a forward/reverse pair. -->
                <conditional name="read_type_cond">
                    <param name="read_type"
                           type="select"
                           label="Choose the read type">
                        <option value="paired" selected="true">Paired</option>
                        <option value="single">Single</option>
                    </param>
                    <when value="paired">
                        <param name="read1"
                               type="data"
                               format="fastqsanger.gz,fastqsanger"
                               label="Read1 fastq file"/>
                        <param name="read2"
                               type="data"
                               format="fastqsanger.gz,fastqsanger"
                               label="Read2 fastq file"/>
                    </when>
                    <when value="single">
                        <param name="read1"
                               type="data"
                               format="fastqsanger.gz,fastqsanger"
                               label="Read1 fastq file"/>
                    </when>
                </conditional>
            </when>
            <when value="collection">
                <!-- Inner switch: a flat list of reads or one paired collection. -->
                <conditional name="collection_type_cond">
                    <param name="collection_type"
                           type="select"
                           label="Collection of single reads or paired reads?">
                        <option value="single_reads" selected="true">Single reads</option>
                        <option value="paired_reads">Paired reads</option>
                    </param>
                    <when value="single_reads">
                        <param name="reads_collection"
                               type="data_collection"
                               format="fastqsanger,fastqsanger.gz"
                               collection_type="list"
                               label="Collection of fastqsanger files"/>
                    </when>
                    <when value="paired_reads">
                        <param name="reads_collection"
                               type="data_collection"
                               format="fastqsanger,fastqsanger.gz"
                               collection_type="paired"
                               label="Collection of fastqsanger paired read files"/>
                    </when>
                </conditional>
            </when>
        </conditional>
        <param name="processes"
               type="integer"
               min="1"
               max="20"
               value="8"
               label="Number of processes for job splitting"/>
        <!-- Hidden switch used by the functional tests; when it is not "false" the command
             template skips the vsnp_dnaprints data table lookup. -->
        <param name="in_test_mode" type="hidden" value="false"/>
    </inputs>
    <outputs>
        <!-- Single-file mode: one dbkey dataset and one metrics dataset. -->
        <data name="output_dbkey"
              format="txt"
              label="${tool.name} (dbkey) on ${on_string}">
            <filter>input_type_cond['input_type'] == 'single'</filter>
        </data>
        <data name="output_metrics"
              format="txt"
              label="${tool.name} (metrics) on ${on_string}">
            <filter>input_type_cond['input_type'] == 'single'</filter>
        </data>
        <!-- Collection mode: list collections discovered from the output_dbkey and
             output_metrics directories that the command template creates. -->
        <collection name="output_dbkey_collection" type="list">
            <filter>input_type_cond['input_type'] == 'collection'</filter>
            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
        </collection>
        <collection name="output_metrics_collection" type="list">
            <filter>input_type_cond['input_type'] == 'collection'</filter>
            <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
        </collection>
    </outputs>
    <tests>
        <!-- Single-file mode with one gzipped read. The input category is set explicitly so
             the test keeps exercising this branch even if the default selection changes. -->
        <test>
            <param name="in_test_mode" value="true"/>
            <param name="input_type" value="single"/>
            <param name="read_type" value="single"/>
            <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
            <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
            <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
        </test>
        <!-- Collection mode with one paired collection. Each output list must hold exactly
             one element per input read file, so the element count is asserted as well. -->
        <test>
            <param name="in_test_mode" value="true"/>
            <param name="input_type" value="collection"/>
            <param name="collection_type" value="paired_reads"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
                    <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
                </collection>
            </param>
            <output_collection name="output_dbkey_collection" type="list" count="2">
                <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/>
                <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
            </output_collection>
            <output_collection name="output_metrics_collection" type="list" count="2">
                <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
                <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
            </output_collection>
        </test>
    </tests>
    <help>
**What it does**

Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
**Map with BWA-MEM**) to align the reads to the associated reference.

The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
used to compile the "DNA print" string.

This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.

 * Mycobacterium bovis AF2122/97
 * Brucella abortus bv. 1 str. 9-941
 * Brucella abortus strain BER
 * Brucella canis ATCC 23365
 * Brucella ceti TE10759-12
 * Brucella melitensis bv. 1 str. 16M
 * Brucella melitensis bv. 3 str. Ether
 * Brucella melitensis BwIM_SOM_36b
 * Brucella melitensis ATCC 23457
 * Brucella ovis ATCC 25840
 * Brucella suis 1330
 * Mycobacterium tuberculosis H37Rv
 * Mycobacterium avium subsp. paratuberculosis strain Telford
 * Mycobacterium avium subsp. paratuberculosis K-10
 * Brucella suis ATCC 23445
 * Brucella suis bv. 3 str. 686

**Required Options**

 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option.
 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
    </help>
    <citations>
        <!-- The manuscript is unpublished, so only fields with real values are listed;
             placeholder values would otherwise be rendered verbatim in the citation. -->
        <citation type="bibtex">
            @misc{vsnp,
            author = {Stuber, T.},
            title = {Manuscript in preparation},
            url = {https://github.com/USDA-VS/vSNP}}
        </citation>
    </citations>
</tool>