Mercurial > repos > rnateam > sortmerna

<tool id="bg_sortmerna" name="Filter with SortMeRNA" version="2.0.0">
    <description>Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data</description>
    <!-- Software dependency of this wrapper: the "sortmerna" package, version 2.0. -->
    <requirements>
        <requirement type="package" version="2.0">sortmerna</requirement>
    </requirements>
    <!-- Output patterns that mark the job as failed. Both rules scan stdout and stderr
         (source="both") and raise a fatal error when their text is found. -->
    <stdio>
        <!-- The indexing program printed its own description text instead of building an index. -->
        <regex match="This program builds a Burst trie on an input rRNA database"
            source="both"
            level="fatal"
            description="The database indexing program (buildtrie/indexdb_rna) failed to execute." />
        <!-- The description is shown verbatim to the user, so it must not contain
             template placeholders: they are not substituted here. -->
        <regex match="The database name"
            source="both"
            level="fatal"
            description="A selected database has not been indexed (with buildtrie/indexdb_rna) before running SortMeRNA." />
    </stdio>
    <!-- Command used to report the tool version: runs sortmerna with its version flag,
         merges stderr into stdout and keeps only the line containing 'SortMeRNA version'. -->
    <version_command>
<![CDATA[
sortmerna --version 2>&1|grep 'SortMeRNA version'
]]>
    </version_command>
    <command>
<![CDATA[
    ## Assemble the reference argument shared by indexdb_rna and sortmerna:
    ## colon-separated "FASTA_PATH,INDEX_BASENAME" pairs, one per selected database.
    #set $ref = ''
    #set $sep=''
    #if str( $databases_type.databases_selector ) == 'history':
        ## Databases from the history: the index basename is the dataset file name
        ## without its extension, and indexdb_rna is run on them before sortmerna.
        #for $db in $databases_type.database_name
            #set $ref += $sep + str($db) + ',' + $os.path.splitext($os.path.basename(str($db)))[0]
            #set $sep = ':'
        #end for
        ## Quoted so that paths containing spaces or shell metacharacters stay one argument.
        indexdb_rna --ref '$ref'
        &&
    #else:
        ## databases path is not directly accessible, must match by hand with LOC file contents
        ## (the dictionary maps column 0 of each data table row to column 2 of that row)
        #set $data_table = dict([(_[0], _[2]) for _ in $databases_type.input_databases.input.options.tool_data_table.data])
        #for $db in $databases_type.input_databases.value
            ## The index basename is the table path with its extension removed.
            #set $ref += $sep + $data_table[$db] + ',' + $os.path.splitext($data_table[$db])[0]
            #set $sep = ':'
        #end for
    #end if
    sortmerna --ref '$ref' --reads '$input_reads' --aligned aligned
    #if str( $sequencing_type.sequencing_type_selector ) == 'paired'
        $sequencing_type.paired_type
    #end if
    $strand_search
    $aligned_fastx.aligned_fastx_selector
    ## Selector values are compared through str(), like the other selectors in this template.
    #if str( $aligned_fastx.aligned_fastx_selector ) == '--fastx'
        #if $aligned_fastx.other
            --other other_file
        #end if
    #end if
    $aligned_sam.aligned_sam_selector
    #if str( $aligned_sam.aligned_sam_selector ) == '--sam'
        $aligned_sam.sq
    #end if
    $aligned_blast
    $log
    ## The value of GALAXY_SLOTS (shell default 1 when unset) is passed to the -a option.
    -a \${GALAXY_SLOTS:-1}
]]>
    </command>
    <inputs>
        <!-- Reads to filter, in FASTA or FASTQ format. -->
        <param format="fasta,fastq" name="input_reads" type="data" label="Querying sequences" help="In FASTA or FASTQ format (--reads)"/>
        <!-- Paired-end handling: the extra option is only offered for paired reads. -->
        <conditional name="sequencing_type">
            <param name="sequencing_type_selector" type="select" label="Sequencing type">
                <option value="not_paired">Reads are not paired</option>
                <option value="paired">Reads are paired</option>
            </param>
            <!-- Explicit empty branch so that every selector option has a matching "when",
                 as in the other conditionals of this tool. -->
            <when value="not_paired" />
            <when value="paired">
                <param name="paired_type" type="select" display="radio" label="If one of the paired-end reads aligns and the other one does not">
                    <option value="">leave the reads split between aligned and rejected files</option>
                    <option value="--paired-in">output both reads to aligned file (--paired-in)</option>
                    <option value="--paired-out">output both reads to rejected file (--paired-out)</option>
                </param>
            </when>
        </conditional>

        <!-- The selected value is inserted verbatim into the command line; the empty value adds nothing. -->
        <param name="strand_search" type="select" label="Which strands to search" display="radio">
            <option value="">Search both strands</option>
            <option value="-F">Search only the forward strand (-F)</option>
            <option value="-R">Search only the reverse-complementary strand (-R)</option>
        </param>

        <!-- Reference databases: either entries of the rRNA_databases data table or FASTA datasets from the history. -->
        <conditional name="databases_type">
            <param name="databases_selector" type="select" label="Databases to query"
                help="Public rRNA databases provided with SortMeRNA have been indexed.
                    On the contrary, personal databases must be indexed each time SortMeRNA is launched.
                    Please be patient, this may take some time depending on the size of the given database.">
                <option value="cached" selected="true">Public ribosomal databases</option>
                <option value="history">Databases from your history</option>
            </param>
            <when value="cached">
                <param name="input_databases" label="rRNA databases" type="select" display="checkboxes" multiple="true">
                    <options from_data_table="rRNA_databases" />
                    <validator type="no_options" message="Select at least one database"/>
                </param>
            </when>
            <when value="history">
                <param name="database_name" type="data" format="fasta" multiple="true" label="rRNA databases"
                    help="Your databases will be indexed first, which may take up to several minutes."/>
            </when>
        </conditional>

        <!-- Output options: each selector value below is the command-line flag itself, or empty for "No". -->
        <conditional name="aligned_fastx">
            <param name="aligned_fastx_selector" type="select" label="Include aligned reads in FASTA/FASTQ format">
                <option value="--fastx">Yes (--fastx)</option>
                <option value="">No</option>
            </param>
            <when value="--fastx">
                <param name="other" type="boolean" label="Include rejected reads file" help="(--other)" />
            </when>
            <when value="" />
        </conditional>
        <conditional name="aligned_sam">
            <param name="aligned_sam_selector" type="select" label="Include alignments in SAM format">
                <option value="--sam">Yes (--sam)</option>
                <option value="">No</option>
            </param>
            <when value="--sam">
                <param name="sq" type="boolean" truevalue="--SQ" falsevalue="" label="Add SQ tags to the SAM file" help="(--SQ)" />
            </when>
            <when value="" />
        </conditional>
        <param name="aligned_blast" type="select" label="Include alignments in BLAST-like format">
            <option value="--blast 0">pairwise (--blast 0)</option>
            <option value="--blast 1">tabular BLAST -m 8 format (--blast 1)</option>
            <option value="--blast 2">tabular + column for CIGAR (--blast 2)</option>
            <option value="--blast 3">tabular + columns for CIGAR and query coverage (--blast 3)</option>
            <option value="" selected="true">No</option>
        </param>
        <param name="log" type="boolean" checked="false" truevalue="--log" falsevalue="" label="Generate statistics file"
               help="Generates statistics for the rRNA content of reads, as well as rRNA subunit distribution. (--log)">
        </param>
    </inputs>
    <outputs>
        <!-- Aligned reads, typed like the input reads; kept only when FASTA/FASTQ output is enabled. -->
        <data name="output_fastx"
            format_source="input_reads"
            from_work_dir="aligned.dat"
            label="Aligned reads on ${on_string} (${input_reads.datatype.file_ext})">
            <filter>aligned_fastx['aligned_fastx_selector']</filter>
        </data>
        <!-- Rejected reads, typed like the input reads; kept only when FASTA/FASTQ output
             and the rejected reads file are both enabled. -->
        <data name="output_other"
            format_source="input_reads"
            from_work_dir="other_file.dat"
            label="Rejected reads on ${on_string} (${input_reads.datatype.file_ext})">
            <filter>aligned_fastx['aligned_fastx_selector'] and aligned_fastx['other']</filter>
        </data>
        <!-- SAM alignments; kept only when SAM output is enabled. -->
        <data name="output_sam"
            format="sam"
            from_work_dir="aligned.sam"
            label="Alignments on ${on_string} (SAM)">
            <filter>aligned_sam['aligned_sam_selector']</filter>
        </data>
        <!-- BLAST-like alignments; tabular by default, switched to plain text for the pairwise layout. -->
        <data name="output_blast"
            format="tabular"
            from_work_dir="aligned.blast"
            label="Alignments on ${on_string} (BLAST)">
            <filter>aligned_blast</filter>
            <change_format>
                <when input="aligned_blast" value="--blast 0" format="txt" />
            </change_format>
        </data>
        <!-- Statistics file; kept only when the statistics option is checked. -->
        <data name="output_log"
            format="txt"
            from_work_dir="aligned.log"
            label="${tool.name} statistics (txt)">
            <filter>log</filter>
        </data>
    </outputs>
    <tests>
        <!-- FASTQ reads against a database taken from the history. The FASTA/FASTQ and SAM
             selectors are left at their first option, so both of those outputs are produced. -->
        <test>
            <param name="input_reads" value="read_small.fastq" />
            <param name="sequencing_type_selector" value="not_paired" />
            <param name="strand_search" value="" />
            <param name="databases_selector" value="history" />
            <param name="database_name" value="ref_small.fasta" />
            <!-- Boolean parameters take explicit true/false values rather than an empty string. -->
            <param name="other" value="true" />
            <param name="log" value="false" />
            <output name="output_fastx" file="sortmerna_wrapper_accept1.fastq" />
            <output name="output_other" file="sortmerna_wrapper_other1.fastq" />
            <output name="output_sam" file="sortmerna_wrapper_sam1.sam" lines_diff="2" />
        </test>
        <!-- Same scenario with FASTA reads. -->
        <test>
            <param name="input_reads" value="read_small.fasta" />
            <param name="sequencing_type_selector" value="not_paired" />
            <param name="strand_search" value="" />
            <param name="databases_selector" value="history" />
            <param name="database_name" value="ref_small.fasta" />
            <param name="other" value="true" />
            <param name="log" value="false" />
            <output name="output_fastx" file="sortmerna_wrapper_accept2.fasta" />
            <output name="output_other" file="sortmerna_wrapper_other2.fasta" />
            <output name="output_sam" file="sortmerna_wrapper_sam2.sam" lines_diff="2" />
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

SortMeRNA_ is software designed to rapidly filter ribosomal RNA fragments
from metatranscriptomic data produced by next-generation sequencers.
It is capable of handling large RNA databases and sorting out all fragments
matching to the database with high accuracy and specificity.

.. _SortMeRNA: http://bioinfo.lifl.fr/RNA/sortmerna/


**Input**

The input is one file of reads in FASTA or FASTQ format and any number of rRNA databases to search against.
If the user has two forward-reverse paired-sequencing read files, they may use
the script "merge_paired_reads.sh" to interleave the reads into one file, preserving their order.

If the sequencing type for the reads is paired-end, the user has two options under
"Sequencing type" to filter the reads and preserve their order in the file.
For a further example of each option, please refer to Section 4.2.3 in the `SortMeRNA User Manual`_.

.. _sortmerna user manual: http://bioinfo.lifl.fr/RNA/sortmerna/code/SortMeRNA-user-manual-v1.7.pdf


**Output**

The output will follow the same format (FASTA or FASTQ) as the reads. Optionally, a statistics file for the rRNA content of reads, as well as rRNA subunit distribution, can be generated.


**rRNA databases**

SortMeRNA is distributed with 8 representative rRNA databases, which were
all constructed from the SILVA SSU,LSU (version 111) and the RFAM 5/5.8S
(version 11.0) databases using the tool UCLUST.

+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| Representative database  | id % | average id% | # seq (clustered) | Origin                 |  # seq (original) |
+==========================+======+=============+===================+========================+===================+
| SILVA 16S bacteria       |   85 |        91.6 |              8174 | SILVA SSU Ref NR v.111 |            244077 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| SILVA 16S archaea        |   95 |        96.7 |              3845 | SILVA SSU Ref NR v.111 |             10919 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| SILVA 18S eukarya        |   95 |        96.7 |              4512 | SILVA SSU Ref NR v.111 |             31862 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| SILVA 23S bacteria       |   98 |        99.4 |              3055 | SILVA LSU Ref v.111    |             19580 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| SILVA 23S archaea        |   98 |        99.5 |               164 | SILVA LSU Ref v.111    |               405 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| SILVA 28S eukarya        |   98 |        99.1 |              4578 | SILVA LSU Ref v.111    |              9321 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| Rfam 5S archaea/bacteria |   98 |        99.2 |             59513 | RFAM                   |            116760 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+
| Rfam 5.8S eukarya        |   98 |        98.9 |             13034 | RFAM                   |            225185 |
+--------------------------+------+-------------+-------------------+------------------------+-------------------+

id %: members of the cluster must have at least 'id %' identity with the representative sequence

average id %: average identity of a cluster member to the representative sequence

The user may also choose to use their own rRNA databases.

.. class:: warningmark

Note that your personal databases are indexed each time, and that
this may take some time depending on the size of the given database.
]]>
    </help>

    <!-- Publications to cite when using this tool, each identified by its DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts611</citation>
        <citation type="doi">10.1093/nar/gks1219</citation>
        <citation type="doi">10.1093/nar/gks1005</citation>
        <citation type="doi">10.1093/bioinformatics/btq461</citation>
        <citation type="doi">10.1038/nbt.2198</citation>
    </citations>
</tool>
author	iuc
date	Wed, 05 Aug 2015 02:50:43 -0400
parents	a8ac09e937f3
children	3699b6b771e0