Mercurial > repos > iuc > transdecoder

<tool id="transdecoder" name="TransDecoder" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <!-- Short description of what the tool does. -->
    <description>finds coding regions within transcripts</description>
    <macros>
        <!-- Tokens referenced in the tool version attribute and in the transdecoder requirement version. -->
        <token name="@TOOL_VERSION@">5.5.0</token>
        <token name="@VERSION_SUFFIX@">1</token>
    </macros>
    <requirements>
        <!-- The transdecoder package version follows the tool version token. -->
        <requirement type="package" version="@TOOL_VERSION@">transdecoder</requirement>
        <!-- zip is called in the command section to bundle the PDF plots into seqlogo.zip. -->
        <requirement type="package" version="3.0">zip</requirement>
    </requirements>
    <!-- Keeps the line of the version output that mentions TransDecoder.LongOrfs and prints its second space-separated field. -->
    <version_command><![CDATA[TransDecoder.LongOrfs --version 2>&1 | grep 'TransDecoder.LongOrfs' | cut -f 2 -d ' ']]></version_command>
    <command detect_errors="exit_code"><![CDATA[
## Pipeline: link the input under a fixed name, run TransDecoder.LongOrfs,
## optionally run TransDecoder.Predict, then zip any PDF plots that were produced.

## initialize
ln -s '${t}' 'transcripts.fasta' &&

## When a log is requested, both TransDecoder steps run inside one subshell whose
## combined stdout/stderr is piped through tee, so the log covers LongOrfs as well
## as Predict. pipefail makes the pipeline return the failing step's exit status
## instead of tee's, which detect_errors="exit_code" depends on.
#if 'log' in $oo.out
    set -o pipefail &&
    (
#end if

## run TransDecoder.LongOrfs
TransDecoder.LongOrfs
## Shared options
-t 'transcripts.fasta'
-G '${G}'
## LongOrfs options
#if $lo.gene_trans_map
    --gene_trans_map '${lo.gene_trans_map}'
#end if
-m $lo.m
${lo.S}
## -O is required, otherwise value of -t is used as output folder
-O 'output'

## run TransDecoder.Predict
#if $po.predict_cond.predict_sel == 'yes'
    && TransDecoder.Predict
    ## Shared options
    -t 'transcripts.fasta'
    -G '${G}'
    ## Predict options
    --retain_long_orfs_mode $po.predict_cond.mode_cond.mode_sel
    #if $po.predict_cond.mode_cond.mode_sel == 'strict'
        --retain_long_orfs_length $po.predict_cond.mode_cond.retain_long_orfs_length
    #end if
    #if $po.predict_cond.retain_pfam_hits
        --retain_pfam_hits '$po.predict_cond.retain_pfam_hits'
    #end if
    #if $po.predict_cond.retain_blastp_hits
        --retain_blastp_hits '$po.predict_cond.retain_blastp_hits'
    #end if
    $po.predict_cond.single_best_only
    $po.predict_cond.no_refine_starts
    -T $po.predict_cond.T
    -O 'output'
#end if

## postprocessing
## close the logging subshell and write its combined output to the log dataset
#if 'log' in $oo.out
    ) 2>&1 | tee '$out_log'
#end if
## bundle the PDF plots only if at least one exists
&& if ls 'output/'*'.pdf' >/dev/null 2>&1; then zip 'seqlogo.zip' 'output/'*'.pdf' -q -j; fi
    ]]></command>
    <inputs>
        <!-- Transcript FASTA; the command section symlinks it to 'transcripts.fasta' and passes that name via -t. -->
        <param argument="-t" type="data" format="fasta" label="Select file with transcripts"/>
        <!-- Genetic code; the selected value is passed via -G to LongOrfs and, when run, to Predict. -->
        <param argument="-G" type="select" label="Select genetic code">
            <option value="Acetabularia">Acetabularia</option>
            <option value="Candida">Candida</option>
            <option value="Ciliate">Ciliate</option>
            <option value="Dasycladacean">Dasycladacean</option>
            <option value="Euplotid">Euplotid</option>
            <option value="Hexamita">Hexamita</option>
            <option value="Mesodinium">Mesodinium</option>
            <option value="Mitochondrial-Ascidian">Mitochondrial-Ascidian</option>
            <option value="Mitochondrial-Chlorophycean">Mitochondrial-Chlorophycean</option>
            <option value="Mitochondrial-Echinoderm">Mitochondrial-Echinoderm</option>
            <option value="Mitochondrial-Flatworm">Mitochondrial-Flatworm</option>
            <option value="Mitochondrial-Invertebrates">Mitochondrial-Invertebrates</option>
            <option value="Mitochondrial-Protozoan">Mitochondrial-Protozoan</option>
            <option value="Mitochondrial-Pterobranchia">Mitochondrial-Pterobranchia</option>
            <option value="Mitochondrial-Scenedesmus_obliquus">Mitochondrial-Scenedesmus_obliquus</option>
            <option value="Mitochondrial-Thraustochytrium">Mitochondrial-Thraustochytrium</option>
            <option value="Mitochondrial-Trematode">Mitochondrial-Trematode</option>
            <option value="Mitochondrial-Vertebrates">Mitochondrial-Vertebrates</option>
            <option value="Mitochondrial-Yeast">Mitochondrial-Yeast</option>
            <option value="Pachysolen_tannophilus">Pachysolen_tannophilus</option>
            <option value="Peritrich">Peritrich</option>
            <option value="SR1_Gracilibacteria">SR1_Gracilibacteria</option>
            <option value="Tetrahymena">Tetrahymena</option>
            <!-- Universal is the preselected default. -->
            <option value="Universal" selected="true">Universal</option>
        </param>
        <section name="lo" title="LongOrfs options" expanded="true">
            <!-- Optional mapping file; only added to the LongOrfs call when a dataset is supplied. -->
            <param argument="--gene_trans_map"
                   type="data"
                   format="tabular"
                   optional="true"
                   label="Select gene-to-transcript identifier mapping file"
                   help="gene_id&lt;tab&gt;trans_id&lt;return&gt;"/>
            <!-- Minimum protein length, passed via -m (default 100, at least 1). -->
            <param argument="-m"
                   type="integer"
                   value="100"
                   min="1"
                   label="Set minimum protein length"/>
            <!-- Emits -S when enabled and nothing otherwise. -->
            <param argument="-S"
                   type="boolean"
                   truevalue="-S"
                   falsevalue=""
                   label="Activate strand-specificity?"
                   help="Only analyse top strand?"/>
        </section>
        <section name="po" title="Predict options" expanded="true">
            <!--
                TransDecoder.Predict can be skipped if only longest_orfs.pep (as a result of TransDecoder.LongOrfs) is required, e.g. for homology search via BlastP and Pfam.
            -->
            <conditional name="predict_cond">
                <param name="predict_sel" type="select" label="Should likely coding regions be predicted?" help="(TransDecoder.Predict)">
                    <option value="yes" selected="true">Yes</option>
                    <option value="no">No</option>
                </param>
                <when value="yes">
                    <conditional name="mode_cond">
                        <!-- mode_sel is the wrapper-internal parameter name; the selected value is passed to the retain_long_orfs_mode option of TransDecoder.Predict. -->
                        <param name="mode_sel" argument="--retain_long_orfs_mode" type="select" label="Select mode to retain long ORFs" help="In dynamic mode: set range according to 1% FDR in a random sequence of same GC content.">
                            <option value="dynamic" selected="true">Dynamic</option>
                            <option value="strict">Strict</option>
                        </param>
                        <when value="dynamic"/>
                        <when value="strict">
                            <!-- Only passed on the command line in strict mode. -->
                            <param argument="--retain_long_orfs_length" type="integer" value="1000000" min="0" label="Set long ORFs length" help="Retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence marks it as coding."/>
                        </when>
                    </conditional>
                    <!-- Optional homology evidence; each file is only passed to Predict when supplied. -->
                    <param argument="--retain_blastp_hits" type="data" format="tabular" optional="true" label="Select BlastP result file" help="Any ORF with a blast match will be retained in the final output. (tabular outfmt6 file)"/>
                    <param argument="--retain_pfam_hits" type="data" format="tabular" optional="true" label="Select Pfam result file" help="Domain table output file from running hmmscan to search Pfam. Any ORF with a pfam domain hit will be retained in the final output. (tabular domtblout file)"/>
                    <param argument="--single_best_only" type="boolean" truevalue="--single_best_only" falsevalue="" label="Retain only the single best ORF per transcript?" help="Prioritized by homology, then ORF length."/>
                    <!-- Enabling this switch emits the no_refine_starts flag, i.e. it turns refinement OFF; the label is worded accordingly. -->
                    <param argument="--no_refine_starts" type="boolean" truevalue="--no_refine_starts" falsevalue="" label="Skip start codon refinement?" help="If enabled, the refinement step that identifies potential start codons for 5' partial ORFs using a PWM is not performed."/>
                    <param argument="-T" type="integer" value="500" min="1" label="Set top longest ORFs to train Markov Model" help="The first (10*value) elements are selected for removing redundancies. Then the longest ORFs, as many as this value, are selected from the non-redundant set."/>
                </when>
                <when value="no"/>
            </conditional>
        </section>
        <section name="oo" title="Output options" expanded="true">
            <!-- Multi-select of output datasets; the values are checked by the output filters and, for 'log', by the command section. -->
            <param name="out" type="select" multiple="true" optional="false" label="Select output file(s)" help="Only shown in history if selected here and generated by the specific run.">
                <!-- LongOrfs -->
                <option value="lo_cds">Longest ORFs (CDS)</option>
                <option value="lo_gff3">Longest ORFs (GFF3)</option>
                <option value="lo_pep" selected="true">Longest ORFs (PEP)</option>
                <!-- Predict: the matching outputs are additionally filtered on predict_sel being 'yes' -->
                <option value="bed" selected="true">Results (BED)</option>
                <option value="cds" selected="true">Results (CDS)</option>
                <option value="gff3" selected="true">Results (GFF3)</option>
                <option value="pep" selected="true">Results (PEP)</option>
                <option value="plot">Plots</option>
                <!-- Others -->
                <option value="log">Log</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- TransDecoder.LongOrfs products, picked up from the 'output' folder when selected -->
        <data name="out_lo_cds"
              format="fasta"
              from_work_dir="output/longest_orfs.cds"
              label="${tool.name} on ${on_string}: Longest ORFs (CDS/FASTA)">
            <filter>'lo_cds' in oo['out']</filter>
        </data>
        <data name="out_lo_gff3"
              format="gff3"
              from_work_dir="output/longest_orfs.gff3"
              label="${tool.name} on ${on_string}: Longest ORFs (GFF3)">
            <filter>'lo_gff3' in oo['out']</filter>
        </data>
        <data name="out_lo_pep"
              format="fasta"
              from_work_dir="output/longest_orfs.pep"
              label="${tool.name} on ${on_string}: Longest ORFs (PEP/FASTA)">
            <filter>'lo_pep' in oo['out']</filter>
        </data>
        <!-- TransDecoder.Predict products: require the selection AND predict_sel == 'yes' -->
        <data name="out_bed"
              format="bed"
              from_work_dir="transcripts.fasta.transdecoder.bed"
              label="${tool.name} on ${on_string}: Results (BED)">
            <filter>'bed' in oo['out'] and po['predict_cond']['predict_sel'] == 'yes'</filter>
        </data>
        <data name="out_cds"
              format="fasta"
              from_work_dir="transcripts.fasta.transdecoder.cds"
              label="${tool.name} on ${on_string}: Results (CDS/FASTA)">
            <filter>'cds' in oo['out'] and po['predict_cond']['predict_sel'] == 'yes'</filter>
        </data>
        <data name="out_gff3"
              format="gff3"
              from_work_dir="transcripts.fasta.transdecoder.gff3"
              label="${tool.name} on ${on_string}: Results (GFF3)">
            <filter>'gff3' in oo['out'] and po['predict_cond']['predict_sel'] == 'yes'</filter>
        </data>
        <data name="out_pep"
              format="fasta"
              from_work_dir="transcripts.fasta.transdecoder.pep"
              label="${tool.name} on ${on_string}: Results (PEP/FASTA)">
            <filter>'pep' in oo['out'] and po['predict_cond']['predict_sel'] == 'yes'</filter>
        </data>
        <!-- Archive of PDF plots built by the zip step of the command section -->
        <data name="out_plot"
              format="zip"
              from_work_dir="seqlogo.zip"
              label="${tool.name} on ${on_string}: Plots">
            <filter>'plot' in oo['out'] and po['predict_cond']['predict_sel'] == 'yes'</filter>
        </data>
        <!-- Log dataset; the command section writes it through tee -->
        <data name="out_log"
              format="txt"
              label="${tool.name} on ${on_string}: Log">
            <filter>'log' in oo['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- #1 default -->
        <test expect_num_outputs="5">
            <!-- Only the transcript file is set; all other parameters keep their defaults, so the five outputs preselected in the output options are expected. -->
            <param name="t" value="transcripts.fasta"/>
            <!-- LongOrfs -->
            <output name="out_lo_pep">
                <assert_contents>
                    <has_n_lines n="190"/>
                    <has_line line=">CUFF.20.1.p2 type:3prime_partial len:205 gc:Universal CUFF.20.1:612-1(-)"/>
                </assert_contents>
            </output>
            <!-- Predict -->
            <output name="out_bed">
                <assert_contents>
                    <has_n_lines n="81"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_cds">
                <assert_contents>
                    <has_n_lines n="1578"/>
                    <has_line line=">CUFF.9.1.p1 GENE.CUFF.9.1~~CUFF.9.1.p1  ORF type:complete len:156 (+),score=36.30 CUFF.9.1:173-640(+)"/>
                </assert_contents>
            </output>
            <output name="out_gff3">
                <assert_contents>
                    <has_n_lines n="542"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_pep">
                <assert_contents>
                    <has_n_lines n="608"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2 -->
        <test expect_num_outputs="9">
            <!-- Sets non-default values for the genetic code, the LongOrfs options m and S, and every Predict option (strict mode, both homology files); requests all nine outputs. -->
            <param name="t" value="transcripts.fasta"/>
            <param name="G" value="Acetabularia"/>
            <section name="lo">
                <param name="m" value="101"/>
                <param name="S" value="true"/>
            </section>
            <section name="po">
                <conditional name="predict_cond">
                    <param name="predict_sel" value="yes"/>
                    <conditional name="mode_cond">
                        <param name="mode_sel" value="strict"/>
                        <param name="retain_long_orfs_length" value="1000001"/>
                    </conditional>
                    <param name="retain_blastp_hits" value="blastp.outfmt6"/>
                    <param name="retain_pfam_hits" value="pfam.domtblout"/>
                    <param name="single_best_only" value="true"/>
                    <param name="no_refine_starts" value="true"/>
                    <param name="T" value="501"/>
                </conditional>
            </section>
            <section name="oo">
                <param name="out" value="lo_pep,lo_gff3,lo_cds,bed,cds,gff3,pep,plot,log"/>
            </section>
            <!-- LongOrfs -->
            <output name="out_lo_cds">
                <assert_contents>
                    <has_n_lines n="398"/>
                    <has_text_matching expression="CUFF\.20\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_lo_gff3">
                <assert_contents>
                    <has_n_lines n="1330"/>
                    <has_text_matching expression="CUFF\.20\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_lo_pep">
                <assert_contents>
                    <has_n_lines n="398"/>
                    <has_text_matching expression="CUFF\.20\.1.+"/>
                </assert_contents>
            </output>
            <!-- Predict -->
            <output name="out_bed">
                <assert_contents>
                    <has_n_lines n="62"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_cds">
                <assert_contents>
                    <has_n_lines n="1312"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_gff3">
                <assert_contents>
                    <has_n_lines n="395"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <output name="out_pep">
                <assert_contents>
                    <has_n_lines n="500"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
            <!-- The plots dataset is expected to be empty here. NOTE(review): presumably no PDFs are written when start refinement is switched off, so the zip step is skipped; confirm. -->
            <output name="out_plot">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <!-- Others -->
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="transdecoder is finished.+"/>
                </assert_contents>
            </output>
        </test>
        <!-- #3 -->
        <test expect_num_outputs="9">
            <!-- Runs on a different transcript file together with a gene-to-transcript map; Predict options stay at their defaults and all nine outputs are requested. -->
            <param name="t" value="pasa_assemblies.fasta"/>
            <section name="lo">
                <param name="gene_trans_map" value="pasa_genetransmap.txt"/>
            </section>
            <section name="oo">
                <param name="out" value="lo_pep,lo_gff3,lo_cds,bed,cds,gff3,pep,plot,log"/>
            </section>
            <!-- LongOrfs -->
            <output name="out_lo_cds">
                <assert_contents>
                    <has_n_lines n="534"/>
                    <has_text_matching expression=">asmbl\_236\.p1.+"/>
                </assert_contents>
            </output>
            <output name="out_lo_gff3">
                <assert_contents>
                    <has_n_lines n="1742"/>
                    <has_text_matching expression="asmbl\_236.+"/>
                </assert_contents>
            </output>
            <output name="out_lo_pep">
                <assert_contents>
                    <has_n_lines n="534"/>
                    <has_text_matching expression=">asmbl\_236\.p1.+"/>
                </assert_contents>
            </output>
            <!-- Predict -->
            <output name="out_bed">
                <assert_contents>
                    <has_n_lines n="204"/>
                    <has_text_matching expression="asmbl\_99.+"/>
                </assert_contents>
            </output>
            <output name="out_cds">
                <assert_contents>
                    <has_n_lines n="3560"/>
                    <has_text_matching expression=">asmbl\_99\.p1.+"/>
                </assert_contents>
            </output>
            <output name="out_gff3">
                <assert_contents>
                    <has_n_lines n="1337"/>
                    <has_text_matching expression="asmbl\_99.+"/>
                </assert_contents>
            </output>
            <output name="out_pep">
                <assert_contents>
                    <has_n_lines n="1391"/>
                    <has_text_matching expression=">asmbl\_99\.p1.+"/>
                </assert_contents>
            </output>
            <!-- The plot archive is non-empty in this run; its size is checked with a tolerance of 1000 bytes. -->
            <output name="out_plot">
                <assert_contents>
                    <has_size value="714868" delta="1000"/>
                </assert_contents>
            </output>
            <!-- Others -->
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="transdecoder is finished.+"/>
                </assert_contents>
            </output>
        </test>
        <!-- #4 -->
        <test expect_num_outputs="1">
            <!-- Predict is switched off, so every Predict output is removed by its filter and the only remaining default output is the LongOrfs peptide file. -->
            <param name="t" value="transcripts.fasta"/>
            <section name="po">
                <conditional name="predict_cond">
                    <param name="predict_sel" value="no"/>
                </conditional>
            </section>
            <!-- LongOrfs: assert on out_lo_pep, the one output produced by this run (out_pep is filtered out when predict_sel is 'no') -->
            <output name="out_lo_pep">
                <assert_contents>
                    <has_n_lines n="190"/>
                    <has_text_matching expression="CUFF\.9\.1.+"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

TransDecoder identifies candidate coding regions within transcript sequences such as those generated by de novo RNA-Seq transcript assembly using Trinity or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks.

TransDecoder identifies likely coding sequences based on the following criteria:

 - a minimum length open reading frame (ORF) is found in a transcript sequence.
 - a log-likelihood score similar to what is computed by the GeneID software is > 0.
 - the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.
 - if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).
 - a PSSM is built/trained/used to refine the start codon prediction.
 - optionally, the putative peptide has a match to a Pfam domain above the noise cutoff score.

*Step 1*: Extract long open reading frames

By default, TransDecoder.LongOrfs will identify ORFs that are at least 100 amino acids long. You can lower this via the '-m' parameter, but know that the rate of false positive ORF predictions increases drastically with shorter minimum length criteria.

*Step 2*: (optional and not part of this wrapper)

The result "longest ORFs (PEP)" can be used to identify ORFs with homology to known proteins via BlastP or Pfam searches (`details <https://github.com/TransDecoder/TransDecoder/wiki#including-homology-searches-as-orf-retention-criteria>`_).

*Step 3*: Predict the likely coding regions

Optionally apply results of homology searches in this step and re-run the whole analysis.

**Input**

- FASTA file with transcripts
- (optional) gene-to-transcript identifier mapping file
- (optional) BlastP (tabular outfmt6) and/or Pfam (tabular domtblout) search result files (`details <https://github.com/TransDecoder/TransDecoder/wiki#including-homology-searches-as-orf-retention-criteria>`_)

**Output**

*LongOrfs*

- longest ORFs (PEP/FASTA): all ORFs meeting the minimum length criteria, regardless of coding potential
- longest ORFs (GFF3): positions of all ORFs as found in the target transcripts
- longest ORFs (CDS/FASTA): the nucleotide coding sequence for all detected ORFs

*Predict*

- Results (PEP/FASTA): peptide sequences for the final candidate ORFs; all shorter candidates within longer ORFs were removed
- Results (CDS/FASTA): nucleotide sequences for coding regions of the final candidate ORFs
- Results (GFF3): positions within the target transcripts of the final selected ORFs
- Results (BED): BED-formatted file describing ORF positions, best for viewing using GenomeView or IGV
- Plots: sequence logos and scores (compressed PDF)

*Other*

- Log file

.. class:: infomark

**References**

More information is available on `GitHub <https://github.com/TransDecoder/TransDecoder>`_.
    ]]></help>
    <!-- Publication to cite for this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1038/nprot.2013.084</citation>
    </citations>
</tool>
author	iuc
date	Mon, 01 Feb 2021 20:51:40 +0000
parents	c6334cb383ff
children	ffd1300599a1