Mercurial > repos > iuc > transdecoder

<tool id="transdecoder" name="TransDecoder" version="3.0.1">
    <description>Find coding regions within transcripts</description>
    <requirements>
        <requirement type="package" version="3.0.1">transdecoder</requirement>
    </requirements>

    <command detect_errors="exit_code"><![CDATA[
        TransDecoder.LongOrfs -t '${input}'
        -m ${min_len}
        ${adv.stranded}
        -G ${adv.gen_code}
        #if str($adv.partials)
            -p ${adv.partials}
        #end if
        &&
        TransDecoder.Predict --cpu \${GALAXY_SLOTS:-1} -t '${input}'
        --retain_long_orfs ${adv.retain_long_orfs}
        ${adv.single_best_orf}
        #if str( $training_sect.training.training_selector ) == "training_top":
            -T ${training_sect.training.top_longest}
        #else
            --train '${training_sect.training.train}'
        #end if
        &&
        mv `basename '${input}'`.transdecoder.pep '$transdecoder_pep' &&
        mv `basename '${input}'`.transdecoder.cds '$transdecoder_cds' &&
        mv `basename '${input}'`.transdecoder.bed '$transdecoder_bed' &&
        mv `basename '${input}'`.transdecoder.gff3 '$transdecoder_gff3'
    ]]></command>
    <inputs>
        <param name="input" argument="-t" type="data" format="fasta" label="Transcripts" />
        <param name="min_len" argument="-m" type="integer" value="100" label="Minimum protein length" />
        <section name="adv" title="Advanced Options" expanded="False">
            <param name="stranded" argument="-S" type="boolean" truevalue="-S" falsevalue="" label="Strand-specific" help="Only analyzes top strand" />
            <param name="gen_code" argument="-G" type="select" label="Genetic code">
                <option value="universal" selected="True">universal</option>
                <option value="Euplotes">Euplotes</option>
                <option value="Tetrahymena">Tetrahymena</option>
                <option value="Candida">Candida</option>
                <option value="Acetabularia">Acetabularia</option>
                <option value="Mitochondrial-Canonical">Mitochondrial-Canonical</option>
                <option value="Mitochondrial-Vertebrates">Mitochondrial-Vertebrates</option>
                <option value="Mitochondrial-Arthropods">Mitochondrial-Arthropods</option>
                <option value="Mitochondrial-Echinoderms">Mitochondrial-Echinoderms</option>
                <option value="Mitochondrial-Molluscs">Mitochondrial-Molluscs</option>
                <option value="Mitochondrial-Ascidians">Mitochondrial-Ascidians</option>
                <option value="Mitochondrial-Nematodes">Mitochondrial-Nematodes</option>
                <option value="Mitochondrial-Platyhelminths">Mitochondrial-Platyhelminths</option>
                <option value="Mitochondrial-Yeasts">Mitochondrial-Yeasts</option>
                <option value="Mitochondrial-Euascomycetes">Mitochondrial-Euascomycetes</option>
                <option value="Mitochondrial-Protozoans">Mitochondrial-Protozoans</option>
            </param>
            <param name="partials" argument="-p" type="integer" value="" optional="true" label="Shorten potential 5' partials if they are this percentage of the original protein or longer" />
            <param name="retain_long_orfs" argument="--retain_long_orfs" type="integer" value="900" label="Retain long ORFs" help="Retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence marks it as coding (default: 900 bp => 300aa)" />
            <param argument="--single_best_orf" type="boolean" truevalue="--single_best_orf" falsevalue="" label="Retain only the single best ORF per transcript" help="Best is defined as having (optionally Pfam and/or BLAST support) and longest ORF" />
        </section>
        <section name="training_sect" title="Training Options" expanded="False">
            <conditional name="training">
                <param name="training_selector" type="select" label="Select the training method">
                    <option value="training_top" selected="True">Train with the top longest ORFs</option>
                    <option value="training_set">Train with a set of known ORFs</option>
                </param>
                <when value="training_top">
                    <param name="top_longest" argument="-T" type="integer" value="500" label="Number of top longest ORFs" help="Number of top longest ORFs to train Markov Model (hexamer stats). Note, 10x this value are first selected for use with cd-hit to remove redundancies, and then this value of longest ORFs are selected from the non-redundant set" />
                </when>
                <when value="training_set">
                    <param name="train" argument="--train" type="data" format="fasta" label="Training set of transcripts" help="FASTA file with ORFs to train Markov Mod for protein identification" />
                </when>
            </conditional>
        </section>
    </inputs>
    <outputs>
        <data name="transdecoder_pep" format="fasta" label="${tool.name} on ${on_string}: pep" />
        <data name="transdecoder_cds" format="fasta" label="${tool.name} on ${on_string}: cds" />
        <data name="transdecoder_bed" format="bed" label="${tool.name} on ${on_string}: bed" />
        <data name="transdecoder_gff3" format="gff3" label="${tool.name} on ${on_string}: gff3" />
    </outputs>
    <tests>
        <test>
            <param name="input" value="test.fa"/>
            <output name="transdecoder_gff3" file="raw/test.fa.transdecoder.gff3" compare="sim_size" />
            <output name="transdecoder_bed" file="raw/test.fa.transdecoder.bed" compare="sim_size" />
            <output name="transdecoder_cds" file="raw/test.fa.transdecoder.cds" compare="sim_size" />
            <output name="transdecoder_pep" file="raw/test.fa.transdecoder.pep" compare="sim_size" />
        </test>
        <test>
            <param name="input" value="test.fa"/>
            <param name="training_selector" value="training_top"/>
            <param name="top_longest" value="10"/>
            <output name="transdecoder_gff3" file="top/test.fa.transdecoder.gff3" compare="sim_size" />
            <output name="transdecoder_bed" file="top/test.fa.transdecoder.bed" compare="sim_size" />
            <output name="transdecoder_cds" file="top/test.fa.transdecoder.cds" compare="sim_size" />
            <output name="transdecoder_pep" file="top/test.fa.transdecoder.pep" compare="sim_size" />
        </test>
        <test>
            <param name="input" value="test.fa"/>
            <param name="gen_code" value="Mitochondrial-Arthropods"/>
            <output name="transdecoder_gff3" file="gencode/test.fa.transdecoder.gff3" compare="sim_size" />
            <output name="transdecoder_bed" file="gencode/test.fa.transdecoder.bed" compare="sim_size" />
            <output name="transdecoder_cds" file="gencode/test.fa.transdecoder.cds" compare="sim_size" />
            <output name="transdecoder_pep" file="gencode/test.fa.transdecoder.pep" compare="sim_size" />
        </test>
        <test>
            <param name="input" value="test.fa"/>
            <param name="stranded" value="true"/>
            <output name="transdecoder_gff3" file="strand/test.fa.transdecoder.gff3" compare="sim_size" />
            <output name="transdecoder_bed" file="strand/test.fa.transdecoder.bed" compare="sim_size" />
            <output name="transdecoder_cds" file="strand/test.fa.transdecoder.cds" compare="sim_size" />
            <output name="transdecoder_pep" file="strand/test.fa.transdecoder.pep" compare="sim_size" />
        </test>
    </tests>
    <help>
**What it does**

TransDecoder identifies candidate coding regions within transcript sequences, such as those generated by de novo RNA-Seq transcript assembly using Trinity, or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks.

TransDecoder identifies likely coding sequences based on the following criteria:

 - a minimum length open reading frame (ORF) is found in a transcript sequence

 - a log-likelihood score similar to what is computed by the GeneID software is > 0.

 - the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.

 - if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).

 - optional the putative peptide has a match to a Pfam domain above the noise cutoff score.

The software is primarily maintained by Brian Haas at the Broad Institute and Alexie Papanicolaou at the Commonwealth Scientific and Industrial Research Organisation (CSIRO). It is integrated into other related software such as Trinity, PASA, EVidenceModeler, and Trinotate.
    </help>
    <citations>
        <citation type="doi">10.1038/nprot.2013.084</citation>
    </citations>
</tool>
author	iuc
date	Thu, 01 Jun 2017 06:04:12 -0400
parents	b408e5a8b137
children	c6334cb383ff