view tetranscript.xml @ 1:bf4ee2810759 draft

"planemo upload for repository https://github.com/mhammell-laboratory/TEtranscripts commit de0508a070ac11901c5c4cb9caffea85f0c889fb"
author iuc
date Thu, 16 Jul 2020 08:04:29 -0400
parents 2dfbcb88d16a
children bbeab8445490
line wrap: on
line source

<?xml version="1.0"?>
<tool id="tetoolkit_tetranscripts" name="TEtranscripts" version="@TOOL_VERSION@+galaxy0">
    <description>annotates reads to genes and transposable elements</description>
    <!-- Version of the wrapped software; referenced by the tool version attribute and the package requirement below. -->
    <macros>
        <token name="@TOOL_VERSION@">2.2.0</token>
    </macros>
    <!-- Package that provides the TEtranscripts executable called in the command section. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">tetranscripts</requirement>
    </requirements>
    <!-- Prints the version of the installed TEtranscripts executable. -->
    <version_command>TEtranscripts --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## Job failure is detected from the exit code of this script.
        ## When the log output is requested, TEtranscripts is piped into tee;
        ## without pipefail the pipeline would return the exit status of tee
        ## and a failed TEtranscripts run would be reported as successful.
        set -o pipefail &&

        ## initialize
        ## file extension is required
        ln -s '$GTF' 'gene_annotation.gtf' &&
        ln -s '$TE' 'transposable_annotation.gtf' &&

        ## run
        TEtranscripts
            ## required: treatment and control BAM files, one of each per repeat instance
            -t
            #for $s in $sample_rep
                '${s.t}'
            #end for
            -c
            #for $s in $sample_rep
                '${s.c}'
            #end for
            --GTF 'gene_annotation.gtf'
            --TE 'transposable_annotation.gtf'
            ## optional
            --stranded '$io.stranded'
            ## expands to the flag itself when checked, to an empty string otherwise
            $io.sortByPos
            ## all result files are written with the prefix "result" (see from_work_dir in outputs)
            --project 'result'
            --mode '$ap.mode'
            --minread $ap.minread
            ## optional integer: only passed when the user supplied a value
            #if $ap.fragmentLength
                --fragmentLength $ap.fragmentLength
            #end if
            --iteration $ap.iteration
            --padj $ap.padj
            --foldchange $ap.foldchange
            #if 'log' in $ap.out
                ## merge stderr into stdout, keep it visible in the job output
                ## and store a copy in log.txt for the out_log dataset
                --verbose 3
                2>&1 | tee log.txt
            #end if
        ]]></command>
    <inputs>
        <!-- Alignment files: every repeat instance holds one treatment and one control BAM file. -->
        <repeat name="sample_rep" min="2" title="Select input data">
            <param argument="-t" type="data" format="bam" label="Treatment sample file"/>
            <param argument="-c" type="data" format="bam" label="Control sample file"/>
        </repeat>

        <!-- Annotation files for genes and for transposable elements. -->
        <param argument="--GTF"
               type="data"
               format="gtf"
               label="Select GTF file for gene annotations"/>
        <param argument="--TE"
               type="data"
               format="gtf"
               label="Select GTF file for transposable element annotations"/>

        <!-- Options describing the input libraries. -->
        <section name="io" title="Input options">
            <param argument="--stranded" type="select" label="Select library type">
                <option value="no">Library is unstranded (no)</option>
                <option value="forward">Second-strand cDNA library e.g. QIAseq stranded (forward)</option>
                <option value="reverse">First-strand cDNA library e.g. Illumina TruSeq stranded (reverse)</option>
            </param>
            <param argument="--sortByPos"
                   type="boolean"
                   truevalue="--sortByPos"
                   falsevalue=""
                   label="Are input files sorted by chromosome position?"/>
        </section>

        <!-- Counting and differential expression settings, plus the selection of output files. -->
        <section name="ap" title="Advanced parameters">
            <param argument="--mode" type="select" label="Set TE counting mode">
                <option value="multi">Distribute among all alignments (multi)</option>
                <option value="uniq">Unique mappers only (uniq)</option>
            </param>
            <param argument="--minread"
                   type="integer"
                   value="1"
                   min="0"
                   label="Set read count cutoff"/>
            <param argument="--fragmentLength"
                   type="integer"
                   min="0"
                   optional="true"
                   label="Set average length of fragment used for single-end sequencing"
                   help="For paired-end, estimated from the input alignment file. For single-end, ignored by default."/>
            <param argument="--iteration"
                   type="integer"
                   value="100"
                   min="0"
                   label="Set maximum number of iterations used to optimize multi-reads assignment"/>
            <param argument="--padj"
                   type="float"
                   value="0.05"
                   min="0.0"
                   max="1.0"
                   label="Set FDR cutoff for significance"/>
            <param argument="--foldchange"
                   type="float"
                   value="1.0"
                   min="0.0"
                   label="Set fold-change ratio (absolute) cutoff for differential expression"/>
            <!-- Each selected value enables the output dataset whose filter tests for it. -->
            <param name="out"
                   type="select"
                   multiple="true"
                   label="Select output file(s)"
                   help="Result files for gene TE analysis and sigDiff gene TE will be created if more than one dataset is applied.">
                <option value="cnttable" selected="true">cntTable</option>
                <option value="deseq2" selected="true">DESeq2.R</option>
                <option value="gta" selected="true">Gene TE Analysis</option>
                <option value="sgt" selected="true">SigDiff Gene TE</option>
                <option value="log">Log</option>
            </param>
        </section>
    </inputs>
    <!-- Every dataset is collected from the working directory and only created when its value is selected in the "out" parameter. -->
    <outputs>
        <data name="out_cnt"
              format="tabular"
              from_work_dir="result.cntTable"
              label="${tool.name} on ${on_string}: cntTable">
            <filter>'cnttable' in ap['out']</filter>
        </data>
        <data name="out_deseq2"
              format="txt"
              from_work_dir="result_DESeq2.R"
              label="${tool.name} on ${on_string}: DESeq2.R">
            <filter>'deseq2' in ap['out']</filter>
        </data>
        <!-- log.txt is written by the tee step of the command section -->
        <data name="out_log"
              format="txt"
              from_work_dir="log.txt"
              label="${tool.name} on ${on_string}: log">
            <filter>'log' in ap['out']</filter>
        </data>
        <data name="out_gta"
              format="txt"
              from_work_dir="result_gene_TE_analysis.txt"
              label="${tool.name} on ${on_string}: Gene TE analysis">
            <filter>'gta' in ap['out']</filter>
        </data>
        <data name="out_sgt"
              format="txt"
              from_work_dir="result_sigdiff_gene_TE.txt"
              label="${tool.name} on ${on_string}: SigDiff Gene TE">
            <filter>'sgt' in ap['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- 
            test data sources:
            https://github.com/mhammell-laboratory/tetoolkit-test-data 
            https://github.com/mhammell-laboratory/TEtranscripts/issues/66
        -->

        <!-- #1: default parameters, two treatment/control pairs, all five outputs requested (including the log) -->
        <test expect_num_outputs="5">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <section name="ap">
                <param name="out" value="cnttable,deseq2,gta,sgt,log"/>
            </section>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <!-- header line depends on the sample names, so only a loose match is used -->
                    <has_text_matching expression="gene.+"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_deseq2">
                <assert_contents>
                    <has_n_lines n="14"/>
                    <has_text_matching expression="data.+"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="INFO"/>
                    <has_text_matching expression=".+Done"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="71"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <!-- header only, no content: the test dataset is too small -->
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2: forward-stranded, position-sorted input, uniq mode and non-default values for all numeric advanced parameters -->
        <test expect_num_outputs="5">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <section name="io">
                <param name="stranded" value="forward"/>
                <param name="sortByPos" value="true"/>
            </section>
            <section name="ap">
                <param name="mode" value="uniq"/>
                <param name="minread" value="2"/>
                <param name="fragmentLength" value="10"/>
                <param name="iteration" value="90"/>
                <param name="padj" value="0.06"/>
                <param name="foldchange" value="2.0"/>
                <param name="out" value="cnttable,deseq2,gta,sgt,log"/>
            </section>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <!-- header line depends on the sample names, so only a loose match is used -->
                    <has_text_matching expression="gene.+"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_deseq2">
                <assert_contents>
                    <has_n_lines n="14"/>
                    <has_text_matching expression="data.+"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="INFO.+"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="DNAREP1.+"/>
                </assert_contents>
            </output>
            <!-- header only, no content: the test dataset is too small -->
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                </assert_contents>
            </output>
        </test>
        <!-- #3: reverse-stranded library, all other parameters left at their defaults; the only test with a non-empty SigDiff result -->
        <test expect_num_outputs="5">
            <repeat name="sample_rep">
                <param name="t" value="treatment1.bam"/>
                <param name="c" value="control1.bam"/>
            </repeat>
            <repeat name="sample_rep">
                <param name="t" value="treatment2.bam"/>
                <param name="c" value="control2.bam"/>
            </repeat>
            <param name="GTF" value="gtf.gtf"/>
            <param name="TE" value="te.gtf"/>
            <section name="io">
                <param name="stranded" value="reverse"/>
            </section>
            <section name="ap">
                <param name="out" value="cnttable,deseq2,gta,sgt,log"/>
            </section>
            <output name="out_cnt">
                <assert_contents>
                    <has_n_lines n="295"/>
                    <!-- header line depends on the sample names, so only a loose match is used -->
                    <has_text_matching expression="gene.+"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="TIRANT.+"/>
                </assert_contents>
            </output>
            <output name="out_deseq2">
                <assert_contents>
                    <has_n_lines n="14"/>
                    <has_text_matching expression="data.+"/>
                </assert_contents>
            </output>
            <output name="out_log">
                <assert_contents>
                    <has_text_matching expression="INFO.+"/>
                </assert_contents>
            </output>
            <output name="out_gta">
                <assert_contents>
                    <has_n_lines n="23"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <!-- row order changes between runs, so match an entry instead of a fixed line -->
                    <has_text_matching expression="TART.+"/>
                </assert_contents>
            </output>
            <!-- header line plus one significant entry -->
            <output name="out_sgt">
                <assert_contents>
                    <has_n_lines n="2"/>
                    <has_line line="baseMean&#009;log2FoldChange&#009;lfcSE&#009;stat&#009;pvalue&#009;padj"/>
                    <has_text_matching expression="Gypsy12.+"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

TEtranscripts is a software package that utilizes both unambiguously (uniquely) and ambiguously (multi-) mapped reads to perform differential enrichment analyses from high throughput sequencing experiments. Currently, most expression analysis software packages are not optimized for handling the complexities involved in quantifying highly repetitive regions of the genome, especially transposable elements (TE), from short sequencing reads. Although transposable elements make up between 20 and 80% of many eukaryotic genomes and contribute significantly to the cellular transcriptome output, the difficulty in quantifying their abundances from high throughput sequencing experiments has led them to be largely ignored in most studies. TEtranscripts provides a noticeable improvement in the recovery of TE transcripts from RNA-Seq experiments and identification of peaks associated with repetitive regions of the genome.

**Input**

GTF files for gene annotation can be obtained from `UCSC RefSeq <http://genome.ucsc.edu/cgi-bin/hgTables>`_, Ensembl, `iGenomes <http://support.illumina.com/sequencing/sequencing_software/igenome.html>`_ or other annotation databases. GTF files for TE annotations are custom-generated from `UCSC RepeatMasker <http://genome.ucsc.edu/cgi-bin/hgTables>`_ or other annotation databases. They contain two custom attributes, class_id and family_id, corresponding to the class (e.g. LINE) and family (e.g. L1) of the corresponding transposable element. A unique ID (e.g. L1Md_Gf_dup1) is also assigned for each TE annotation in the transcript_id attribute.

**Output**

TEtranscripts quantifies both gene and transposable element (TE) transcript abundances from RNA-Seq experiments, utilizing both uniquely and ambiguously mapped short read sequences. It processes the short-read alignments (BAM files) and proportionally assigns read counts to the corresponding gene or TE based on the user-provided annotation files (GTF files). In addition, TEtranscripts combines multiple libraries and performs differential analysis using DESeq2.

.. class:: infomark

**References**

More information is available on the `project website <http://hammelllab.labsites.cshl.edu/software/#TEtranscripts>`_ and `GitHub <https://github.com/mhammell-laboratory/TEtranscripts>`_.
    ]]></help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv422</citation>
        <citation type="doi">10.1007/978-1-4939-7710-9_11</citation>
    </citations>
</tool>