<!-- Mercurial > repos > iuc > trinity_analyze_diff_expr -->

<tool id="trinity_analyze_diff_expr" name="Extract and cluster differentially expressed transcripts" version="@WRAPPER_VERSION@.2">
    <description>from a Trinity assembly</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements">
        <requirement type="package" version="2.6.0">bioconductor-qvalue</requirement>
        <requirement type="package" version="1.26.0">bioconductor-goseq</requirement>
        <requirement type="package" version="2.0.6">r-cluster</requirement>
    </expand>
    <command detect_errors="aggressive"><![CDATA[
    ## Stage the inputs in the working directory, then run analyze_diff_expr.pl.
    ## DE results input files must be in the working directory and have suffix .DE_results
    #import re
    #for $input in $DE_results
        ## Replace every character that is not a word character, '-', '_' or '.' so the link name is safe to use
        #set $link_name = re.sub('[^\w\-_.]', '_', str( $input.element_identifier ))
        ## Literal suffix test: an unescaped '.' in a regex would also accept names such as "x_DE_results",
        ## which would then be linked without the required ".DE_results" extension
        #if not str( $input.element_identifier ).endswith('.DE_results')
            ## Particular case, where DE results files have non-standard names: append the expected suffix
            #set $link_name = $link_name + '.DE_results'
        #end if
        ## General case (files previously generated by run_de_analysis.pl) keeps the sanitized name unchanged
        ln -s '${input}' '${link_name}'
        &&
    #end for
    #if str( $additional_params.GO_enrichment.examine_GO_enrichment ) == "yes":
        ## DE matrix input files must be in the working directory and have the same name as DE results input files, but replacing suffix .DE_results by suffix .count_matrix
        #for $DE_matrix in $additional_params.GO_enrichment.DE_matrices
            ## Handle general case, where DE results files and DE matrix files have been previously generated by run_de_analysis.pl
            ## (no suffix is added here: the element identifier is only sanitized)
            ln -s '${DE_matrix}' '${re.sub('[^\w\-_.]', '_', str( $DE_matrix.element_identifier ))}'
            &&
        #end for
    #end if

    analyze_diff_expr.pl
        --matrix '${matrix}'
        --samples '${samples}'
        -P ${p}
        -C ${c}

        ## Optional integer: only passed when the user filled it in
        #if str( $additional_params.max_DE_genes_per_comparison ):
            --max_DE_genes_per_comparison ${additional_params.max_DE_genes_per_comparison}
        #end if

        ## Boolean parameter: expands to the flag itself or to an empty string
        $additional_params.order_columns_by_samples_file

        #if $additional_params.max_genes_clust:
            --max_genes_clust ${additional_params.max_genes_clust}
        #end if

        #if str( $additional_params.GO_enrichment.examine_GO_enrichment ) == "yes":
            --examine_GO_enrichment
            --GO_annots '${additional_params.GO_enrichment.GO_annots}'
            --gene_lengths '${additional_params.GO_enrichment.gene_lengths}'
        #end if

        ## All result files are written with the prefix "results"
        --output results
    ]]></command>
    <inputs>
        <!-- Required inputs: expression matrix, sample sheet, a list collection of DE results and the two cutoffs passed as -P and -C -->
        <param format="tabular" name="matrix" argument="--matrix" type="data" label="Expression matrix" help="Raw counts matrix produced by 'Build expression matrix for a de novo assembly of RNA-Seq data by Trinity' tool"/>
        <param format="tabular" name="samples" argument="--samples" type="data" label="Sample description" help="File describing samples and replicates"/>
        <param format="tabular" name="DE_results" type="data_collection" collection_type="list" label="Differential expression results" help="Generated by 'Differential expression analysis using a Trinity assembly' tool"/>
        <param name="p" type="float" argument="-P" value="0.001" label="p-value cutoff for FDR"/>
        <param name="c" type="float" argument="-C" value="2" label="min abs(log2(a/b)) fold change" help="Default: 2 (meaning 2^(2) or 4-fold)"/>
        <!-- Optional tuning parameters, collapsed by default in the tool form -->
        <section name="additional_params" title="Additional Options" expanded="False">
            <param name="max_DE_genes_per_comparison" argument="--max_DE_genes_per_comparison" type="integer" value="" optional="true" label="Maximum differential expression genes per comparison" help="Extract only up to the top number of DE features within each pairwise comparison. This is useful when you have massive numbers of DE features but still want to make useful heatmaps and other plots with more manageable numbers of data points."/>
            <param name="order_columns_by_samples_file" argument="--order_columns_by_samples_file" type="boolean" checked="false" truevalue="--order_columns_by_samples_file" falsevalue="" label="Order columns by samples file" help="Instead of clustering samples or replicates hierarchically based on gene expression patterns, order columns according to order in the --samples file."/>
            <param name="max_genes_clust" argument="--max_genes_clust" type="integer" value="10000" label="Maximum genes in cluster" help="If more than 10000, heatmaps are not generated, since too time consuming"/>
            <!-- GO enrichment needs three extra inputs, which are only requested when the user selects "Yes" -->
            <conditional name="GO_enrichment">
                <param type="select" name="examine_GO_enrichment" argument="--examine_GO_enrichment" label="Run GO enrichment analysis" help="To examine GO enrichment, you must first run Trinotate and then extract all GO assignments for each gene feature, with the Trinotate script extract_GO_assignments_from_Trinotate_xls.pl">
                    <option value="no">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="no">
                </when>
                <when value="yes">
                    <param format="tabular" name="DE_matrices" type="data_collection" collection_type="list" label="Differential expression count matrices" help="Generated by 'Differential expression analysis using a Trinity assembly' tool. If not, be careful that the file names are identical to the file names of differential expression results, with extension '.count_matrix' instead of '.DE_results'."/>
                    <param format="tabular" name="GO_annots" argument="--GO_annots" type="data" label="Extracted GO assignments file" help="Generated by the Trinotate script extract_GO_assignments_from_Trinotate_xls.pl. Must have 2 columns: feature_id GO:000001,GO:00002,..."/>
                    <param format="tabular" name="gene_lengths" argument="--gene_lengths" type="data" label="Gene length file" help="Must have 2 columns: feature_id length"/>
                </when>
            </conditional>
        </section>
    </inputs>
    <outputs>
        <!-- One element per *.subset file found in the working directory after the run -->
        <collection name="extracted_DE_genes" type="list" label="${tool.name} on ${on_string}: extracted differentially expressed genes">
            <discover_datasets pattern="(?P&lt;name&gt;.+\.subset)$" ext="tabular" />
        </collection>
        <!-- Fixed set of matrix and plot files, all named after the "results" output prefix given on the command line -->
        <collection name="summary_files" type="list" label="${tool.name} on ${on_string}: summary files">
            <data format="tabular" name="results_matrix" from_work_dir="results.matrix"/>
            <data format="tabular" name="results_matrix_log2_centered" from_work_dir="results.matrix.log2.centered.dat"/>
            <data format="pdf" name="results_matrix_log2_centered_heatmap" from_work_dir="results.matrix.log2.centered.genes_vs_samples_heatmap.pdf"/>
            <data format="tabular" name="results_matrix_log2" from_work_dir="results.matrix.log2.dat"/>
            <data format="tabular" name="results_matrix_log2_sample_cor" from_work_dir="results.matrix.log2.sample_cor.dat"/>
            <data format="pdf" name="results_matrix_log2_sample_cor_matrix" from_work_dir="results.matrix.log2.sample_cor_matrix.pdf"/>
        </collection>
        <!-- Galaxy datatype extensions are lowercase: the R workspace datatype is registered as "rdata" -->
        <data format="rdata" name="rdata" label="${tool.name} on ${on_string}: RData file" from_work_dir="results.matrix.RData"/>
        <!-- Only created when GO enrichment analysis was requested -->
        <collection name="GOseq_enrichment" type="list" label="${tool.name} on ${on_string}: GOseq enriched and depleted categories">
            <filter>additional_params['GO_enrichment']['examine_GO_enrichment'] == 'yes'</filter>
            <discover_datasets pattern="(?P&lt;name&gt;.+\.subset\.GOseq\.(enriched|depleted))$" ext="tabular" />
        </collection>
    </outputs>
    <tests>
        <!-- Case 1: GO enrichment disabled, DE results named as produced by run_de_analysis.pl (identifiers already end in .DE_results) -->
        <test>
            <param name="matrix" value="count/qcheck/matrix.counts.matrix"/>
            <param name="samples" value="count/samples.txt"/>
            <param name="DE_results">
                <collection type="list">
                    <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                </collection>
            </param>
            <output_collection name="extracted_DE_genes">
                <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.0.wt_37-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.wt_37-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.0.wt_GSNO-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.wt_GSNO-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_37-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_37-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_ph8-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_ph8-UP.subset"/>
                <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_GSNO-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_GSNO-UP.subset"/>
                <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_ph8-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_ph8-UP.subset"/>
            </output_collection>
            <output_collection name="summary_files">
                <element name="results_matrix" compare="sim_size" file="count/analyze_diff_expr/results.matrix"/>
                <element name="results_matrix_log2_centered" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.centered.dat"/>
                <element name="results_matrix_log2_centered_heatmap" delta="100" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.centered.genes_vs_samples_heatmap.pdf"/>
                <element name="results_matrix_log2" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.dat"/>
                <element name="results_matrix_log2_sample_cor" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.sample_cor.dat"/>
                <element name="results_matrix_log2_sample_cor_matrix" delta="100" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.sample_cor_matrix.pdf"/>
            </output_collection>
            <output name="rdata" compare="sim_size" file="count/analyze_diff_expr/results.matrix.RData"/>
        </test>
        <!-- Case 2: GO enrichment disabled, element identifiers lack the .DE_results suffix; the expected outputs are the same as in case 1 -->
        <test>
            <param name="matrix" value="count/qcheck/matrix.counts.matrix"/>
            <param name="samples" value="count/samples.txt"/>
            <param name="DE_results">
                <collection type="list">
                    <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2" value="count/exp_diff/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2" value="count/exp_diff/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2" value="count/exp_diff/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                </collection>
            </param>
            <output_collection name="extracted_DE_genes">
                <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.0.wt_37-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.wt_37-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.0.wt_GSNO-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results.P0.001_C2.wt_GSNO-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_37-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_37-UP.subset"/>
                <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_ph8-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_ph8-UP.subset"/>
                <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_GSNO-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_GSNO-UP.subset"/>
                <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_ph8-UP.subset" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_ph8-UP.subset"/>
            </output_collection>
            <output_collection name="summary_files">
                <element name="results_matrix" compare="sim_size" file="count/analyze_diff_expr/results.matrix"/>
                <element name="results_matrix_log2_centered" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.centered.dat"/>
                <element name="results_matrix_log2_centered_heatmap" delta="100" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.centered.genes_vs_samples_heatmap.pdf"/>
                <element name="results_matrix_log2" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.dat"/>
                <element name="results_matrix_log2_sample_cor" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.sample_cor.dat"/>
                <element name="results_matrix_log2_sample_cor_matrix" delta="100" compare="sim_size" file="count/analyze_diff_expr/results.matrix.log2.sample_cor_matrix.pdf"/>
            </output_collection>
            <output name="rdata" compare="sim_size" file="count/analyze_diff_expr/results.matrix.RData"/>
        </test>
        <!-- Case 3: GO enrichment enabled; checks the generated command line and one GOseq result file -->
        <test>
            <param name="matrix" value="count/qcheck/matrix.counts.matrix"/>
            <param name="samples" value="count/samples.txt"/>
            <param name="DE_results">
                <collection type="list">
                    <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_37_vs_wt_GSNO.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                    <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results" value="count/exp_diff/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.DE_results" ftype="tabular" />
                </collection>
            </param>
            <section name="additional_params">
                <conditional name="GO_enrichment">
                    <param name="examine_GO_enrichment" value="yes"/>
                    <param name="DE_matrices">
                        <collection type="list">
                            <element name="input.matrix.wt_37_vs_wt_GSNO.DESeq2.count_matrix" value="count/exp_diff/input.matrix.wt_37_vs_wt_GSNO.DESeq2.count_matrix" ftype="tabular" />
                            <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.count_matrix" value="count/exp_diff/input.matrix.wt_37_vs_wt_ph8.DESeq2.count_matrix" ftype="tabular" />
                            <element name="input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.count_matrix" value="count/exp_diff/input.matrix.wt_GSNO_vs_wt_ph8.DESeq2.count_matrix" ftype="tabular" />
                        </collection>
                    </param>
                    <param name="GO_annots" value="count/trinotate/go_annotations.txt"/>
                    <param name="gene_lengths" value="count/trinotate/genes.lengths.txt"/>
                </conditional>
            </section>
            <assert_command>
                <has_text text="--examine_GO_enrichment" />
                <has_text text="--GO_annots" />
                <has_text text="--gene_lengths" />
            </assert_command>
            <output_collection name="GOseq_enrichment">
                <element name="input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.0.wt_37-UP.subset.GOseq.enriched" compare="sim_size" file="count/analyze_diff_expr/input.matrix.wt_37_vs_wt_ph8.DESeq2.DE_results.P0.001_C2.wt_37-UP.subset.GOseq.enriched"/>
            </output_collection>
        </test>
    </tests>
    <help>
<![CDATA[
Trinity_ assembles transcript sequences from Illumina RNA-Seq data.
This tool extracts the transcripts that are most differentially expressed (most significant FDR and fold-changes), once differential expression analyses have been run.

**Inputs**

This tool uses the raw counts matrix produced by 'Build expression matrix for a de novo assembly of RNA-Seq data by Trinity' tool.

You must describe your samples and replicates with a tabular file looking like this:

=========== ================
ConditionA  CondA_replicate1
----------- ----------------
ConditionA  CondA_replicate2
----------- ----------------
ConditionB  CondB_replicate1
----------- ----------------
ConditionB  CondB_replicate2
----------- ----------------
ConditionC  CondC_replicate1
----------- ----------------
ConditionC  CondC_replicate2
----------- ----------------
ConditionC  CondC_replicate3
=========== ================

This file can be generated with the 'Describe samples and replicates' tool.
It will probably be the same file as used in the tool 'RNASeq samples quality check for transcript quantification' or in the tool 'Differential expression analysis'.
The names in column 2 must match the names given in the tool 'Build expression matrix for a de novo assembly of RNA-Seq data by Trinity'.

You must also provide as a data collection the files resulting from the differential expression analysis (outputs of tool 'Differential expression analysis').

.. _Trinity: http://trinityrnaseq.github.io
]]>
    </help>
    <expand macro="citation" />
</tool>
<!--
author	iuc
date	Mon, 22 Jan 2018 11:27:05 -0500
parents	63030102d46e
children	d61afd68a493
-->