view limma_voom.xml @ 2:a330ddf43861 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 346319bd01bfd51f02d656e653a93f0bd1e954aa
author iuc
date Thu, 07 Sep 2017 05:27:27 -0400
parents 76d01fe0ec36
children 38aab66ae5cb
line wrap: on
line source

<tool id="limma_voom" name="limma-voom" version="1.2.0">
    <description>
        Differential expression with optional sample weights
    </description>

    <!-- Pinned versions of the R packages this wrapper declares as dependencies:
         edgeR and limma (both queried by the version command below), plus statmod and scales. -->
    <requirements>
        <requirement type="package" version="3.16.5">bioconductor-edger</requirement>
        <requirement type="package" version="3.30.13">bioconductor-limma</requirement>
        <requirement type="package" version="1.4.29">r-statmod</requirement>
        <requirement type="package" version="0.4.1">r-scales</requirement>
    </requirements>

    <!-- Prints a single line: the R version line (lines mentioning GNU are dropped),
         then the limma version, then the edgeR version. Each package version is read from
         sessionInfo() after loading the package; stderr from R is discarded and any output
         line containing "WARNING: " (case-insensitive) is filtered out. -->
    <version_command><![CDATA[
        echo $(R --version | grep version | grep -v GNU)", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\$otherPkgs\$limma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
    ]]></version_command>

    <!--
        Runs limma_voom.R from the tool directory with positional arguments, in this order:
           1. counts matrix path
           2. gene annotation path, or the literal None when annotations are not used
           3. HTML report path
           4. HTML report files_path directory
           5. rdaOption (yes/no)
           6. normalisationOption
           7. weightOption (yes/no)
           8. contrast string
           9. minimum CPM          (0 when low-CPM filtering is off)
          10. minimum samples      (0 when low-CPM filtering is off)
          11. p-value adjustment   ("BH" when advanced options are off)
          12. adjusted threshold   (0.05 when advanced options are off)
          13. minimum log2 FC      (0 when advanced options are off)
          14. normCounts (yes/no)
          15. factor information file path, or 'None' when factors are typed in the form
          16. 'None' when a factor file is used, otherwise the primary factor as Name::Levels,
              followed by one Name::Levels argument per secondary factor

        After the script finishes, every .tsv file in the report's files_path is copied into
        ./output_dir, which is where the outTables collection discovers its elements.

        Parameters nested inside conditionals are referenced by their fully qualified names
        (anno.geneanno, fact.finfo) so that they resolve without relying on Galaxy's legacy
        fallback lookup of unqualified nested dataset parameters.
        mkdir uses -p so that a pre-existing output_dir does not fail the job.
    -->
    <command detect_errors="exit_code"><![CDATA[
Rscript '$__tool_directory__/limma_voom.R'
'$counts'

#if $anno.annoOpt=='yes':
  '$anno.geneanno'
#else:
  None
#end if

'$outReport'
'$outReport.files_path'
$rdaOption
$normalisationOption
$weightOption
'$contrast'

#if $filterCPM.filterLowCPM=='yes':
  '$filterCPM.cpmReq'
  '$filterCPM.sampleReq'
#else:
  0
  0
#end if

#if $testOpt.wantOpt=='yes':
  '$testOpt.pAdjust'
  '$testOpt.pVal'
  '$testOpt.lfc'
#else:
  "BH"
  0.05
  0
#end if

$normCounts

#if $fact.ffile=='yes':
    '$fact.finfo'
    'None'
#else:
    'None'
    '$fact.pfactName::$fact.pfactLevel'
    #for $sfact in $fact.sfactors:
        '$sfact.sfactName::$sfact.sfactLevel'
    #end for
#end if

&&
mkdir -p ./output_dir

&&
cp '$outReport.files_path'/*.tsv output_dir/
    ]]></command>

    <inputs>
        <!-- Tabular count matrix; its path is the first argument given to the R script. -->
        <param name="counts" type="data" format="tabular" label="Counts Data"/>

        <!-- Optional gene annotation table. With "no" selected (the first, default option) the
             command passes the literal None in place of an annotation file path. -->
        <conditional name="anno">
            <param name="annoOpt" type="select" label="Use Gene Annotations?"
                help="If an annotation file is provided, annotations will be added to the table of differential expression results to provide descriptions for each gene.">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="yes">
                <param name="geneanno" type="data" format="tabular" label="Gene Annotations"/>
            </when>
            <when value="no" />
        </conditional>

        <!-- Factor information: either a tabular file ("yes") or names and comma-separated levels
             typed into the form ("no"). In the form case the command passes each factor to the
             R script as Name::Levels, primary factor first, then one argument per secondary factor.
             Validators restrict names to word characters and levels to word characters and commas;
             every levels message mentions the comma rule so it matches what the regex accepts. -->
        <conditional name="fact">
            <param name="ffile" type="select" label="Input Factor Information from file?"
                help="You can choose to input the factor information from a file or manually enter below.">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="yes">
                <param name="finfo" type="data" format="tabular" label="Factor Information"/>
            </when>
            <when value="no" >
                <param name="pfactName" type="text" label="Primary Factor Name"
                    help="Eg. Genotype NOTE: Please only use letters, numbers or underscores.">
                    <validator type="empty_field" />
                    <validator type="regex" message="Please only use letters, numbers or underscores">^[\w]+$</validator>
                </param>
                <param name="pfactLevel" type="text" label="Primary Factor Levels"
                    help="Eg. WT,WT,WT,Mut,Mut,Mut NOTE: Please only use letters, numbers or underscores and ensure that the same levels are typed identically with cases matching.">
                    <validator type="empty_field" />
                    <validator type="regex" message="Please only use letters, numbers or underscores, and separate levels by commas">^[\w,]+$</validator>
                </param>
                <repeat name="sfactors" title="Secondary Factor" >
                    <param name="sfactName" type="text" label="Secondary Factor Name" help="Eg. Batch">
                        <validator type="empty_field" />
                        <validator type="regex" message="Please only use letters, numbers or underscores">^[\w]+$</validator>
                    </param>
                    <param name="sfactLevel" type="text" label="Secondary Factor Levels"
                        help="Eg. b1,b2,b3,b1,b2,b3 NOTE: Please only use letters, numbers or underscores and ensure that the same levels are typed identically with cases matching.">
                        <validator type="empty_field" />
                        <validator type="regex" message="Please only use letters, numbers or underscores, and separate levels by commas">^[\w,]+$</validator>
                    </param>
                </repeat>
            </when>
        </conditional>

        <!-- Comma-separated list of contrasts such as Mut-WT. The regex accepts word characters,
             commas and hyphens, and the validation message lists exactly those. -->
        <param name="contrast" type="text" label="Contrasts of interest" help="Eg. Mut-WT,KD-Control">
            <validator type="empty_field" />
            <validator type="regex" message="Please only use letters, numbers, underscores or hyphens, and separate contrasts by commas">^[\w,-]+$</validator>
        </param>

        <!-- Low-expression filter, enabled by default. When "no" is selected the command passes
             0 for both the minimum CPM and the minimum sample count. -->
        <conditional name="filterCPM">
            <param name="filterLowCPM" type="select" label="Filter Low CPM?"
                help="Treat genes with very low expression as unexpressed and filter out to speed up computation.">
                <option value="yes" selected="True">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="yes">
                <param name="cpmReq" type="float" value="0.5" min="0" label="Minimum CPM"/>
                <param name="sampleReq" type="integer" value="1" min="0" label="Minimum Samples"
                    help="Filter out all the genes that do not meet the minimum CPM in at least this many samples."/>
            </when>
            <when value="no"/>
        </conditional>

        <!-- Sample-weight switch; reaches the R script as the string "yes" or "no". -->
        <param name="weightOption" type="boolean" checked="false"
            truevalue="yes" falsevalue="no"
            label="Apply sample weights?"
            help="Apply weights if outliers are present."/>

        <!-- Normalisation method; the selected option value is passed to the R script unchanged. -->
        <param name="normalisationOption" type="select" label="Normalisation Method">
            <option value="TMM">TMM</option>
            <option value="RLE">RLE</option>
            <option value="upperquartile">Upperquartile</option>
            <option value="none">None (Don't normalise)</option>
        </param>

        <!-- Request the optional normalised counts table ("yes"/"no"). -->
        <param name="normCounts" type="boolean" checked="false"
            truevalue="yes" falsevalue="no"
            label="Output normalised counts table?"
            help="Output a file containing the normalised counts, these are in log2 counts per million (logCPM)."/>

        <!-- Request the optional RData output ("yes"/"no"). -->
        <param name="rdaOption" type="boolean" checked="false"
            truevalue="yes" falsevalue="no"
            label="Output RData?"
            help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report."/>

        <!-- Advanced testing options, off by default. When "no" is selected the command passes
             the fixed values "BH", 0.05 and 0 for the adjustment method, adjusted threshold and
             minimum log2-fold-change respectively. -->
        <conditional name="testOpt">
            <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
                help="Enable choices for p-value adjustment method, p-value threshold and log2-fold-change threshold.">
                <option value="no" selected="True">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="yes">
                <param name="pAdjust" type="select" label="P-Value Adjustment Method.">
                    <option value="BH">Benjamini and Hochberg (1995)</option>
                    <option value="BY">Benjamini and Yekutieli (2001)</option>
                    <option value="holm">Holm (1979)</option>
                    <option value="none">None</option>
                </param>
                <param name="pVal" type="float" value="0.05" min="0" max="1"
                    label="Adjusted Threshold"
                    help="Genes below this threshold are considered significant and highlighted in the MA plot. If either BH(1995) or BY(2001) were selected then this value is a false-discovery-rate control. If Holm(1979) was selected then this is an adjusted p-value for family-wise error rate."/>
                <param name="lfc" type="float" value="0" min="0"
                    label="Minimum log2-fold-change Required"
                    help="Genes above this threshold and below the p-value threshold are considered significant and highlighted in the MA plot."/>
            </when>
            <when value="no"/>
        </conditional>

    </inputs>

    <outputs>
        <!-- HTML report. Both its path and its files_path directory are passed to the R script. -->
        <data name="outReport" format="html" label="${tool.name} on ${on_string}: Report" />
        <!-- One tabular element per .tsv file found in output_dir (filled by the command's final
             cp step); the element name is the file name without the .tsv extension. -->
        <collection name="outTables" type="list" label="${tool.name} on ${on_string}: Tables">
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular" directory="output_dir" visible="false" />
        </collection>
    </outputs>

    <tests>
        <!-- Two contrasts with a primary factor typed into the form: expects exactly two tables
             and a report that contains the title text but no mention of RData. -->
        <test>
            <param name="counts" value="matrix.txt" />
            <param name="pfactName" value="Genotype" />
            <param name="pfactLevel" value="WT,WT,WT,Mut,Mut,Mut" />
            <param name="contrast" value="Mut-WT,WT-Mut" />
            <param name="normalisationOption" value="TMM" />
            <output_collection name="outTables" count="2">
                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WT.tsv" />
                <element name="limma-voom_WT-Mut" ftype="tabular" file="limma-voom_WT-Mut.tsv" />
            </output_collection>
            <output name="outReport" >
                <assert_contents>
                    <has_text text="Limma-voom Analysis Output" />
                    <not_has_text text="RData" />
                </assert_contents>
            </output>
        </test>
        <!-- Gene annotation file supplied: the result table is compared with the annotated expected file. -->
        <test>
            <param name="annoOpt" value="yes" />
            <param name="geneanno" value="anno.txt" />
            <param name="counts" value="matrix.txt" />
            <param name="pfactName" value="Genotype" />
            <param name="pfactLevel" value="WT,WT,WT,Mut,Mut,Mut" />
            <param name="contrast" value="Mut-WT" />
            <param name="normalisationOption" value="TMM" />
            <output_collection name="outTables" >
                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WTanno.tsv" />
            </output_collection>
        </test>
        <!-- RData output requested: the report must mention RData. -->
        <test>
            <param name="rdaOption" value="yes" />
            <param name="counts" value="matrix.txt" />
            <param name="pfactName" value="Genotype" />
            <param name="pfactLevel" value="WT,WT,WT,Mut,Mut,Mut" />
            <param name="contrast" value="Mut-WT" />
            <param name="normalisationOption" value="TMM" />
            <output name="outReport" >
                <assert_contents>
                    <has_text text="RData" />
                </assert_contents>
            </output>
        </test>
        <!-- Primary factor plus one secondary factor (Batch) typed into the form. -->
        <test>
            <param name="counts" value="matrix.txt" />
            <param name="pfactName" value="Genotype"/>
            <param name="pfactLevel" value="WT,WT,WT,Mut,Mut,Mut"/>
            <repeat name="sfactors">
                <param name="sfactName" value="Batch"/>
                <param name="sfactLevel" value="b1,b2,b3,b1,b2,b3"/>
            </repeat>
            <param name="contrast" value="Mut-WT" />
            <param name="normalisationOption" value="TMM" />
            <output_collection name="outTables" >
                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WTmultifact.tsv" />
            </output_collection>
        </test>
        <!-- Factor information read from a file; compared with the same expected file as the
             multi-factor form test above. -->
        <test>
            <param name="ffile" value="yes" />
            <param name="finfo" value="factorinfo.txt" />
            <param name="counts" value="matrix.txt" />
            <param name="contrast" value="Mut-WT" />
            <param name="normalisationOption" value="TMM" />
            <output_collection name="outTables" >
                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WTmultifact.tsv" />
            </output_collection>
        </test>
        <!-- Normalised counts requested: expects the contrast table plus a normcounts table. -->
        <test>
            <param name="normCounts" value="yes" />
            <param name="counts" value="matrix.txt" />
            <param name="pfactName" value="Genotype" />
            <param name="pfactLevel" value="WT,WT,WT,Mut,Mut,Mut" />
            <param name="contrast" value="Mut-WT" />
            <param name="normalisationOption" value="TMM" />
            <output_collection name="outTables" count="2">
                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WT.tsv" />
                <element name="limma-voom_normcounts" ftype="tabular" file="limma-voom_normcounts.tsv" />
            </output_collection>
        </test>
    </tests>

    <help><![CDATA[
.. class:: infomark

**What it does**

Given a matrix of counts (e.g. from featureCounts) and optional information about the genes, this tool
produces plots and tables useful in the analysis of differential gene
expression.

-----

**Inputs**

**Counts Data:**
A matrix of counts, with rows corresponding to genes
and columns corresponding to counts for the samples.
Values must be tab separated, with the first row containing the sample/column
labels and the first column containing the row/gene labels.

Example:

    ========== ======= ======= ======= ======== ======== ========
    **GeneID** **WT1** **WT2** **WT3** **Mut1** **Mut2** **Mut3**
    ---------- ------- ------- ------- -------- -------- --------
    11287      1699    1528    1601    1463     1441     1495
    11298      1905    1744    1834    1345     1291     1346
    11302      6       8       7       5        6        5
    11303      2099    1974    2100    1574     1519     1654
    11304      356     312     337     361      397      346
    11305      2528    2438    2493    1762     1942     2027
    ========== ======= ======= ======= ======== ======== ========

**Gene Annotations:**
Optional input for gene annotations. This can contain more
information about the genes than just an ID number. The annotations will
be available in the differential expression results table and the optional normalised counts table.

Example:

    ==========  ==========  ===================================================
    **GeneID**  **Symbol**  **GeneName**
    ----------  ----------  ---------------------------------------------------
    11287       Pzp         pregnancy zone protein
    11298       Aanat       arylalkylamine N-acetyltransferase
    11302       Aatk        apoptosis-associated tyrosine kinase
    11303       Abca1       ATP-binding cassette, sub-family A (ABC1), member 1
    11304       Abca4       ATP-binding cassette, sub-family A (ABC1), member 4
    11305       Abca2       ATP-binding cassette, sub-family A (ABC1), member 2
    ==========  ==========  ===================================================

**Factor Information:**
Enter Factor Names and Levels in the tool form or provide a tab-separated file that has the samples in the same order as listed in the columns of the counts matrix. The second column should contain the Primary Factor levels (e.g. Genotype) with optional additional columns for any Secondary Factors (e.g. Batch).

Example:

    ========== ============ =========
    **Sample** **Genotype** **Batch**
    ---------- ------------ ---------
    WT1        WT           b1
    WT2        WT           b2
    WT3        WT           b3
    Mut1       Mut          b1
    Mut2       Mut          b2
    Mut3       Mut          b3
    ========== ============ =========

**Primary Factor Name:** The name of the primary factor being investigated e.g. Genotype. One primary factor must be entered and spaces must not be used.

**Primary Factor Levels:** The levels of the primary factor of interest, these must be entered in the same order as the samples to which the levels correspond, as listed in the columns of the counts matrix. Spaces must not be used and if entered in the tool form the values should be separated by commas.

**Secondary Factor Name:** Optionally, one or more secondary factors can be included. These are variables that might influence your experiment e.g. Batch, Gender. Spaces must not be used.

**Secondary Factor Levels:** The levels of the secondary factor of interest, these must be entered in the same order as the samples to which the levels correspond, as listed in the columns of the counts matrix. Spaces must not be used and if entered in the tool form the values should be separated by commas.


**Contrasts of Interest:**
The contrasts you wish to make between levels.
A common contrast would be a simple difference between two levels: "Mut-WT"
represents the difference between the mutant and wild type genotypes.
Multiple contrasts should be separated by commas and spaces must not be used.

**Filter Low CPM:**
Option to ignore the genes that do not show significant levels of
expression. This filtering is dependent on two criteria:

    * **Minimum CPM:** This is the counts per million that a gene must have in at
      least some specified number of samples.

    * **Minimum Samples:** This is the number of samples in which the CPM
      requirement must be met in order for that gene to be acknowledged.

Only genes that exhibit a CPM greater than the required amount in at least the
number of samples specified will be used for analysis. Care should be taken to
ensure that the sample requirement is appropriate. In the case of an experiment
with two experimental groups each with two members, if there is a change from
insignificant cpm to significant cpm but the sample requirement is set to 3,
then this will cause that gene to fail the criteria. When in doubt simply do not
filter.

**Normalisation Method:**
Option for using different methods to rescale the raw library
size. For more information, see the calcNormFactors section in the edgeR_ user's
manual.

**Apply Sample Weights:**
Option to downweight outlier samples such that their information is still
used in the statistical analysis but their impact is reduced. Use this
whenever significant outliers are present. The MDS plotting tool in this package
is useful for identifying outliers. For more information on this option see Liu et al. (2015).

**Use Advanced Testing Options?:**
By default error rate for multiple testing is controlled using Benjamini and
Hochberg's false discovery rate control at a threshold value of 0.05. However
there are options to change this to custom values.

    * **P-Value Adjustment Method:**
      Change the multiple testing control method, the options are BH(1995) and
      BY(2001) which are both false discovery rate controls. There is also
      Holm(1979) which is a method for family-wise error rate control.

    * **Adjusted Threshold:**
      Set the threshold for the resulting value of the multiple testing control
      method. Only observations whose statistic falls below this value are
      considered significant, and thus highlighted in the MA plot.

    * **Minimum log2-fold-change Required:**
      In addition to meeting the requirement for the adjusted statistic for
      multiple testing, the observation must have an absolute log2-fold-change
      greater than this threshold to be considered significant, thus highlighted
      in the MA plot.

**Outputs**

This tool outputs a table of differentially expressed genes for each contrast of interest and an HTML report with plots and additional information. Optionally you can choose to output the normalised counts table and the RData file.

-----

**Citations:**

.. class:: infomark

limma

Please cite the paper below for the limma software itself. Please also try
to cite the appropriate methodology articles that describe the statistical
methods implemented in limma, depending on which limma functions you are
using.  The methodology articles are listed in Section 2.1 of the limma
User's Guide.

    * Smyth GK (2005). Limma: linear models for microarray data. In:
      'Bioinformatics and Computational Biology Solutions using R and
      Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
      W. Huber (eds), Springer, New York, pages 397-420.

    * Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:
      precision weights unlock linear model analysis tools for
      RNA-seq read counts. Genome Biology 15, R29.

    * Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME, Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? Modelling sample and observational level variability improves power in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.

    * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic,
      A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights
      for microarray data. BMC Bioinformatics 7, Article 261.

.. class:: infomark

edgeR

Please cite the first paper for the software itself and the other papers for
the various original statistical methods implemented in edgeR.  See
Section 1.2 in the User's Guide for more detail.

    * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
      package for differential expression analysis of digital gene expression
      data. Bioinformatics 26, 139-140

    * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
      differences in tag abundance. Bioinformatics 23, 2881-2887

    * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
      binomial dispersion, with applications to SAGE data.
      Biostatistics, 9, 321-332

    * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
      of multifactor RNA-Seq experiments with respect to biological variation.
      Nucleic Acids Research 40, 4288-4297

Please report problems or suggestions to: su.s@wehi.edu.au

.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
    ]]></help>
    <!-- NOTE(review): presumably the DOI of Liu et al. (2015), Nucleic Acids Research, listed in
         the help text above; confirm. The other limma and edgeR papers named in the help text
         have no citation entry here. -->
    <citations>
        <citation type="doi">10.1093/nar/gkv412</citation>
    </citations>
</tool>