view limma_voom.xml @ 25:d6f5fa4ee473 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 55ecaab6d53c6a3f0769f802a4d68a45291f51e9"
author iuc
date Tue, 01 Mar 2022 08:03:53 +0000
parents 32511f586472
children 119b069fc845
line wrap: on
line source

<tool id="limma_voom" name="limma" version="@TOOL_VERSION@+galaxy0">
    <!-- Short summary of what the tool does. -->
    <description>Perform differential expression with limma-voom or limma-trend</description>
    <!-- Single definition of the limma version; the tool version attribute and the
         limma requirement both expand this token. -->
    <macros>
        <token name="@TOOL_VERSION@">3.50.1</token>
    </macros>
    <!-- Cross-reference of type bio.tools pointing at the "limma" entry. -->
    <xrefs>
        <xref type="bio.tools">limma</xref>
    </xrefs>
    <!-- EDAM ontology identifiers: one topic and two operations. -->
    <edam_topics>
        <edam_topic>topic_3308</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_3563</edam_operation>
        <edam_operation>operation_3223</edam_operation>
    </edam_operations>

    <!-- Packages that must be available when the tool runs. limma is pinned through
         the @TOOL_VERSION@ token; every other package carries its own explicit version.
         The same package names are queried by the version_command below. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">bioconductor-limma</requirement>
        <requirement type="package" version="3.36.0">bioconductor-edger</requirement>
        <requirement type="package" version="1.4.36">r-statmod</requirement>
        <requirement type="package" version="1.1.1">r-scales</requirement>
        <requirement type="package" version="0.2.21">r-rjson</requirement>
        <requirement type="package" version="1.20.3">r-getopt</requirement>
        <requirement type="package" version="3.1.1">r-gplots</requirement>
        <requirement type="package" version="2.4.0">bioconductor-glimma</requirement>
    </requirements>

    <!-- Echoes a single line: the R banner line(s) that mention "version" but not "GNU",
         followed by ", <package> version <x.y.z>" for limma, edgeR, statmod, scales, rjson,
         getopt, gplots and Glimma. Each package version is read from sessionInfo() in a
         fresh R process; stderr of that process is discarded and any output line
         containing "WARNING: " (case-insensitive) is dropped. -->
    <version_command><![CDATA[
echo $(R --version | grep version | grep -v GNU)", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\$otherPkgs\$limma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", statmod version" $(R --vanilla --slave -e "library(statmod); cat(sessionInfo()\$otherPkgs\$statmod\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scales version" $(R --vanilla --slave -e "library(scales); cat(sessionInfo()\$otherPkgs\$scales\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rjson version" $(R --vanilla --slave -e "library(rjson); cat(sessionInfo()\$otherPkgs\$rjson\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", gplots version" $(R --vanilla --slave -e "library(gplots); cat(sessionInfo()\$otherPkgs\$gplots\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", Glimma version" $(R --vanilla --slave -e "library(Glimma); cat(sessionInfo()\$otherPkgs\$Glimma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
    ]]></version_command>

    <command detect_errors="exit_code"><![CDATA[
#import json
## Run the R script that ships next to this wrapper. Everything up to the first
## "&&" further down is the argument list of this single Rscript call.
Rscript '$__tool_directory__/limma_voom.R'

## -R: HTML report dataset, -o: directory holding the files that belong to the report.
-R '$outReport'
-o '$outReport.files_path'

## Count input: one file per sample (-j) or a single matrix (-m).
#if $input.format=="files":

    ## Adapted from DESeq2 wrapper
    ## For every factor, collect a list of {group name: [count file paths]} mappings
    ## and hand the whole structure to the script as one JSON string.
    #set $temp_factor_names = list()
    #for $fact in $input.rep_factor:
        #set $temp_factor = list()
        #for $g in $fact.rep_group:
            #set $count_files = list()
            #for $file in $g.countsFile:
                $count_files.append(str($file))
            #end for
            $temp_factor.append( {str($g.groupName): $count_files} )
        #end for

        ## Groups are passed in the reverse of the order they were entered in the form.
        $temp_factor.reverse()
        $temp_factor_names.append([str($fact.factorName), $temp_factor])
    #end for
    -j '#echo json.dumps(temp_factor_names)#'

#elif $input.format=="matrix":
    -m '$input.counts'
    #if $input.fact.ffile=='yes':
        -f '$input.fact.finfo'
    #else:
        ## Manually entered factors are encoded as name::groups and joined with "|".
        -i '${ '|'.join( ['%s::%s' % ($x.factorName, $x.groupNames) for x in $input.fact.rep_factor] ) }'
    #end if
#end if

## Optional gene annotation table.
#if $anno.annoOpt=='yes':
    -a '$anno.geneanno'
#end if

## Contrasts: from a file (-C) or as a comma-separated list of typed contrasts (-D).
#if $cont.cfile=='yes':
    -C '$cont.cinfo'
#else:
    -D '${ ','.join( ['%s' % $x.contrast for x in $cont.rep_contrast] ) }'
#end if

## Low-count filtering: CPM threshold (-c) or count threshold (-z); -s carries the
## minimum number of samples, -y switches the count filter to the total across samples.
#if $f.filt.filt_select == 'yes':
    #if $f.filt.cformat.format_select == 'cpm':
        -c '$f.filt.cformat.cpmReq'
        -s '$f.filt.cformat.cpmSampleReq'
    #elif $f.filt.cformat.format_select == 'counts':
        -z '$f.filt.cformat.cntReq'
        #if $f.filt.cformat.samples.count_select == 'total':
            -y
        #elif $f.filt.cformat.samples.count_select == 'sample':
            -s '$f.filt.cformat.samples.cntSampleReq'
        #end if
    #end if
#end if

## Output switches. The plot selection is quoted like every other substituted value.
#if $out.plots:
    -P '$out.plots'
#end if

#if $out.filtCounts:
    -F
#end if

#if $out.normCounts:
    -x
#end if

#if $out.libinfo:
    -L
#end if

#if $out.rdaOption:
    -r
#end if


## Advanced options.
-l '$adv.lfc'
-p '$adv.pVal'
-d '$adv.pAdjust'
-G '$adv.topgenes'
#if $adv.treat:
    -T
#end if

## Method specific flags: -w for voom with quality weights, -t <prior count> for trend.
#if $deMethod.de_select == 'voom':
    #if $deMethod.weightOption:
        -w
    #end if
#elif $deMethod.de_select == 'trend':
    -t '$deMethod.prior_count'
#end if

-n '$adv.normalisationOption'

#if $adv.robOption:
    -b
#end if

## Post-processing: stage result files into output_dir, where the outputs section
## looks for the DE tables and the optional count tables.
&&
mkdir ./output_dir

&&
cp '$outReport.files_path'/*tsv output_dir/

## Interactive Glimma output is copied next to the report when that plot type is selected.
#if 'i' in str($out.plots).split( "," ):
    && cp -r ./glimma* '$outReport.files_path'
#end if

#if $out.filtCounts or $out.normCounts:
    && cp '$outReport.files_path'/*counts output_dir/
#end if

## The Rscript output dataset is simply a copy of the script that was run.
#if $out.rscript:
    && cp '$__tool_directory__/limma_voom.R' '$rscript'
#end if
    ]]></command>

    <inputs>

        <!-- DE method: voom (optionally with sample quality weights) or trend (takes a prior count) -->
        <conditional name="deMethod">
            <param name="de_select" type="select"
                label="Differential Expression Method"
                help="Select the limma-voom or limma-trend method. See Help section below for more information. Default: limma-voom">
                <option value="voom" selected="True">limma-voom</option>
                <option value="trend">limma-trend</option>
            </param>
            <when value="voom">
                <param name="weightOption" type="boolean" truevalue="1" falsevalue="0" checked="false"
                    label="Apply voom with sample quality weights?"
                    help="Apply weights if outliers are present (voomWithQualityWeights). Default: False." />
            </when>
            <when value="trend">
                <param name="prior_count" type="float" value="3" min="0"
                    label="Prior count"
                    help="Average count to be added to each observation to avoid taking log of zero. Default: 3." />
            </when>
        </conditional>
        <!-- Counts and Factors: samples arrive either as one count file per sample
             (groups are defined by which files are attached to them) or as a single
             count matrix (groups come from a factor file or are typed in column order). -->
        <conditional name="input">
            <param name="format" type="select" label="Count Files or Matrix?"
                help="You can choose to input either separate count files (one per sample) or a single count matrix">
                <option value="files">Separate Count Files</option>
                <option value="matrix">Single Count Matrix</option>
            </param>

            <when value="files">
                <repeat name="rep_factor" title="Factor" min="1">
                    <param name="factorName" type="text" label="Name" help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Optional additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section for more information. NOTE: Please only use letters, numbers or underscores.">
                    <!-- Reject blank names at submission, as the matrix branch already does -->
                    <validator type="empty_field" />
                    <sanitizer>
                        <valid initial="string.letters,string.digits"><add value="_" /></valid>
                    </sanitizer>
                    </param>
                    <repeat name="rep_group" title="Group" min="2" default="2">
                        <param name="groupName" type="text" label="Name"
                        help="Name of group that the counts files belong to (e.g. WT or Mut). NOTE: Please only use letters, numbers or underscores (case sensitive).">
                        <!-- A blank group name would become an empty key in the JSON passed to the script -->
                        <validator type="empty_field" />
                        <sanitizer>
                            <valid initial="string.letters,string.digits"><add value="_" /></valid>
                        </sanitizer>
                        </param>
                        <param name="countsFile" type="data" format="tabular" multiple="true" label="Counts files"/>
                    </repeat>
                </repeat>
            </when>

            <when value="matrix">
                <param name="counts" type="data" format="tabular" label="Count Matrix"/>

                <!-- Factor information for the matrix columns: from a file or typed in -->
                <conditional name="fact">
                    <param name="ffile" type="select" label="Input factor information from file?"
                        help="You can choose to input the factor and group information for the samples from a file or manually enter below. NOTE: Please only use letters, numbers or underscores (case sensitive), the group names MUST not contain hyphens.">
                        <option value="no">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="yes">
                        <param name="finfo" type="data" format="tabular" label="Factor File"/>
                    </when>
                    <when value="no" >
                        <repeat name="rep_factor" title="Factor" min="1">
                            <param name="factorName" type="text" label="Factor Name"
                                help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section below. NOTE: Please only use letters, numbers or underscores.">
                                <validator type="empty_field" />
                                <validator type="regex" message="Please only use letters, numbers or underscores">^[\w]+$</validator>
                            </param>
                            <param name="groupNames" type="text" label="Groups"
                                help="Enter the group names for the samples separated with commas e.g. WT,WT,WT,Mut,Mut,Mut. The order of the names must match the order of the samples in the columns of the count matrix. NOTE: Please only use letters, numbers or underscores (case sensitive), the group names MUST not contain hyphens.">
                                <validator type="empty_field" />
                                <validator type="regex" message="Please only use letters, numbers or underscores, and separate levels by commas">^[\w,]+$</validator>
                            </param>
                        </repeat>
                    </when>
                </conditional>
            </when>
        </conditional>

        <!-- Optional gene annotation table; only the "yes" branch asks for a dataset -->
        <conditional name="anno">
            <param name="annoOpt" type="select"
                label="Use Gene Annotations?"
                help="If you provide an annotation file, annotations will be added to the table(s) of differential expression results to provide descriptions for each gene, and used to label the top genes in the Volcano plot. Interactive Glimma Volcano and MD plots will also be generated. See Help section below.">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no" />
            <when value="yes">
                <param name="geneanno" type="data" format="tabular" label="Gene Annotations" />
            </when>
        </conditional>

        <!-- Contrasts: read from a file or entered one per repeat. A typed contrast may
             contain word characters, hyphens and parentheses, e.g. Mut-WT or
             (Mut1-Ctrl1)-(Mut2-Ctrl2). -->
        <conditional name="cont">
            <param name="cfile" type="select" label="Input Contrast information from file?"
                help="You can choose to input the contrast information for the samples from a file or manually enter below. NOTE: Please only use letters, numbers or underscores (case sensitive), the group names MUST not contain hyphens. Use a hyphen to separate the groups you want to compare, as shown in the Help section below.">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="yes">
                <param name="cinfo" type="data" format="tabular" label="Contrasts File"/>
            </when>
            <when value="no" >
                <repeat name="rep_contrast" title="Contrast" min="1" default="1">
                    <param name="contrast" type="text" label="Contrast of Interest" help="Names of two groups to compare separated by a hyphen e.g. Mut-WT. If the order is Mut-WT the fold changes in the results will be up/down in Mut relative to WT. If you have more than one contrast enter each separately using the Insert Contrast button below. For differences between contrasts use e.g. (Mut1-Ctrl1)-(Mut2-Ctrl2). For more info, see Chapter 8 in the limma User's guide: https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf">
                    <validator type="empty_field" />
                    <!-- The message lists every character class the pattern accepts -->
                    <validator type="regex" message="Please only use letters, numbers, underscores, hyphens and parentheses">^[\(\w\)-]+$</validator>
                    </param>
                </repeat>
            </when>
        </conditional>

        <!-- Filter Options: optional removal of lowly expressed genes. The command
             template maps these choices to -c and -s (CPM threshold and minimum samples)
             or to -z combined with either -y (total count) or -s (per-sample count). -->
        <section name="f" expanded="false" title="Filter Low Counts">
            <conditional name="filt">
                <param name="filt_select" type="select" label="Filter lowly expressed genes?" help="Treat genes with very low expression as unexpressed and filter out. See the Filter Low Counts section below for more information. Default: No">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="yes">
                    <!-- The threshold is expressed either in counts-per-million or in raw counts -->
                    <conditional name="cformat">
                        <param name="format_select" type="select" label="Filter on CPM or Count values?" help="It is slightly better to base the filtering on count-per-million (CPM) rather than the raw count values so as to avoid favoring genes expressed in samples sequenced to a higher depth. ">
                            <option value="cpm">CPM</option>
                            <option value="counts">Counts</option>
                        </param>
                        <when value="cpm">
                            <param name="cpmReq" type="float" value="1" min="0" label="Minimum CPM" help="Treat genes with CPM below this value as unexpressed and filter out. See the Filter Low Counts section below for more information."/>
                            <param name="cpmSampleReq" type="integer" value="0" min="0" label="Minimum Samples"
                                help="Filter out all genes that do not meet the Minimum CPM in at least this many samples. See the Filter Low Counts section below for more information."/>
                        </when>
                        <when value="counts">
                            <param name="cntReq" type="integer" value="0" min="0" label="Minimum Count" help="Filter out all genes that do not meet this minimum count. You can choose below to apply this filter to the total count for all samples or specify the number of samples under Minimum Samples. See the Filter Low Counts section below for more information." />
                            <!-- The count threshold applies to the total across samples or to a minimum number of samples -->
                            <conditional name="samples">
                                <param name="count_select" type="select" label="Filter on Total Count or per Sample Count values?" >
                                    <option value="total">Total</option>
                                    <option value="sample">Sample</option>
                                </param>
                                <when value="total"/>
                                <when value="sample">
                                    <param name="cntSampleReq" type="integer" value="0" min="0" label="Minimum Samples"
                                    help="Filter out all genes that do not meet the Minimum Count in at least this many samples. See the Filter Low Counts section below for more information."/>
                                </when>
                            </conditional>
                        </when>
                    </conditional>
                </when>
                <when value="no" />
            </conditional>
        </section>

        <!-- Output Options: extra plots for the report plus optional count tables,
             library information, the R script and an RData file -->
        <section name="out" title="Output Options" expanded="false">
            <param name="plots" type="select" multiple="True" optional="True" display="checkboxes"
                label="Additional Plots"
                help="Select additional plots to output in the report">
                <option value="i" selected="True">Glimma Interactive Plots</option>
                <option value="d">Density Plots (if filtering)</option>
                <option value="c">CpmsVsCounts Plots (if filtering on cpms)</option>
                <option value="b">Box Plots (if normalising)</option>
                <option value="x">MDS Extra (Dims 2vs3 and 3vs4)</option>
                <option value="m">MD Plots for individual samples</option>
                <option value="h">Heatmaps (top DE genes) </option>
                <option value="s">Stripcharts (top DE genes)</option>
            </param>
            <param name="filtCounts" type="boolean" truevalue="1" falsevalue="0" checked="false"
                label="Output Filtered Counts Table?"
                help="Output a file containing the raw filtered counts if Filter Low Counts is selected. Default: No" />
            <param name="normCounts" type="boolean" truevalue="1" falsevalue="0" checked="false"
                label="Output Normalised Counts Table?"
                help="Output a file containing the normalised counts, these are in log2 counts per million (logCPM). Default: No" />
            <param name="libinfo" type="boolean" truevalue="1" falsevalue="0" checked="false"
                label="Output Library information file?"
                help="Output a tabular file showing the library sizes, normalisation factors and effective library sizes for the samples. Default: No" />
            <param name="rscript" type="boolean" truevalue="True" falsevalue="False" checked="False"
                label="Output Rscript?"
                help="If this option is set to Yes, the Rscript used will be provided as a text file in the output. Default: No" />
            <param name="rdaOption" type="boolean" truevalue="1" falsevalue="0" checked="false"
                label="Output RData file?"
                help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report. Default: No" />
        </section>

        <!-- Advanced Options: significance thresholds, p-value adjustment, TREAT,
             number of highlighted genes, normalisation method and robust settings.
             The command template passes these as -l, -p, -d, -T, -G, -n and -b. -->
        <section name="adv" expanded="false" title="Advanced Options">
            <param name="lfc" type="float" value="0" min="0"
                label="Minimum Log2 Fold Change"
                help="Genes above this threshold and below the p-value threshold are considered significant and highlighted in the MD plot. Default: 0."/>
            <param name="pVal" type="float" value="0.05" min="0" max="1"
                label="P-Value Adjusted Threshold"
                help="Genes below this threshold are considered significant and highlighted in the MD plot. If either BH(1995) or BY(2001) are selected then this value is a false-discovery-rate control. If Holm(1979) is selected then this is an adjusted p-value for family-wise error rate. Default: 0.05."/>
            <param name="pAdjust" type="select" label="P-Value Adjustment Method" help="Default: BH">
                <option value="BH" selected="true">Benjamini and Hochberg (1995)</option>
                <option value="BY">Benjamini and Yekutieli (2001)</option>
                <option value="holm">Holm (1979)</option>
                <option value="none">None</option>
            </param>
            <param name="treat" type="boolean" truevalue="1" falsevalue="0" checked="False"
                label="Test significance relative to a fold-change threshold (TREAT)"
                help="If you want to apply a cut-off on a fold change the TREAT function can be used, see Help section below. Default: No"/>
            <param name="topgenes" type="integer" value="10" min="0" max="100"
                label="Number of genes to highlight in Volcano plot, Heatmap and Stripcharts"
                help="The top DE genes will be highlighted in the Volcano plot for each contrast and can be output in heatmap and stripchart PDFs (max 100). Default: 10."/>
            <param name="normalisationOption" type="select" label="Normalisation Method" help="Default: TMM">
                <option value="TMM" selected="true">TMM</option>
                <option value="RLE">RLE</option>
                <option value="upperquartile">Upperquartile</option>
                <option value="none">None (Don't normalise)</option>
            </param>
            <param name="robOption" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Use Robust Settings?" help="Using robust settings is usually recommended to protect against outlier genes. Default: Yes" />
        </section>

    </inputs>

    <!-- Outputs: the HTML report and the DE table collection are always declared; the
         remaining datasets are filtered on the matching switch in the Output Options
         section. -->
    <outputs>
        <data name="outReport" format="html" label="${tool.name} on ${on_string}: Report" />
        <!-- One collection element per *.tsv file found in output_dir; the element name
             is the file name without the .tsv suffix -->
        <collection name="outTables" type="list" label="${tool.name} on ${on_string}: DE tables">
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular" directory="output_dir" visible="false" />
        </collection>
        <!-- NOTE(review): the two from_work_dir values below contain a * wildcard;
             confirm the targeted Galaxy version resolves patterns in from_work_dir -->
        <data name="outFilt" format="tabular" from_work_dir="output_dir/*_filtcounts" label="${tool.name} on ${on_string}: Filtered counts">
            <filter>out['filtCounts']</filter>
        </data>
        <data name="outNorm" format="tabular" from_work_dir="output_dir/*_normcounts" label="${tool.name} on ${on_string}: Normalised counts">
            <filter>out['normCounts']</filter>
        </data>
        <!-- No from_work_dir here: the command template copies the R script straight to this dataset -->
        <data name="rscript" format="txt" label="${tool.name} on ${on_string}: Rscript">
            <filter>out['rscript']</filter>
        </data>
        <data name="libinfo" format="tabular" from_work_dir="libsizeinfo" label="${tool.name} on ${on_string}: Library information">
            <filter>out['libinfo']</filter>
        </data>
    </outputs>

    <tests>
        <!-- Ensure report is output: matrix input, one typed factor and two opposite
             contrasts. Checks both DE tables (the logFC sign flips between them) and
             that the report contains the Glimma section and no RData text. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <repeat name="rep_contrast">
                <param name="contrast" value="WT-Mut" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="2">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4573" />
                    </assert_contents>
                </element>
                <element name="limma-voom_WT-Mut" ftype="tabular" >
                     <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*-0.4573" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="outReport" >
                <assert_contents>
                    <has_text text="Limma Analysis Output" />
                    <has_text text="Glimma Interactive Results" />
                    <not_has_text text="RData" />
                </assert_contents>
            </output>
       </test>
        <!-- Ensure annotation file input works: same matrix setup with anno.txt supplied;
             the DE table must carry the EntrezID and Symbol columns and the gene symbol
             for the checked row. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="annoOpt" value="yes" />
            <param name="geneanno" value="anno.txt" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="1">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="EntrezID.*Symbol.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*Abca4.*0.4573" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Ensure Rscript and RData file can be output: both switches are turned on, so
             a third output (the R script) is expected and the report must mention RData. -->
        <test expect_num_outputs="3">
            <param name="format" value="matrix" />
            <param name="rscript" value="True"/>
            <param name="rdaOption" value="true" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output name="outReport" >
                <assert_contents>
                    <has_text text="RData" />
                </assert_contents>
            </output>
            <output name="rscript">
                <assert_contents>
                    <has_text_matching expression="Task run time" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure secondary factors work: a second typed factor (Batch) is added to the
             Genotype factor; the expected logFC for the checked gene differs from the
             single-factor tests. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <repeat name="rep_factor">
                <param name="factorName" value="Batch"/>
                <param name="groupNames" value="b1,b2,b3,b1,b2,b3"/>
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="1" >
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4590" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Ensure factors file with unordered samples works: factor information comes
             from factorinfo.txt instead of typed group names; the expected logFC equals
             the one asserted in the two-factor test. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="ffile" value="yes" />
            <param name="finfo" value="factorinfo.txt" />
            <param name="counts" value="matrix.txt" />
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="1">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4590" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Ensure filtered and normalised count outputs work: count-based filtering
             (minimum count 10 in at least 3 samples) with both count tables requested,
             giving four outputs. Gene 11302 must be absent from the filtered table. -->
        <test expect_num_outputs="4">
            <param name="format" value="matrix" />
            <param name="filtCounts" value="true" />
            <param name="normCounts" value="true" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="filt_select" value="yes" />
            <param name="format_select" value="counts"/>
            <param name="cntReq" value="10"/>
            <param name="count_select" value="sample"/>
            <param name="cntSampleReq" value="3"/>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="1">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.45.*15.52.*4.94.*7.74.*0.0001.*5.27" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="outNorm" ftype="tabular" >
                <assert_contents>
                    <has_text_matching expression="GeneID.*Mut1.*Mut2.*Mut3.*WT1.*WT2.*WT3" />
                    <has_text_matching expression="11304.*15.7.*15.8.*15.6.*15.3.*15.2.*15.2" />
                </assert_contents>
            </output>
            <output name="outFilt" ftype="tabular" >
                <assert_contents>
                    <has_text_matching expression="GeneID.*Mut1.*Mut2.*Mut3.*WT1.*WT2.*WT3" />
                    <has_text_matching expression="11304.*361.*397.*346.*356.*312.*337" />
                    <not_has_text text="11302"/>
                </assert_contents>
            </output>
        </test>
        <!-- Ensure multiple counts files input works.
             Inputs: one counts file per sample (format=files), assigned to groups
             for two factors (Genotype: WT/Mut, Batch: b1/b2/b3), plus the gene
             annotation file anno.txt. Two contrasts are requested (Mut-WT and
             WT-Mut) and the normalised counts table is switched on.
             Checks: both result tables and the normalised counts table carry the
             annotation columns (EntrezID, Symbol) in their header, and gene 11304
             (Abca4) has the expected logFC, with opposite sign in the reversed
             contrast. The header assertion is kept identical for both result
             tables because they are produced by the same annotated run. -->
        <test expect_num_outputs="3">
            <param name="format" value="files" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <repeat name="rep_group">
                    <param name="groupName" value="WT"/>
                    <param name="countsFile" value="WT1.counts,WT2.counts,WT3.counts"/>
                </repeat>
                <repeat name="rep_group">
                    <param name="groupName" value="Mut"/>
                    <param name="countsFile" value="Mut1.counts,Mut2.counts,Mut3.counts"/>
                </repeat>
            </repeat>
            <repeat name="rep_factor">
                <param name="factorName" value="Batch"/>
                <repeat name="rep_group">
                    <param name="groupName" value="b1"/>
                    <param name="countsFile" value="WT1.counts,Mut1.counts"/>
                </repeat>
                <repeat name="rep_group">
                    <param name="groupName" value="b2"/>
                    <param name="countsFile" value="WT2.counts,Mut2.counts"/>
                </repeat>
                <repeat name="rep_group">
                    <param name="groupName" value="b3"/>
                    <param name="countsFile" value="WT3.counts,Mut3.counts"/>
                </repeat>
            </repeat>
            <param name="annoOpt" value="yes" />
            <param name="geneanno" value="anno.txt" />
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <repeat name="rep_contrast">
                <param name="contrast" value="WT-Mut" />
            </repeat>
            <param name="topgenes" value="6" />
            <param name="normCounts" value="true" />
            <output_collection name="outTables" count="2">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="EntrezID.*Symbol.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*Abca4.*0.4590" />
                    </assert_contents>
                </element>
                <element name="limma-voom_WT-Mut" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="EntrezID.*Symbol.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*Abca4.*-0.4590" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="outNorm" ftype="tabular" >
                <assert_contents>
                    <has_text_matching expression="EntrezID.*Symbol.*Mut1.*Mut2.*Mut3.*WT1.*WT2.*WT3" />
                    <has_text_matching expression="11304.*Abca4.*15.7545" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure limma-trend option works.
             Inputs: count matrix matrix.txt with a single Genotype factor
             (Mut,Mut,Mut,WT,WT,WT), one contrast (Mut-WT), TMM normalisation and
             de_select set to "trend". No annotation file is supplied.
             Checks: the report states that the limma-trend method was used, and
             the single results table is named limma-trend_Mut-WT, has a header
             starting with GeneID and contains the expected logFC for gene 11304. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <param name="de_select" value="trend" />
            <output name="outReport" >
                <assert_contents>
                    <has_text text="The limma-trend method was used" />
                </assert_contents>
            </output>
            <output_collection name="outTables" count="1">
                <element name="limma-trend_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4540" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Ensure limma-trend option with annotation works.
             Same set-up as the plain limma-trend test (matrix.txt, Genotype factor,
             Mut-WT contrast, TMM, de_select "trend") but with the gene annotation
             file anno.txt supplied.
             Checks: the report states that the limma-trend method was used, and the
             results table limma-trend_Mut-WT has a header containing the annotation
             columns EntrezID and Symbol ahead of the statistics columns. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix.txt" />
            <param name="annoOpt" value="yes" />
            <param name="geneanno" value="anno.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="Mut-WT" />
            </repeat>
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <param name="de_select" value="trend" />
            <output name="outReport" >
                <assert_contents>
                    <has_text text="The limma-trend method was used" />
                </assert_contents>
            </output>
            <output_collection name="outTables" count="1">
                <element name="limma-trend_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="EntrezID.*Symbol.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4540" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Ensure samples and groups beginning with numbers can be handled.
             Inputs: count matrix matrix_num.txt with annotation anno.txt, a single
             factor "Group" whose levels are the numbers 2 and 1, and the contrast
             "2-1". Low-count filtering is enabled on raw counts (minimum count 10
             in at least 3 samples), TMM normalisation and limma-voom are selected,
             and the normalised counts table is requested.
             Checks: the results table is named limma-voom_X2-X1 (group names are
             expected with an X prefix), while the normalised counts header is
             expected to show the sample names 2-1 .. 1-3 without that prefix. -->
        <test expect_num_outputs="3">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix_num.txt" />
            <param name="annoOpt" value="yes" />
            <param name="geneanno" value="anno.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Group"/>
                <param name="groupNames" value="2,2,2,1,1,1" />
            </repeat>
            <param name="cfile" value="no" />
            <repeat name="rep_contrast">
                <param name="contrast" value="2-1" />
            </repeat>
            <param name="filt_select" value="yes" />
            <param name="format_select" value="counts"/>
            <param name="cntReq" value="10"/>
            <param name="count_select" value="sample"/>
            <param name="cntSampleReq" value="3"/>
            <param name="normalisationOption" value="TMM" />
            <param name="normCounts" value="true" />
            <param name="topgenes" value="6" />
            <param name="de_select" value="voom" />
            <output_collection name="outTables" count="1">
                <element name="limma-voom_X2-X1" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="EntrezID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.45.*15.52.*4.94.*7.74.*0.0001.*5.27" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="outNorm" ftype="tabular" >
                <assert_contents>
                    <has_text_matching expression="EntrezID.*2-1.*2-2.*2-3.*1-1.*1-2.*1-3" />
                    <has_text_matching expression="11304.*15.7.*15.8.*15.6.*15.3.*15.2.*15.2" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure contrasts file works.
             Inputs: count matrix matrix.txt with a single Genotype factor; the
             contrasts are read from the tabular file contrasts.txt (cfile=yes)
             instead of being entered in the form.
             Checks: three result tables are produced, named limma-voom_Mut-WT,
             limma-voom_WT-Mut and limma-voom_Mut-WT-WT-Mut. For gene 11304 the
             asserted logFC values are 0.4573, -0.4573 and 0.9146 respectively,
             i.e. the reversed contrast flips the sign and the combined contrast
             doubles the value. -->
        <test expect_num_outputs="2">
            <param name="format" value="matrix" />
            <param name="counts" value="matrix.txt" />
            <repeat name="rep_factor">
                <param name="factorName" value="Genotype"/>
                <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
            </repeat>
            <param name="cfile" value="yes" />
            <param name="cinfo" value="contrasts.txt" ftype="tabular" />
            <param name="normalisationOption" value="TMM" />
            <param name="topgenes" value="6" />
            <output_collection name="outTables" count="3">
                <element name="limma-voom_Mut-WT" ftype="tabular" >
                    <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.4573" />
                    </assert_contents>
                </element>
                <element name="limma-voom_WT-Mut" ftype="tabular" >
                     <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*-0.4573" />
                    </assert_contents>
                </element>
                <element name="limma-voom_Mut-WT-WT-Mut" ftype="tabular" >
                     <assert_contents>
                        <has_text_matching expression="GeneID.*logFC.*AveExpr.*t.*P.Value.*adj.P.Val.*B" />
                        <has_text_matching expression="11304.*0.9146" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>

    <help><![CDATA[
.. class:: infomark

**What it does**

Given a matrix of counts (e.g. from featureCounts) and optional information about the genes, this tool performs differential expression (DE) using the limma_ Bioconductor package and produces plots and tables useful in DE analysis. Interactive Glimma_ plots and tables can also be generated and links to the Glimma plots will be provided in the report. See an example workflow here_.

In the `limma approach`_ to RNA-seq, read counts are converted to log2-counts-per-million (logCPM) and the mean-variance relationship is modelled either with precision weights or with an empirical Bayes prior trend. The precision weights approach is called “voom” and the prior trend approach is called “limma-trend”. For more information, see the Help section below.

-----

**Inputs**

**Differential Expression Method:**
Option to use the limma-voom or limma-trend approach for differential expression. The default is limma-voom.
If the sequencing depth is reasonably consistent across the RNA samples, then the simplest and most
robust approach to differential expression is to use limma-trend. This approach will usually work well if the
ratio of the largest library size to the smallest is not more than about 3-fold. When the library sizes are quite variable between samples, then the voom approach is theoretically more powerful than limma-trend. For more information see the excellent `limma User's Guide`_.

**Counts Data:**
The counts data can either be input as separate counts files (one sample per file) or a single count matrix (one sample per column). The rows correspond to genes, and columns correspond to the counts for the samples. Values must be tab separated, with the first row containing the sample/column labels and the first column containing the row/gene labels. The sample labels must start with a letter. Gene identifiers can be of any type but must be unique and not repeated within a counts file.

Example - **Separate Count Files**:

    ========== =======
    **GeneID** **WT1**
    ---------- -------
    11287      1699
    11298      1905
    11302      6
    11303      2099
    11304      356
    11305      2528
    ========== =======

Example - **Single Count Matrix**:

    ========== ======= ======= ======= ======== ======== ========
    **GeneID** **WT1** **WT2** **WT3** **Mut1** **Mut2** **Mut3**
    ---------- ------- ------- ------- -------- -------- --------
    11287      1699    1528    1601    1463     1441     1495
    11298      1905    1744    1834    1345     1291     1346
    11302      6       8       7       5        6        5
    11303      2099    1974    2100    1574     1519     1654
    11304      356     312     337     361      397      346
    11305      2528    2438    2493    1762     1942     2027
    ========== ======= ======= ======= ======== ======== ========

**Gene Annotations:**
Optional input for gene annotations, this can contain more
information about the genes than just an ID number. The annotations will
be available in the differential expression results table and the optional normalised counts table. They will also be used to generate interactive Glimma_ Volcano, MD plots and tables of differential expression. The input annotation file must contain a header row and have the gene IDs in the first column. The second column will be used to label the genes in the Volcano plot and interactive Glimma plots, additional columns will be available in the Glimma interactive table. The number of rows should match that of the counts files, add NA for any gene IDs with no annotation. The Galaxy tool **annotateMyIDs** can be used to obtain annotations for human, mouse, fly and zebrafish.

Example:

    ==========  ==========  ===================================================
    **GeneID**  **Symbol**  **GeneName**
    ----------  ----------  ---------------------------------------------------
    11287       Pzp         pregnancy zone protein
    11298       Aanat       arylalkylamine N-acetyltransferase
    11302       Aatk        apoptosis-associated tyrosine kinase
    11303       Abca1       ATP-binding cassette, sub-family A (ABC1), member 1
    11304       Abca4       ATP-binding cassette, sub-family A (ABC1), member 4
    11305       Abca2       ATP-binding cassette, sub-family A (ABC1), member 2
    ==========  ==========  ===================================================

**Factor Information:**
Enter factor names and groups in the tool form, or provide a tab-separated file that has the names of the samples in the first column and one header row. The sample names must be the same as the names in the columns of the count matrix. The second column should contain the primary factor levels (e.g. WT, Mut) with optional additional columns for any secondary factors.

Example:

    ========== ============ =========
    **Sample** **Genotype** **Batch**
    ---------- ------------ ---------
    WT1        WT           b1
    WT2        WT           b2
    WT3        WT           b3
    Mut1       Mut          b1
    Mut2       Mut          b2
    Mut3       Mut          b3
    ========== ============ =========

*Factor Name:* The name of the experimental factor being investigated e.g. Genotype, Treatment. One factor must be entered and spaces must not be used. Optionally, additional factors can be included, these are variables that might influence your experiment e.g. Batch, Gender, Subject. If additional factors are entered, an additive linear model will be used.

*Groups:* The names of the groups for the factor. The names should only contain letters, numbers and underscores, other characters such as spaces and hyphens MUST not be used. If entered into the tool form above, the order must be the same as the samples (to which the groups correspond) are listed in the columns of the counts matrix, with the values separated by commas. If the group names begin with a number an X will be added as a prefix.

**Contrasts of Interest:**
The contrasts you wish to make between levels. A common contrast would be a simple difference between two levels: "Mut-WT"
represents the difference between the mutant and wild type genotypes. Multiple contrasts must be entered separately using the Insert Contrast button, spaces must not be used. Alternatively, a tab-separated file can be input that has the names of the comparisons in the first column and one header row, as shown below.

Example:

    ============= =
    **Contrasts**
    ------------- -
    Mut-WT   
    WT-Mut   
    ============= =

**Filter Low Counts:**
Genes with very low counts across all libraries provide little evidence for differential expression.
From a biological point of view, a gene must be expressed at some minimal level before
it is likely to be translated into a protein or to be biologically important. In addition, the
pronounced discreteness of these counts interferes with some of the statistical approximations
that are used later in the pipeline. These genes should be filtered out prior to further
analysis.
As a rule of thumb, genes are dropped if they can’t possibly be expressed in all the samples
for any of the conditions. Users can set their own definition of genes being expressed. Usually
a gene is required to have a count of 5-10 in a library to be considered expressed in that
library. Users should also filter with count-per-million (CPM) rather than filtering on the
counts directly, as the latter does not account for differences in library sizes between samples.

Option to ignore the genes that do not show significant levels of
expression, this filtering is dependent on two criteria: CPM/count and number of samples. You can specify to filter on CPM (Minimum CPM) or count (Minimum Count) values:

    * **Minimum CPM:** This is the minimum count per million that a gene must have in at
      least the number of samples specified under Minimum Samples.

    * **Minimum Count:** This is the minimum count that a gene must have. It can be combined with either Filter
      on Total Count or Minimum Samples.

    * **Filter on Total Count:** This can be used with the Minimum Count filter to keep genes
      with a minimum total read count.

    * **Minimum Samples:** This is the number of samples in which the Minimum CPM/Count
      requirement must be met in order for that gene to be kept.

If the Minimum Samples filter is applied, only genes that exhibit a CPM/count greater than the required amount in at least the number of samples specified will be used for analysis. Care should be taken to
ensure that the sample requirement is appropriate. In the case of an experiment
with two experimental groups each with two members, if there is a change from
insignificant CPM/count to significant CPM/count but the sample requirement is set to 3,
then this will cause that gene to fail the criteria. When in doubt simply do not
filter or consult the `limma User's Guide`_ for filtering recommendations.

**Advanced Options:**

By default, the error rate for multiple testing is controlled using Benjamini and
Hochberg's false discovery rate control at a threshold value of 0.05. However,
there are options to change this to custom values.

    * **Minimum log2-fold-change Required:**
      In addition to meeting the requirement for the adjusted statistic for
      multiple testing, the observation must have an absolute log2-fold-change
      greater than this threshold to be considered significant, thus highlighted
      in the MD plot.

    * **Adjusted Threshold:**
      Set the threshold for the resulting value of the multiple testing control
      method. Only observations whose statistic falls below this value are
      considered significant, thus highlighted in the MD plot.

    * **P-Value Adjustment Method:**
      Change the multiple testing control method, the options are BH(1995) and
      BY(2001) which are both false discovery rate controls. There is also
      Holm(1979) which is a method for family-wise error rate control.

**Testing relative to a threshold (TREAT):**
If there are a lot of differentially expressed genes, a fold change threshold can be applied in addition to the P-value threshold to select genes that are more likely to be biologically significant. However, ranking by P-value and discarding genes with small logFCs can increase the false discovery rate. Using the limma TREAT function performs this analysis correctly (`McCarthy and Smyth, 2009`_).

**Normalisation Method:**
The most obvious technical factor that affects the read counts, other than gene expression
levels, is the sequencing depth of each RNA sample. edgeR adjusts any differential expression
analysis for varying sequencing depths as represented by differing library sizes. This is
part of the basic modeling procedure and flows automatically into fold-change or p-value
calculations. It is always present, and doesn’t require any user intervention.
The second most important technical influence on differential expression is one that is less
obvious. RNA-seq provides a measure of the relative abundance of each gene in each RNA
sample, but does not provide any measure of the total RNA output on a per-cell basis.
This commonly becomes important when a small number of genes are very highly expressed
in one sample, but not in another. The highly expressed genes can consume a substantial
proportion of the total library size, causing the remaining genes to be under-sampled in that
sample. Unless this RNA composition effect is adjusted for, the remaining genes may falsely
appear to be down-regulated in that sample. The edgeR `calcNormFactors` function normalizes for RNA composition by finding a set of scaling factors for the library sizes that minimize the log-fold changes between the samples for most genes. The default method for computing these scale factors uses a trimmed mean of M values (TMM) between each pair of samples. We call the product of the original library size and the scaling factor the *effective library size*. The effective library size replaces the original library size in all downstream analyses. TMM is the recommended method for most RNA-Seq data where the majority (more than half) of the genes are believed not to be differentially expressed between any pair of the samples. You can change the normalisation method under **Advanced Options** above. For more information, see the `calcNormFactors` section in the `edgeR User's Guide`_.

**Robust Settings:**
Option to use robust settings with eBayes or TREAT, used by both limma-voom and limma-trend. Using robust settings is usually recommended to protect against outlier genes, for more information see the `limma User's Guide`_ and `Phipson et al. 2016`_. This is turned on by default.

**Prior Count:**
If the limma-trend method is used, a count (`prior.count`) is added to all counts to avoid taking a log of zero, and damp down the variances of logarithms of low counts. A default of 3 is used, as recommended in the `limma User's Guide`_.

**Apply Sample Weights:**
If the limma-voom method is used, an option is available to downweight outlier samples, such that their information is still
used in the statistical analysis but their impact is reduced. Use this
whenever significant outliers are present. The MDS plotting tool in this package
is useful for identifying outliers. For more information on this option see Liu et al. (2015).


**Outputs**

This tool outputs

    * a table of differentially expressed genes for each contrast of interest
    * an HTML report with plots and additional information

Optionally, under **Output Options** you can choose to output

    * interactive Glimma plots and tables: MDS plot, and (if annotation file is input) Volcano plot and MD plot (default: Yes)
    * additional plots in the report and as PDFs
    * a normalised counts table
    * a library size information file
    * the R script used by this tool
    * an RData file

-----

**Citations:**

Please try to cite the appropriate articles when you publish results obtained using software, as such citation is the main means by which the authors receive credit for their work.

limma

Please cite the paper below for the limma software itself. Please also try
to cite the appropriate methodology articles that describe the statistical
methods implemented in limma, depending on which limma functions you are
using.  The methodology articles are listed in Section 2.1 of the `limma
User's Guide`_.

    * Smyth GK (2005). Limma: linear models for microarray data. In:
      'Bioinformatics and Computational Biology Solutions using R and
      Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
      W. Huber (eds), Springer, New York, pages 397-420.

    * Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:
      precision weights unlock linear model analysis tools for
      RNA-seq read counts. Genome Biology 15, R29.

    * Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME, Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? Modelling sample and observational level variability improves power in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.

    * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic,
      A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights
      for microarray data. BMC Bioinformatics 7, Article 261.


edgeR

Please cite the first paper for the software itself and the other papers for
the various original statistical methods implemented in edgeR.  See
Section 1.2 in the `edgeR User's Guide`_ for more detail.

    * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
      package for differential expression analysis of digital gene expression
      data. Bioinformatics 26, 139-140

    * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
      differences in tag abundance. Bioinformatics 23, 2881-2887

    * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
      binomial dispersion, with applications to SAGE data.
      Biostatistics, 9, 321-332

    * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
      of multifactor RNA-Seq experiments with respect to biological variation.
      Nucleic Acids Research 40, 4288-4297

Please report problems or suggestions to: su.s@wehi.edu.au

.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
.. _Glimma: https://bioconductor.org/packages/release/bioc/html/Glimma.html
.. _here: https://f1000research.com/articles/5-1408/v3
.. _limma approach: https://www.ncbi.nlm.nih.gov/pubmed/25605792
.. _limma User's Guide: http://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf
.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
.. _edgeR User's Guide: https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
.. _McCarthy and Smyth, 2009: https://www.ncbi.nlm.nih.gov/pubmed/19176553
.. _Phipson et al. 2016: https://www.ncbi.nlm.nih.gov/pubmed/28367255
    ]]></help>
    <!-- Citations exposed to Galaxy for this tool, given as DOIs.
         NOTE(review): 10.1186/gb-2014-15-2-r29 appears to be the voom paper listed
         in the help text (Law et al. 2014, Genome Biology 15, R29), and
         10.1093/nar/gkv412 presumably the sample-weights paper (Liu et al. 2015,
         Nucleic Acids Research 43(15), e97); confirm against the DOI records. -->
    <citations>
        <citation type="doi">10.1186/gb-2014-15-2-r29</citation>
        <citation type="doi">10.1093/nar/gkv412</citation>
    </citations>
</tool>