view diffexp.xml @ 0:7a80e9ec63cb

- Initial commit
author shian_su <registertonysu@gmail.com>
date Tue, 16 Dec 2014 14:38:15 +1100
parents
children b2fe55fd0651
line wrap: on
line source

<tool id="diffexp" name="Voom Rnaseq" version="1.1.0">
  <description>
      Perform differential expression analysis using a pipeline based on the voom
      function of the limma Bioconductor package. This tool takes a count matrix
      (tab separated) as input and produces an HTML report as output.
  </description>
  
  <!-- R packages this tool declares as dependencies, with pinned versions.
       NOTE(review): "R-module" may not be a requirement type that Galaxy
       resolves automatically; confirm against the Galaxy tool XML schema.
       The help text below asks for limma and edgeR to be installed on the
       server running the tool. -->
  <requirements>
    <requirement type="R-module" version="3.5.27">edgeR</requirement>
    <requirement type="R-module" version="3.18.13">limma</requirement>
  </requirements>

  <!-- Any exit code of 1 or above is reported as a fatal "Tool exception". -->
  <stdio>
    <exit_code range="1:" level="fatal" description="Tool exception" />
  </stdio>
  
  <!-- Command line for diffexp.R, run through Rscript. Arguments are purely
       positional, in this order:
         1. counts dataset
         2. gene annotation dataset, or the literal None when annotations are off
         3. HTML output dataset
         4. extra-files directory of the HTML output
         5. "no" (the Rda output option is disabled)
         6. normalisation method
         7. sample weights option
         8. contrasts (quoted)
         9. minimum CPM and 10. minimum samples (0 and 0 when filtering is off)
        11. p-value adjustment method, 12. p-value threshold, 13. minimum
            log2-fold-change ("BH", 0.05 and 0 when advanced options are off)
        14. "factName::factLevel" (quoted)
       Parameters declared inside a conditional are referenced through that
       conditional, e.g. $anno.geneanno, in the same way as $filterCPM.cpmReq
       and $testOpt.pVal. -->
  <command interpreter="Rscript">
    diffexp.R $counts

              #if $anno.annoOpt=="yes":
                $anno.geneanno
              #else:
                None
              #end if

              $outFile
              $outFile.files_path
              "no" <!-- Disabled Rda option -->
              $normalisationOption
              $weightCond.weightOption
              "$contrast"

              #if $filterCPM.filterLowCPM=="yes":
                $filterCPM.cpmReq
                $filterCPM.sampleReq
              #else:
                0
                0
              #end if

              #if $testOpt.wantOpt=="yes":
                "$testOpt.pAdjust"
                $testOpt.pVal
                $testOpt.lfc
              #else:
                "BH"
                0.05
                0
              #end if

              <!--*Code commented until solution for multiple factors is found*
              #for $i, $fct in enumerate($factors):
                $fct.factName::$fct.factLevel
              #end for
              -->
              "$factName::$factLevel"

  </command>
   
  <inputs>
    <!-- Tab-separated count matrix; passed to diffexp.R as its first argument. -->
    <param name="counts" type="data" format="tabular" label="Counts Data"/>
      
    <!-- Optional gene annotation table. The annotation dataset input is only
         shown when "Yes" is selected; the "no" case is declared explicitly
         (and left empty) so that every select option has a matching when
         block, as in the other conditionals of this tool. -->
    <conditional name="anno">
      <param name="annoOpt" type="select" label="Use Gene Annotations?"
             help="Annotations will be added to table of top differential 
                   expressions to provide descriptions for each gene.">
        <option value="no">No</option>
        <option value="yes">Yes</option>
      </param>

      <when value="yes">
        <param name="geneanno" type="data" format="tabular"
               label="Gene Annotations"/>
      </when>

      <when value="no"/>
    </conditional>

    <!--*Code commented until solution for multiple factors is found*
    <repeat name="factors" title="Factors" min="1" max="5" default="1">
      <param name="factName" type="text" label="Factor Name (No spaces)"
             help="Eg. Genotype"/>
        <param name="factLevel" type="text" size="100"
               label="Factor Levels (No spaces)"
               help="Eg. WT,WT,Mut,Mut,WT"/>
    </repeat>
    -->
    
    <!-- Single factor of interest. The command joins these two values into
         one quoted argument of the form "factName::factLevel". -->
    <param name="factName" type="text" label="Factor Name"
           help="Eg. Genotype."/>
    <param name="factLevel" type="text" size="100"
           label="Factor Values"
           help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same
           		 levels are typed identically when repeated, with all cases
           		 matching."/>
    
    <!-- Comma-separated contrasts between factor levels; passed to the
         command as one quoted argument. -->
    <param name="contrast" type="text" size="30"
           label="Contrasts of interest"
           help="Eg. Mut-WT,KD-Control."/>
    
    <!-- Optional low-expression filter, enabled by default. When enabled the
         command passes cpmReq and sampleReq to the script; when disabled it
         passes 0 and 0 in their place. -->
    <conditional name="filterCPM">
      <param name="filterLowCPM" type="select" label="Filter Low CPM?"
       help="Treat genes with very low expression as unexpressed and 
       			 filter out to speed up computation.">
        <option value="yes" selected="True">Yes</option>
        <option value="no">No</option>
      </param>
      
        <when value="yes">
          <!-- Counts-per-million threshold (non-negative, default 0.5). -->
          <param name="cpmReq" type="float" value="0.5" min="0"
                 label="Minimum CPM"/>
                 
          <!-- Number of samples that must meet the CPM threshold (default 1). -->
          <param name="sampleReq" type="integer" value="1" min="0"
                 label="Minimum Samples" 
                 help="Filter out all the genes that do not meet the minimum 
                       CPM in at least this many samples."/>
        </when>
        
        <when value="no"/>
        
    </conditional>
       
    <!-- Sample weighting switch, read by the command as
         $weightCond.weightOption. Neither choice reveals further inputs, so
         both cases are declared explicitly as empty when blocks to keep the
         conditional well-formed. -->
    <conditional name="weightCond">
      <param name="weightOption" type="select" label="Apply sample weights?"
             display="radio" help="Apply weights if outliers are present.">
        <option value="no">No</option>
        <option value="yes">Yes</option>
      </param>

      <when value="no"/>
      <when value="yes"/>
    </conditional>
    
    <!-- Normalisation method; the selected option value is passed to the
         script as a bare positional argument. TMM is listed first.
         NOTE(review): presumably forwarded to edgeR's calcNormFactors;
         confirm in diffexp.R. -->
    <param name="normalisationOption" type="select" 
           label="Normalisation Method">
           
      <option value="TMM">TMM</option>
      <option value="RLE">RLE</option>
      <option value="upperquartile">Upperquartile</option>
      <option value="none">None (Don't normalise)</option>
      
    </param>
    			 
    <!-- Advanced testing options, off by default. When off, the command
         substitutes "BH", 0.05 and 0 for pAdjust, pVal and lfc. -->
    <conditional name="testOpt">
      <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
       help="Enable choices for p-value adjustment method, p-value threshold
             and log2-fold-change threshold.">
        <option value="no" selected="True">No</option>
      	<option value="yes">Yes</option>
      </param>
      
        <when value="yes">
          <!-- Multiple-testing adjustment method; BH is listed first. -->
        	<param name="pAdjust" type="select" label="P-Value Adjustment Method.">
          	<option value="BH">Benjamini and Hochberg (1995)</option>
            <option value="BY">Benjamini and Yekutieli (2001)</option>
            <option value="holm">Holm (1979)</option>
            <option value="none">None</option>
          </param>
          
          <!-- Significance threshold on the adjusted value, limited to 0..1. -->
          <param name="pVal" type="float" value="0.05" min="0" max="1"
    			 		   label="Adjusted Threshold"
    			 		   help="Genes below this threshold are considered significant and
    			 		         highlighted in the MA plot. If either BH(1995) or
    			 		         BY(2001) were selected then this value is a 
    			 		         false-discovery-rate control. If Holm(1979) was selected
    			 		         then this is an adjusted p-value for family-wise error 
    			 		         rate."/>
    			 		   
          <!-- Non-negative log2-fold-change threshold (default 0). -->
          <param name="lfc" type="float" value="0" min="0"
          			 label="Minimum log2-fold-change Required"
          			 help="Genes above this threshold and below the p-value
          			       threshold are considered significant and highlighted
          			       in the MA plot."/>
        </when>
        
        <when value="no"/>
        
    </conditional>

 <!--    <conditional name="wantRda">
      <param name="rdaOption" type="select" label="Output RData?"
             display="radio"
             help="Output all the data R used to construct the plots,
                   can be loaded into R.">
                  
        <option value="no">No</option>
        <option value="yes">Yes</option>
        
      </param>
    </conditional> -->
  </inputs>
  
  <!-- Single HTML report dataset. The command passes both $outFile and
       $outFile.files_path to diffexp.R. -->
  <outputs>
      <data format="html" name="outFile" label="Voom Output"/>
  </outputs>
    
  
<help>
.. class:: infomark

**What it does**

Given a matrix of counts and optional information about the genes, this tool
produces plots and tables useful in the analysis of differential gene 
expression.

.. class:: warningmark

This tool is dependent on the R packages limma_ and edgeR_ as a part of the
bioconductor project. Please ensure that these packages are installed on the
server running this tool.

-----

**Counts Data:**
A matrix of expression level with rows corresponding to particular genes
and columns corresponding to the feature count in particular samples.
Values must be tab separated and there must be a row for the sample/column
labels and a column for the row/gene labels.

Example::

	"GeneID"	"Smpl1"	"Smpl2"	"Smpl3"	"Smpl4"	"Smpl5"
	"27395"	1699	1528	1463	1441	1495
	"18777"	1905	1744	1345	1291	1346
	"15037"	6	8	4	5	5
	"21399"	2099	1974	1574	1519	1654
	"58175"	356	312	347	361	346
	"10866"	2528	2438	1762	1942	2027
	"12421"	2182	2005	1786	1799	1858
	"24069"	3	4	2	3	3
	"31926"	1337	1380	1004	1102	1000
	"71096"	0	0	2	1	6
	"59014"	1466	1426	1296	1097	1175
	...

**Gene Annotations:**
Optional input for gene annotations, this can contain more
information about the genes than just an ID number. The annotations will
be available in the top differential expression table.

Example::

	"GeneID"	"Length"	"EntrezID"	"Symbols"	"GeneName"	"Chr"
	"11287"	"11287"	4681	"11287"	"Pzp"	"pregnancy zone protein"	"6"
	"11298"	"11298"	1455	"11298"	"Aanat"	"arylalkylamine N-acetyltransferase"	"11"
	"11302"	"11302"	5743	"11302"	"Aatk"	"apoptosis-associated tyrosine kinase"	"11"
	"11303"	"11303"	10260	"11303"	"Abca1"	"ATP-binding cassette, sub-family A (ABC1), member 1"	"4"
	"11304"	"11304"	7248	"11304"	"Abca4"	"ATP-binding cassette, sub-family A (ABC1), member 4"	"3"
	"11305"	"11305"	8061	"11305"	"Abca2"	"ATP-binding cassette, sub-family A (ABC1), member 2"	"2"
	...

**Factor Name:**
The name of the factor being investigated. This tool currently assumes
that only one factor is of interest.

**Factor Values:**
The levels of the factor of interest. These must be entered in the same
order as the samples to which the levels correspond, as listed in the
columns of the counts matrix.

The values should be separated by commas, and spaces must not be used.

**Contrasts of Interest:**
The contrasts you wish to make between levels. 

Common contrasts would be a simple difference between two levels: "Mut-WT" 
represents the difference between the mutant and wild type genotypes.

The values should be separated by commas, and spaces must not be used.

**Filter Low CPM:**
Option to ignore the genes that do not show significant levels of
expression, this filtering is dependent on two criteria:

 * **Minimum CPM:** This is the counts per million that a gene must have in at
   least some specified number of samples.

 * **Minimum Samples:** This is the number of samples in which the CPM
   requirement must be met in order for that gene to be acknowledged.

Only genes that exhibit a CPM greater than the required amount in at least the
number of samples specified will be used for analysis. Care should be taken to
ensure that the sample requirement is appropriate. In the case of an experiment
with two experimental groups each with two members, if there is a change from
insignificant cpm to significant cpm but the sample requirement is set to 3,
then this will cause that gene to fail the criteria. When in doubt simply do not
filter.


**Normalisation Method:**
Option for using different methods to rescale the raw library
size. For more information, see the calcNormFactors section in the edgeR_ user's
manual.

**Apply Sample Weights:**
Option to downweight outlier samples such that their information is still
used in the statistical analysis but their impact is reduced. Use this
whenever significant outliers are present. The MDS plotting tool in this package
is useful for identifying outliers.

**Use Advanced Testing Options?:**
By default, the error rate for multiple testing is controlled using Benjamini and
Hochberg's false discovery rate control at a threshold value of 0.05. However,
there are options to change this to custom values.

  * **P-Value Adjustment Method:**
    Change the multiple testing control method, the options are BH(1995) and 
    BY(2001) which are both false discovery rate controls. There is also
    Holm(1979) which is a method for family-wise error rate control.
  
  * **Adjusted Threshold:**
    Set the threshold for the resulting value of the multiple testing control
    method. Only observations whose statistic falls below this value are
    considered significant, and thus highlighted in the MA plot.
    
  * **Minimum log2-fold-change Required:**
    In addition to meeting the requirement for the adjusted statistic for
    multiple testing, the observation must have an absolute log2-fold-change
    greater than this threshold to be considered significant, thus highlighted 
    in the MA plot.

-----

**Citations:**

.. class:: infomark

limma

Please cite the paper below for the limma software itself.  Please also try
to cite the appropriate methodology articles that describe the statistical
methods implemented in limma, depending on which limma functions you are
using.  The methodology articles are listed in Section 2.1 of the limma 
User's Guide.

  * Smyth, GK (2005). Limma: linear models for microarray data. In: 
    'Bioinformatics and Computational Biology Solutions using R and 
    Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
    W. Huber (eds), Springer, New York, pages 397-420.
	  
  * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:
    precision weights unlock linear model analysis tools for
    RNA-seq read counts. Genome Biology 15, R29.

.. class:: infomark

edgeR

Please cite the first paper for the software itself and the other papers for
the various original statistical methods implemented in edgeR.  See 
Section 1.2 in the User's Guide for more detail.

	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
	  package for differential expression analysis of digital gene expression 
	  data. Bioinformatics 26, 139-140
	  
	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
	  differences in tag abundance. Bioinformatics 23, 2881-2887
	  
	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
	  binomial dispersion, with applications to SAGE data.
	  Biostatistics, 9, 321-332
	  
	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
	  of multifactor RNA-Seq experiments with respect to biological variation. 
	  Nucleic Acids Research 40, 4288-4297
	  
Report problems to: su.s@wehi.edu.au

.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html

</help>
</tool>