Mercurial > repos > shians > voom_rnaseq

diff diffexp.xml @ 0:7a80e9ec63cb
- Initial commit
author: shian_su <registertonysu@gmail.com>
date: Tue, 16 Dec 2014 14:38:15 +1100
children: b2fe55fd0651
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/diffexp.xml	Tue Dec 16 14:38:15 2014 +1100
@@ -0,0 +1,372 @@
+<tool id="diffexp" name="Voom Rnaseq" version="1.1.0">
+  <description>
+      Perform differential expression analysis using pipeline based on the voom
+      function of the limma bioconductor package. This tool takes a count matrix
+      (tab separated) as input and produces a HTML report as output.
+  </description>
+  
+  <requirements>
+    <requirement type="R-module" version="3.5.27">edgeR</requirement>
+    <requirement type="R-module" version="3.18.13">limma</requirement>
+  </requirements>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Tool exception" />
+  </stdio>
+  
+  <command interpreter="Rscript">
+    diffexp.R $counts
+              
+              #if $anno.annoOpt=="yes":
+                $geneanno
+              #else:
+                None
+              #end if
+              
+              $outFile
+              $outFile.files_path
+              "no" <!-- Disabled Rda option -->
+              $normalisationOption
+              $weightCond.weightOption
+              "$contrast"
+
+              #if $filterCPM.filterLowCPM=="yes":
+                $filterCPM.cpmReq
+                $filterCPM.sampleReq
+              #else:
+                0
+                0
+              #end if
+              
+              #if $testOpt.wantOpt=="yes":
+              	"$testOpt.pAdjust"
+              	$testOpt.pVal
+              	$testOpt.lfc
+              #else:
+                "BH"
+                0.05
+                0
+              #end if
+              
+              <!--*Code commented until solution for multiple factors is found*
+              #for $i, $fct in enumerate($factors):
+                $fct.factName::$fct.factLevel
+              #end for
+              -->
+              "$factName::$factLevel"
+               
+  </command>
+   
+  <inputs>
+    <param name="counts" type="data" format="tabular" label="Counts Data"/>
+      
+    <conditional name="anno">
+      <param name="annoOpt" type="select" label="Use Gene Annotations?"
+             help="Annotations will be added to table of top differential 
+                   expressions to provide descriptions for each gene.">
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+      </param>
+      
+      <when value="yes">
+        <param name="geneanno" type="data" format="tabular"
+               label="Gene Annotations"/>
+        </when>
+    </conditional>
+
+    <!--*Code commented until solution for multiple factors is found*
+    <repeat name="factors" title="Factors" min="1" max="5" default="1">
+      <param name="factName" type="text" label="Factor Name (No spaces)"
+             help="Eg. Genotype"/>
+        <param name="factLevel" type="text" size="100"
+               label="Factor Levels (No spaces)"
+               help="Eg. WT,WT,Mut,Mut,WT"/>
+    </repeat>
+    -->
+    
+    <param name="factName" type="text" label="Factor Name"
+           help="Eg. Genotype."/>
+    <param name="factLevel" type="text" size="100"
+           label="Factor Values"
+           help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same
+           		 levels are typed identically when repeated, with all cases
+           		 matching."/>
+    
+    <param name="contrast" type="text" size="30"
+           label="Contrasts of interest"
+           help="Eg. Mut-WT,KD-Control."/>
+    
+    <conditional name="filterCPM">
+      <param name="filterLowCPM" type="select" label="Filter Low CPM?"
+       help="Treat genes with very low expression as unexpressed and 
+       			 filter out to speed up computation.">
+        <option value="yes" selected="True">Yes</option>
+        <option value="no">No</option>
+      </param>
+      
+        <when value="yes">
+          <param name="cpmReq" type="float" value="0.5" min="0"
+                 label="Minimum CPM"/>
+                 
+          <param name="sampleReq" type="integer" value="1" min="0"
+                 label="Minimum Samples" 
+                 help="Filter out all the genes that do not meet the minimum 
+                       CPM in at least this many samples."/>
+        </when>
+        
+        <when value="no"/>
+        
+    </conditional>
+       
+    <conditional name="weightCond">
+      <param name="weightOption" type="select" label="Apply sample weights?"
+             display="radio" help="Apply weights if outliers are present.">
+             
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+        
+      </param>
+    </conditional>
+    
+    <param name="normalisationOption" type="select" 
+           label="Normalisation Method">
+           
+      <option value="TMM">TMM</option>
+      <option value="RLE">RLE</option>
+      <option value="upperquartile">Upperquartile</option>
+      <option value="none">None (Don't normalise)</option>
+      
+    </param>
+    			 
+    <conditional name="testOpt">
+      <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
+       help="Enable choices for p-value adjustment method, p-value threshold
+             and log2-fold-change threshold.">
+        <option value="no" selected="True">No</option>
+      	<option value="yes">Yes</option>
+      </param>
+      
+        <when value="yes">
+        	<param name="pAdjust" type="select" label="P-Value Adjustment Method.">
+          	<option value="BH">Benjamini and Hochberg (1995)</option>
+            <option value="BY">Benjamini and Yekutieli (2001)</option>
+            <option value="holm">Holm (1979)</option>
+            <option value="none">None</option>
+          </param>
+          
+          <param name="pVal" type="float" value="0.05" min="0" max="1"
+    			 		   label="Adjusted Threshold"
+    			 		   help="Genes below this threshold are considered significant and
+    			 		         highlighted in the MA plot. If either BH(1995) or
+    			 		         BY(2001) were selected then this value is a 
+    			 		         false-discovery-rate control. If Holm(1979) was selected
+    			 		         then this is an adjusted p-value for family-wise error 
+    			 		         rate."/>
+    			 		   
+          <param name="lfc" type="float" value="0" min="0"
+          			 label="Minimum log2-fold-change Required"
+          			 help="Genes above this threshold and below the p-value
+          			       threshold are considered significant and highlighted
+          			       in the MA plot."/>
+        </when>
+        
+        <when value="no"/>
+        
+    </conditional>
+
+ <!--    <conditional name="wantRda">
+      <param name="rdaOption" type="select" label="Output RData?"
+             display="radio"
+             help="Output all the data R used to construct the plots,
+                   can be loaded into R.">
+                  
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+        
+      </param>
+    </conditional> -->
+  </inputs>
+  
+  <outputs>
+      <data format="html" name="outFile" label="Voom Output"/>
+  </outputs>
+    
+  
+<help>
+.. class:: infomark
+
+**What it does**
+
+Given a matrix of counts and optional information about the genes, this tool
+produces plots and tables useful in the analysis of differential gene 
+expression.
+
+.. class:: warningmark
+
+This tool is dependent on the R packages limma_ and edgeR_ as a part of the
+bioconductor project. Please ensure that these packages are installed on the
+server running this tool.
+
+-----
+
+**Counts Data:**
+A matrix of expression level with rows corresponding to particular genes
+and columns corresponding to the feature count in particular samples.
+Values must be tab separated and there must be a row for the sample/column
+labels and a column for the row/gene labels.
+
+Example::
+
+	"GeneID"  "Smpl1"	"Smpl2"	"Smpl3"	"Smpl4"	"Smpl5"
+	"27395"	1699	1528	1463	1441	1495
+	"18777"	1905	1744	1345	1291	1346
+	"15037"	6	8	4	5	5
+	"21399"	2099	1974	1574	1519	1654
+	"58175"	356	312	347	361	346
+	"10866"	2528	2438	1762	1942	2027
+	"12421"	2182	2005	1786	1799	1858
+	"24069"	3	4	2	3	3
+	"31926"	1337	1380	1004	1102	1000
+	"71096"	0	0	2	1	6
+	"59014"	1466	1426	1296	1097	1175
+	...
+
+**Gene Annotations:**
+Optional input for gene annotations, this can contain more
+information about the genes than just an ID number. The annotations will
+be avaiable in the top differential expression table.
+
+Example::
+
+	"GeneID"	"Length"	"EntrezID"	"Symbols"	"GeneName"	"Chr"
+	"11287"	"11287"	4681	"11287"	"Pzp"	"pregnancy zone protein"	"6"
+	"11298"	"11298"	1455	"11298"	"Aanat"	"arylalkylamine N-acetyltransferase"	"11"
+	"11302"	"11302"	5743	"11302"	"Aatk"	"apoptosis-associated tyrosine kinase"	"11"
+	"11303"	"11303"	10260	"11303"	"Abca1"	"ATP-binding cassette, sub-family A (ABC1), member 1"	"4"
+	"11304"	"11304"	7248	"11304"	"Abca4"	"ATP-binding cassette, sub-family A (ABC1), member 4"	"3"
+	"11305"	"11305"	8061	"11305"	"Abca2"	"ATP-binding cassette, sub-family A (ABC1), member 2"	"2"
+	...
+
+**Factor Name:**
+The name of the factor being investigated. This tool currently assumes
+that only one factor is of interest.
+
+**Factor Levels:**
+The levels of the factor of interest, this must be entered in the same
+order as the samples to which the levels correspond as listed in the
+columns of the counts matrix. 
+
+The values should be seperated by commas, and spaces must not be used.
+
+**Contrasts of Interest:**
+The contrasts you wish to make between levels. 
+
+Common contrasts would be a simple difference between two levels: "Mut-WT" 
+represents the difference between the mutant and wild type genotypes.
+
+The values should be seperated by commas and spaces must not be used.
+
+**Filter Low CPM:**
+Option to ignore the genes that do not show significant levels of
+expression, this filtering is dependent on two criteria:
+
+ * **Minimum CPM:** This is the counts per million that a gene must have in at
+   least some specified number of samples.
+
+ * **Minumum Samples:** This is the number of samples in which the CPM
+   requirement must be met in order for that gene to be acknowledged.
+
+Only genes that exhibit a CPM greater than the required amount in at least the
+number of samples specified will be used for analysis. Care should be taken to
+ensure that the sample requirement is appropriate. In the case of an experiment
+with two experimental groups each with two members, if there is a change from
+insignificant cpm to significant cpm but the sample requirement is set to 3,
+then this will cause that gene to fail the criteria. When in doubt simply do not
+filter.
+
+
+**Normalisation Method:**
+Option for using different methods to rescale the raw library
+size. For more information, see calcNormFactor section in the edgeR_ user's
+manual.
+
+**Apply Sample Weights:**
+Option to downweight outlier samples such that their information is still
+used in the statistical analysis but their impact is reduced. Use this
+whenever significant outliers are present. The MDS plotting tool in this package
+is useful for identifying outliers
+
+**Use Advanced Testing Options?:**
+By default error rate for multiple testing is controlled using Benjamini and
+Hochberg's false discovery rate control at a threshold value of 0.05. However
+there are options to change this to custom values.
+
+  * **P-Value Adjustment Method:**
+    Change the multiple testing control method, the options are BH(1995) and 
+    BY(2001) which are both false discovery rate controls. There is also
+    Holm(1979) which is a method for family-wise error rate control.
+  
+  * **Adjusted Threshold:**
+    Set the threshold for the resulting value of the multiple testing control
+    method. Only observations whose statistic falls below this value is
+    considered significant, thus highlighted in the MA plot.
+    
+  * **Minimum log2-fold-change Required:**
+    In addition to meeting the requirement for the adjusted statistic for
+    multiple testing, the observation must have an absolute log2-fold-change
+    greater than this threshold to be considered significant, thus highlighted 
+    in the MA plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma 
+User's Guide.
+
+  * Smyth, GK (2005). Limma: linear models for microarray data. In: 
+    'Bioinformatics and Computational Biology Solutions using R and 
+    Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+    W. Huber (eds), Springer, New York, pages 397-420.
+	  
+  * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:
+    precision weights unlock linear model analysis tools for
+    RNA-seq read counts. Genome Biology 15, R29.
+
+.. class:: infomark
+
+edgeR
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See 
+Section 1.2 in the User's Guide for more detail.
+
+	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+	  package for differential expression analysis of digital gene expression 
+	  data. Bioinformatics 26, 139-140
+	  
+	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+	  differences in tag abundance. Bioinformatics 23, 2881-2887
+	  
+	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+	  binomial dispersion, with applications to SAGE data.
+	  Biostatistics, 9, 321-332
+	  
+	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+	  of multifactor RNA-Seq experiments with respect to biological variation. 
+	  Nucleic Acids Research 40, 4288-4297
+	  
+Report problems to: su.s@wehi.edu.au
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+
+</help>
+</tool>
author	shian_su <registertonysu@gmail.com>
date	Tue, 16 Dec 2014 14:38:15 +1100
parents
children	b2fe55fd0651