comparison diffexp.xml @ 0:7a80e9ec63cb

- Initial commit
author shian_su <registertonysu@gmail.com>
date Tue, 16 Dec 2014 14:38:15 +1100
parents
children b2fe55fd0651
comparison
equal deleted inserted replaced
-1:000000000000 0:7a80e9ec63cb
1 <tool id="diffexp" name="Voom Rnaseq" version="1.1.0">
2 <description>
3 Perform differential expression analysis using pipeline based on the voom
4 function of the limma bioconductor package. This tool takes a count matrix
5 (tab separated) as input and produces a HTML report as output.
6 </description>
7 <!-- R packages, with pinned versions, that this wrapper declares as requirements. -->
8 <requirements>
9 <requirement type="R-module" version="3.5.27">edgeR</requirement>
10 <requirement type="R-module" version="3.18.13">limma</requirement>
11 </requirements>
12 <!-- Exit codes in the range "1:" are reported as a fatal "Tool exception". -->
13 <stdio>
14 <exit_code range="1:" level="fatal" description="Tool exception" />
15 </stdio>
16 <!-- Command line for diffexp.R. Positional arguments, in order: counts file; gene annotation file or the literal None; outFile; outFile.files_path; Rda flag (fixed to "no"); normalisation method; sample-weight choice; contrasts; minimum CPM; minimum samples; p-value adjustment method; p-value threshold; log2-fold-change threshold; "factName::factLevel". geneanno is declared inside the anno conditional, so it is addressed as $anno.geneanno, like every other conditional member used here. -->
17 <command interpreter="Rscript">
18 diffexp.R $counts
19
20 #if $anno.annoOpt=="yes":
21 $anno.geneanno
22 #else:
23 None
24 #end if
25
26 $outFile
27 $outFile.files_path
28 "no" <!-- Disabled Rda option -->
29 $normalisationOption
30 $weightCond.weightOption
31 "$contrast"
32
33 #if $filterCPM.filterLowCPM=="yes":
34 $filterCPM.cpmReq
35 $filterCPM.sampleReq
36 #else:
37 0
38 0
39 #end if
40
41 #if $testOpt.wantOpt=="yes":
42 "$testOpt.pAdjust"
43 $testOpt.pVal
44 $testOpt.lfc
45 #else:
46 "BH"
47 0.05
48 0
49 #end if
50
51 <!--*Code commented until solution for multiple factors is found*
52 #for $i, $fct in enumerate($factors):
53 $fct.factName::$fct.factLevel
54 #end for
55 -->
56 "$factName::$factLevel"
57
58 </command>
59
60 <inputs>
61 <param name="counts" type="data" format="tabular" label="Counts Data"/>
62 <!-- Optional gene annotation table: the file input is only offered when annoOpt is "yes"; the "no" case is declared explicitly and exposes no further inputs. -->
63 <conditional name="anno">
64 <param name="annoOpt" type="select" label="Use Gene Annotations?"
65 help="Annotations will be added to table of top differential
66 expressions to provide descriptions for each gene.">
67 <option value="no">No</option>
68 <option value="yes">Yes</option>
69 </param>
70 <when value="no"/>
71 <when value="yes">
72 <param name="geneanno" type="data" format="tabular"
73 label="Gene Annotations"/>
74 </when>
75 </conditional>
76
77 <!--*Code commented until solution for multiple factors is found*
78 <repeat name="factors" title="Factors" min="1" max="5" default="1">
79 <param name="factName" type="text" label="Factor Name (No spaces)"
80 help="Eg. Genotype"/>
81 <param name="factLevel" type="text" size="100"
82 label="Factor Levels (No spaces)"
83 help="Eg. WT,WT,Mut,Mut,WT"/>
84 </repeat>
85 -->
86 <!-- Single-factor inputs: the command joins factName and factLevel into one "factName::factLevel" argument. -->
87 <param name="factName" type="text" label="Factor Name"
88 help="Eg. Genotype."/>
89 <param name="factLevel" type="text" size="100"
90 label="Factor Values"
91 help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same
92 levels are typed identically when repeated, with all cases
93 matching."/>
94 <!-- Comma-separated contrasts between factor levels; passed to the command as a single quoted argument. -->
95 <param name="contrast" type="text" size="30"
96 label="Contrasts of interest"
97 help="Eg. Mut-WT,KD-Control."/>
98 <!-- Low-expression filter. When set to "no", the command passes 0 for both the CPM and the sample thresholds. -->
99 <conditional name="filterCPM">
100 <param name="filterLowCPM" type="select" label="Filter Low CPM?"
101 help="Treat genes with very low expression as unexpressed and
102 filter out to speed up computation.">
103 <option value="yes" selected="True">Yes</option>
104 <option value="no">No</option>
105 </param>
106 <!-- Thresholds are only requested when filtering is enabled. -->
107 <when value="yes">
108 <param name="cpmReq" type="float" value="0.5" min="0"
109 label="Minimum CPM"/>
110
111 <param name="sampleReq" type="integer" value="1" min="0"
112 label="Minimum Samples"
113 help="Filter out all the genes that do not meet the minimum
114 CPM in at least this many samples."/>
115 </when>
116
117 <when value="no"/>
118
119 </conditional>
120 <!-- Sample-weighting toggle. Neither choice exposes further inputs, so both when blocks are empty; the selected value is passed to the command as $weightCond.weightOption. -->
121 <conditional name="weightCond">
122 <param name="weightOption" type="select" label="Apply sample weights?"
123 display="radio" help="Apply weights if outliers are present.">
124 <option value="no">No</option>
125 <option value="yes">Yes</option>
126 </param>
127 <when value="no"/>
128 <when value="yes"/>
129 </conditional>
130 <!-- Normalisation method; the selected option value is passed to the command unchanged. -->
131 <param name="normalisationOption" type="select"
132 label="Normalisation Method">
133
134 <option value="TMM">TMM</option>
135 <option value="RLE">RLE</option>
136 <option value="upperquartile">Upperquartile</option>
137 <option value="none">None (Don't normalise)</option>
138
139 </param>
140 <!-- Advanced testing options. When not enabled, the command falls back to "BH", 0.05 and 0 for the three values below. -->
141 <conditional name="testOpt">
142 <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
143 help="Enable choices for p-value adjustment method, p-value threshold
144 and log2-fold-change threshold.">
145 <option value="no" selected="True">No</option>
146 <option value="yes">Yes</option>
147 </param>
148
149 <when value="yes">
150 <param name="pAdjust" type="select" label="P-Value Adjustment Method.">
151 <option value="BH">Benjamini and Hochberg (1995)</option>
152 <option value="BY">Benjamini and Yekutieli (2001)</option>
153 <option value="holm">Holm (1979)</option>
154 <option value="none">None</option>
155 </param>
156
157 <param name="pVal" type="float" value="0.05" min="0" max="1"
158 label="Adjusted Threshold"
159 help="Genes below this threshold are considered significant and
160 highlighted in the MA plot. If either BH(1995) or
161 BY(2001) were selected then this value is a
162 false-discovery-rate control. If Holm(1979) was selected
163 then this is an adjusted p-value for family-wise error
164 rate."/>
165
166 <param name="lfc" type="float" value="0" min="0"
167 label="Minimum log2-fold-change Required"
168 help="Genes above this threshold and below the p-value
169 threshold are considered significant and highlighted
170 in the MA plot."/>
171 </when>
172
173 <when value="no"/>
174
175 </conditional>
176
177 <!-- <conditional name="wantRda">
178 <param name="rdaOption" type="select" label="Output RData?"
179 display="radio"
180 help="Output all the data R used to construct the plots,
181 can be loaded into R.">
182
183 <option value="no">No</option>
184 <option value="yes">Yes</option>
185
186 </param>
187 </conditional> -->
188 </inputs>
189 <!-- Single HTML dataset; the command receives both $outFile and $outFile.files_path. -->
190 <outputs>
191 <data format="html" name="outFile" label="Voom Output"/>
192 </outputs>
193
194
195 <help>
196 .. class:: infomark
197
198 **What it does**
199
200 Given a matrix of counts and optional information about the genes, this tool
201 produces plots and tables useful in the analysis of differential gene
202 expression.
203
204 .. class:: warningmark
205
206 This tool is dependent on the R packages limma_ and edgeR_ as a part of the
207 bioconductor project. Please ensure that these packages are installed on the
208 server running this tool.
209
210 -----
211
212 **Counts Data:**
213 A matrix of expression level with rows corresponding to particular genes
214 and columns corresponding to the feature count in particular samples.
215 Values must be tab separated and there must be a row for the sample/column
216 labels and a column for the row/gene labels.
217
218 Example::
219
220 "GeneID" "Smpl1" "Smpl2" "Smpl3" "Smpl4" "Smpl5"
221 "27395" 1699 1528 1463 1441 1495
222 "18777" 1905 1744 1345 1291 1346
223 "15037" 6 8 4 5 5
224 "21399" 2099 1974 1574 1519 1654
225 "58175" 356 312 347 361 346
226 "10866" 2528 2438 1762 1942 2027
227 "12421" 2182 2005 1786 1799 1858
228 "24069" 3 4 2 3 3
229 "31926" 1337 1380 1004 1102 1000
230 "71096" 0 0 2 1 6
231 "59014" 1466 1426 1296 1097 1175
232 ...
233
234 **Gene Annotations:**
235 Optional input for gene annotations, this can contain more
236 information about the genes than just an ID number. The annotations will
237 be available in the top differential expression table.
238
239 Example::
240
241 "GeneID" "Length" "EntrezID" "Symbols" "GeneName" "Chr"
242 "11287" "11287" 4681 "11287" "Pzp" "pregnancy zone protein" "6"
243 "11298" "11298" 1455 "11298" "Aanat" "arylalkylamine N-acetyltransferase" "11"
244 "11302" "11302" 5743 "11302" "Aatk" "apoptosis-associated tyrosine kinase" "11"
245 "11303" "11303" 10260 "11303" "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" "4"
246 "11304" "11304" 7248 "11304" "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" "3"
247 "11305" "11305" 8061 "11305" "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" "2"
248 ...
249
250 **Factor Name:**
251 The name of the factor being investigated. This tool currently assumes
252 that only one factor is of interest.
253
254 **Factor Values:**
255 The levels of the factor of interest, this must be entered in the same
256 order as the samples to which the levels correspond as listed in the
257 columns of the counts matrix.
258
259 The values should be separated by commas, and spaces must not be used.
260
261 **Contrasts of Interest:**
262 The contrasts you wish to make between levels.
263
264 Common contrasts would be a simple difference between two levels: "Mut-WT"
265 represents the difference between the mutant and wild type genotypes.
266
267 The values should be separated by commas and spaces must not be used.
268
269 **Filter Low CPM:**
270 Option to ignore the genes that do not show significant levels of
271 expression, this filtering is dependent on two criteria:
272
273 * **Minimum CPM:** This is the counts per million that a gene must have in at
274 least some specified number of samples.
275
276 * **Minimum Samples:** This is the number of samples in which the CPM
277 requirement must be met in order for that gene to be acknowledged.
278
279 Only genes that exhibit a CPM greater than the required amount in at least the
280 number of samples specified will be used for analysis. Care should be taken to
281 ensure that the sample requirement is appropriate. In the case of an experiment
282 with two experimental groups each with two members, if there is a change from
283 insignificant cpm to significant cpm but the sample requirement is set to 3,
284 then this will cause that gene to fail the criteria. When in doubt simply do not
285 filter.
286
287
288 **Normalisation Method:**
289 Option for using different methods to rescale the raw library
290 size. For more information, see the calcNormFactors section in the edgeR_ user's
291 manual.
292
293 **Apply Sample Weights:**
294 Option to downweight outlier samples such that their information is still
295 used in the statistical analysis but their impact is reduced. Use this
296 whenever significant outliers are present. The MDS plotting tool in this package
297 is useful for identifying outliers.
298
299 **Use Advanced Testing Options?:**
300 By default error rate for multiple testing is controlled using Benjamini and
301 Hochberg's false discovery rate control at a threshold value of 0.05. However
302 there are options to change this to custom values.
303
304 * **P-Value Adjustment Method:**
305 Change the multiple testing control method, the options are BH(1995) and
306 BY(2001) which are both false discovery rate controls. There is also
307 Holm(1979) which is a method for family-wise error rate control.
308
309 * **Adjusted Threshold:**
310 Set the threshold for the resulting value of the multiple testing control
311 method. Only observations whose statistic falls below this value are
312 considered significant and are thus highlighted in the MA plot.
313
314 * **Minimum log2-fold-change Required:**
315 In addition to meeting the requirement for the adjusted statistic for
316 multiple testing, the observation must have an absolute log2-fold-change
317 greater than this threshold to be considered significant, thus highlighted
318 in the MA plot.
319
320 -----
321
322 **Citations:**
323
324 .. class:: infomark
325
326 limma
327
328 Please cite the paper below for the limma software itself. Please also try
329 to cite the appropriate methodology articles that describe the statistical
330 methods implemented in limma, depending on which limma functions you are
331 using. The methodology articles are listed in Section 2.1 of the limma
332 User's Guide.
333
334 * Smyth, GK (2005). Limma: linear models for microarray data. In:
335 'Bioinformatics and Computational Biology Solutions using R and
336 Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
337 W. Huber (eds), Springer, New York, pages 397-420.
338
339 * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:
340 precision weights unlock linear model analysis tools for
341 RNA-seq read counts. Genome Biology 15, R29.
342
343 .. class:: infomark
344
345 edgeR
346
347 Please cite the first paper for the software itself and the other papers for
348 the various original statistical methods implemented in edgeR. See
349 Section 1.2 in the User's Guide for more detail.
350
351 * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
352 package for differential expression analysis of digital gene expression
353 data. Bioinformatics 26, 139-140
354
355 * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
356 differences in tag abundance. Bioinformatics 23, 2881-2887
357
358 * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
359 binomial dispersion, with applications to SAGE data.
360 Biostatistics, 9, 321-332
361
362 * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
363 of multifactor RNA-Seq experiments with respect to biological variation.
364 Nucleic Acids Research 40, 4288-4297
365
366 Report problems to: su.s@wehi.edu.au
367
368 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
369 .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
370
371 </help>
372 </tool>