0
|
1 <tool id="diffexp" name="Voom Rnaseq" version="1.1.0">
|
|
2 <description>
|
|
3 Perform differential expression analysis using a pipeline based on the voom
|
|
4 function of the limma bioconductor package. This tool takes a count matrix
|
|
5 (tab separated) as input and produces an HTML report as output.
|
|
6 </description>
|
|
7
|
|
<!-- R packages the wrapped script relies on (see the warning in the help text). -->
<requirements>
    <requirement version="3.5.27" type="R-module">edgeR</requirement>
    <requirement version="3.18.13" type="R-module">limma</requirement>
</requirements>
|
|
12
|
|
<!-- Exit codes from 1 upwards are reported as a fatal "Tool exception". -->
<stdio>
    <exit_code description="Tool exception" level="fatal" range="1:"/>
</stdio>
|
|
16
|
|
<!--
  Command line for diffexp.R, run through Rscript.

  Positional arguments, in the order they are emitted below:
     1. counts matrix file
     2. gene annotation file, or the literal None when annotations are off
     3. HTML output file
     4. files_path directory belonging to the HTML output
     5. "no" (RData output is currently disabled)
     6. normalisation method
     7. sample weights option (yes/no)
     8. comma separated contrasts
     9. minimum CPM            (0 when low CPM filtering is off)
    10. minimum samples        (0 when low CPM filtering is off)
    11. p-value adjustment     ("BH" when advanced options are off)
    12. adjusted threshold     (0.05 when advanced options are off)
    13. minimum log2 fold change (0 when advanced options are off)
    14. "factorName::factorLevels"

  Parameters nested inside a conditional are referenced through the
  conditional's name (for example $anno.geneanno), matching the
  $filterCPM.* and $testOpt.* references. File paths are quoted so the
  shell always passes each one as a single argument.
-->
<command interpreter="Rscript">
    diffexp.R "$counts"

    #if $anno.annoOpt=="yes":
        "$anno.geneanno"
    #else:
        None
    #end if

    "$outFile"
    "$outFile.files_path"
    "no" <!-- Disabled Rda option -->
    $normalisationOption
    $weightCond.weightOption
    "$contrast"

    #if $filterCPM.filterLowCPM=="yes":
        $filterCPM.cpmReq
        $filterCPM.sampleReq
    #else:
        0
        0
    #end if

    #if $testOpt.wantOpt=="yes":
        "$testOpt.pAdjust"
        $testOpt.pVal
        $testOpt.lfc
    #else:
        "BH"
        0.05
        0
    #end if

    <!--*Code commented until solution for multiple factors is found*
    #for $i, $fct in enumerate($factors):
        $fct.factName::$fct.factLevel
    #end for
    -->
    "$factName::$factLevel"

</command>
|
|
59
|
|
<!--
  User-facing parameters. Every conditional declares a <when> case for each
  option of its selector, including empty cases, so that all branches are
  explicit.
-->
<inputs>
    <!-- Count matrix; passed to the script as its first argument. -->
    <param name="counts" type="data" format="tabular" label="Counts Data"/>

    <!-- Optional gene annotation table, only requested when annoOpt is "yes". -->
    <conditional name="anno">
        <param name="annoOpt" type="select" label="Use Gene Annotations?"
               help="Annotations will be added to table of top differential
                     expressions to provide descriptions for each gene.">
            <option value="no">No</option>
            <option value="yes">Yes</option>
        </param>

        <when value="yes">
            <param name="geneanno" type="data" format="tabular"
                   label="Gene Annotations"/>
        </when>

        <when value="no"/>
    </conditional>

    <!--*Code commented until solution for multiple factors is found*
    <repeat name="factors" title="Factors" min="1" max="5" default="1">
        <param name="factName" type="text" label="Factor Name (No spaces)"
               help="Eg. Genotype"/>
        <param name="factLevel" type="text" size="100"
               label="Factor Levels (No spaces)"
               help="Eg. WT,WT,Mut,Mut,WT"/>
    </repeat>
    -->

    <!-- Single factor of interest: its name and one level per sample. -->
    <param name="factName" type="text" label="Factor Name"
           help="Eg. Genotype."/>
    <param name="factLevel" type="text" size="100"
           label="Factor Values"
           help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same
                 levels are typed identically when repeated, with all cases
                 matching."/>

    <!-- Comma separated contrasts between factor levels. -->
    <param name="contrast" type="text" size="30"
           label="Contrasts of interest"
           help="Eg. Mut-WT,KD-Control."/>

    <!-- Low expression filter; both thresholds are sent as 0 when disabled. -->
    <conditional name="filterCPM">
        <param name="filterLowCPM" type="select" label="Filter Low CPM?"
               help="Treat genes with very low expression as unexpressed and
                     filter out to speed up computation.">
            <option value="yes" selected="True">Yes</option>
            <option value="no">No</option>
        </param>

        <when value="yes">
            <param name="cpmReq" type="float" value="0.5" min="0"
                   label="Minimum CPM"/>

            <param name="sampleReq" type="integer" value="1" min="0"
                   label="Minimum Samples"
                   help="Filter out all the genes that do not meet the minimum
                         CPM in at least this many samples."/>
        </when>

        <when value="no"/>

    </conditional>

    <!-- Sample weighting switch; neither choice needs further parameters. -->
    <conditional name="weightCond">
        <param name="weightOption" type="select" label="Apply sample weights?"
               display="radio" help="Apply weights if outliers are present.">

            <option value="no">No</option>
            <option value="yes">Yes</option>

        </param>

        <when value="no"/>

        <when value="yes"/>
    </conditional>

    <!-- Library size normalisation method forwarded to the script. -->
    <param name="normalisationOption" type="select"
           label="Normalisation Method">

        <option value="TMM">TMM</option>
        <option value="RLE">RLE</option>
        <option value="upperquartile">Upperquartile</option>
        <option value="none">None (Don't normalise)</option>

    </param>

    <!-- Advanced testing options; the command falls back to "BH", 0.05 and 0 when off. -->
    <conditional name="testOpt">
        <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
               help="Enable choices for p-value adjustment method, p-value threshold
                     and log2-fold-change threshold.">
            <option value="no" selected="True">No</option>
            <option value="yes">Yes</option>
        </param>

        <when value="yes">
            <param name="pAdjust" type="select" label="P-Value Adjustment Method.">
                <option value="BH">Benjamini and Hochberg (1995)</option>
                <option value="BY">Benjamini and Yekutieli (2001)</option>
                <option value="holm">Holm (1979)</option>
                <option value="none">None</option>
            </param>

            <param name="pVal" type="float" value="0.05" min="0" max="1"
                   label="Adjusted Threshold"
                   help="Genes below this threshold are considered significant and
                         highlighted in the MA plot. If either BH(1995) or
                         BY(2001) were selected then this value is a
                         false-discovery-rate control. If Holm(1979) was selected
                         then this is an adjusted p-value for family-wise error
                         rate."/>

            <param name="lfc" type="float" value="0" min="0"
                   label="Minimum log2-fold-change Required"
                   help="Genes above this threshold and below the p-value
                         threshold are considered significant and highlighted
                         in the MA plot."/>
        </when>

        <when value="no"/>

    </conditional>

    <!-- <conditional name="wantRda">
        <param name="rdaOption" type="select" label="Output RData?"
               display="radio"
               help="Output all the data R used to construct the plots,
                     can be loaded into R.">

            <option value="no">No</option>
            <option value="yes">Yes</option>

        </param>
    </conditional> -->
</inputs>
|
|
189
|
|
<!-- The single HTML report; the command also receives this dataset's files_path. -->
<outputs>
    <data name="outFile" label="Voom Output" format="html"/>
</outputs>
|
|
193
|
|
194
|
|
195 <help>
|
|
196 .. class:: infomark
|
|
197
|
|
198 **What it does**
|
|
199
|
|
200 Given a matrix of counts and optional information about the genes, this tool
|
|
201 produces plots and tables useful in the analysis of differential gene
|
|
202 expression.
|
|
203
|
|
204 .. class:: warningmark
|
|
205
|
|
206 This tool is dependent on the R packages limma_ and edgeR_ as a part of the
|
|
207 bioconductor project. Please ensure that these packages are installed on the
|
|
208 server running this tool.
|
|
209
|
|
210 -----
|
|
211
|
|
212 **Counts Data:**
|
|
213 A matrix of expression level with rows corresponding to particular genes
|
|
214 and columns corresponding to the feature count in particular samples.
|
|
215 Values must be tab separated and there must be a row for the sample/column
|
|
216 labels and a column for the row/gene labels.
|
|
217
|
|
218 Example::
|
|
219
|
|
220 "GeneID" "Smpl1" "Smpl2" "Smpl3" "Smpl4" "Smpl5"
|
|
221 "27395" 1699 1528 1463 1441 1495
|
|
222 "18777" 1905 1744 1345 1291 1346
|
|
223 "15037" 6 8 4 5 5
|
|
224 "21399" 2099 1974 1574 1519 1654
|
|
225 "58175" 356 312 347 361 346
|
|
226 "10866" 2528 2438 1762 1942 2027
|
|
227 "12421" 2182 2005 1786 1799 1858
|
|
228 "24069" 3 4 2 3 3
|
|
229 "31926" 1337 1380 1004 1102 1000
|
|
230 "71096" 0 0 2 1 6
|
|
231 "59014" 1466 1426 1296 1097 1175
|
|
232 ...
|
|
233
|
|
234 **Gene Annotations:**
|
|
235 Optional input for gene annotations, this can contain more
|
|
236 information about the genes than just an ID number. The annotations will
|
|
237 be available in the top differential expression table.
|
|
238
|
|
239 Example::
|
|
240
|
|
241 "GeneID" "Length" "EntrezID" "Symbols" "GeneName" "Chr"
|
|
242 "11287" "11287" 4681 "11287" "Pzp" "pregnancy zone protein" "6"
|
|
243 "11298" "11298" 1455 "11298" "Aanat" "arylalkylamine N-acetyltransferase" "11"
|
|
244 "11302" "11302" 5743 "11302" "Aatk" "apoptosis-associated tyrosine kinase" "11"
|
|
245 "11303" "11303" 10260 "11303" "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" "4"
|
|
246 "11304" "11304" 7248 "11304" "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" "3"
|
|
247 "11305" "11305" 8061 "11305" "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" "2"
|
|
248 ...
|
|
249
|
|
250 **Factor Name:**
|
|
251 The name of the factor being investigated. This tool currently assumes
|
|
252 that only one factor is of interest.
|
|
253
|
|
254 **Factor Levels:**
|
|
255 The levels of the factor of interest, this must be entered in the same
|
|
256 order as the samples to which the levels correspond as listed in the
|
|
257 columns of the counts matrix.
|
|
258
|
|
259 The values should be separated by commas, and spaces must not be used.
|
|
260
|
|
261 **Contrasts of Interest:**
|
|
262 The contrasts you wish to make between levels.
|
|
263
|
|
264 Common contrasts would be a simple difference between two levels: "Mut-WT"
|
|
265 represents the difference between the mutant and wild type genotypes.
|
|
266
|
|
267 The values should be separated by commas and spaces must not be used.
|
|
268
|
|
269 **Filter Low CPM:**
|
|
270 Option to ignore the genes that do not show significant levels of
|
|
271 expression, this filtering is dependent on two criteria:
|
|
272
|
|
273 * **Minimum CPM:** This is the counts per million that a gene must have in at
|
|
274 least some specified number of samples.
|
|
275
|
|
276 * **Minimum Samples:** This is the number of samples in which the CPM
|
|
277 requirement must be met in order for that gene to be acknowledged.
|
|
278
|
|
279 Only genes that exhibit a CPM greater than the required amount in at least the
|
|
280 number of samples specified will be used for analysis. Care should be taken to
|
|
281 ensure that the sample requirement is appropriate. In the case of an experiment
|
|
282 with two experimental groups each with two members, if there is a change from
|
|
283 insignificant cpm to significant cpm but the sample requirement is set to 3,
|
|
284 then this will cause that gene to fail the criteria. When in doubt simply do not
|
|
285 filter.
|
|
286
|
|
287
|
|
288 **Normalisation Method:**
|
|
289 Option for using different methods to rescale the raw library
|
|
290 size. For more information, see the calcNormFactors section in the edgeR_ user's
|
|
291 manual.
|
|
292
|
|
293 **Apply Sample Weights:**
|
|
294 Option to downweight outlier samples such that their information is still
|
|
295 used in the statistical analysis but their impact is reduced. Use this
|
|
296 whenever significant outliers are present. The MDS plotting tool in this package
|
|
297 is useful for identifying outliers.
|
|
298
|
|
299 **Use Advanced Testing Options?:**
|
|
300 By default error rate for multiple testing is controlled using Benjamini and
|
|
301 Hochberg's false discovery rate control at a threshold value of 0.05. However
|
|
302 there are options to change this to custom values.
|
|
303
|
|
304 * **P-Value Adjustment Method:**
|
|
305 Change the multiple testing control method, the options are BH(1995) and
|
|
306 BY(2001) which are both false discovery rate controls. There is also
|
|
307 Holm(1979) which is a method for family-wise error rate control.
|
|
308
|
|
309 * **Adjusted Threshold:**
|
|
310 Set the threshold for the resulting value of the multiple testing control
|
|
311 method. Only observations whose statistic falls below this value are
|
|
312 considered significant, thus highlighted in the MA plot.
|
|
313
|
|
314 * **Minimum log2-fold-change Required:**
|
|
315 In addition to meeting the requirement for the adjusted statistic for
|
|
316 multiple testing, the observation must have an absolute log2-fold-change
|
|
317 greater than this threshold to be considered significant, thus highlighted
|
|
318 in the MA plot.
|
|
319
|
|
320 -----
|
|
321
|
|
322 **Citations:**
|
|
323
|
|
324 .. class:: infomark
|
|
325
|
|
326 limma
|
|
327
|
|
328 Please cite the paper below for the limma software itself. Please also try
|
|
329 to cite the appropriate methodology articles that describe the statistical
|
|
330 methods implemented in limma, depending on which limma functions you are
|
|
331 using. The methodology articles are listed in Section 2.1 of the limma
|
|
332 User's Guide.
|
|
333
|
|
334 * Smyth, GK (2005). Limma: linear models for microarray data. In:
|
|
335 'Bioinformatics and Computational Biology Solutions using R and
|
|
336 Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
|
|
337 W. Huber (eds), Springer, New York, pages 397-420.
|
|
338
|
|
339 * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:
|
|
340 precision weights unlock linear model analysis tools for
|
|
341 RNA-seq read counts. Genome Biology 15, R29.
|
|
342
|
1
|
343 * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic,
|
|
344 A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights
|
|
345 for microarray data. BMC Bioinformatics 7, Article 261.
|
|
346
|
0
|
347 .. class:: infomark
|
|
348
|
|
349 edgeR
|
|
350
|
|
351 Please cite the first paper for the software itself and the other papers for
|
|
352 the various original statistical methods implemented in edgeR. See
|
|
353 Section 1.2 in the User's Guide for more detail.
|
|
354
|
|
355 * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
|
|
356 package for differential expression analysis of digital gene expression
|
|
357 data. Bioinformatics 26, 139-140
|
|
358
|
|
359 * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
|
|
360 differences in tag abundance. Bioinformatics 23, 2881-2887
|
|
361
|
|
362 * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
|
|
363 binomial dispersion, with applications to SAGE data.
|
|
364 Biostatistics, 9, 321-332
|
|
365
|
|
366 * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
|
|
367 of multifactor RNA-Seq experiments with respect to biological variation.
|
|
368 Nucleic Acids Research 40, 4288-4297
|
|
369
|
1
|
370 Please report problems or suggestions to: su.s@wehi.edu.au
|
0
|
371
|
|
372 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
|
373 .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
|
|
374
|
|
375 </help>
|
|
376 </tool>
|