comparison hairpinTool.xml @ 2:076ca575208f

First commit
author shian_su <registertonysu@gmail.com>
date Fri, 21 Feb 2014 12:52:56 +1100
parents
children 3d04308a99f9
comparison
equal deleted inserted replaced
1:aa02cf19e1b3 2:076ca575208f
1 <tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
2 <!-- One-line summary of the tool. --> <description>
3 Analyse hairpin differential representation using edgeR
4 </description>
5
6 <!-- R packages this tool declares as dependencies. --> <requirements>
7 <requirement type="R-module">edgeR</requirement>
8 <requirement type="R-module">limma</requirement>
9 </requirements>
10
11 <!-- Exit codes in the range "1:" are reported as a fatal "Tool exception". --> <stdio>
12 <exit_code range="1:" level="fatal" description="Tool exception" />
13 </stdio>
14
15 <!-- Cheetah template that builds the argument list passed to hairpinTool.R. Arguments are positional; the #else branches emit default or placeholder values (1 5 37 57, 0 0 0, -Inf, 0), presumably so the script always receives the same number of arguments. TODO confirm against hairpinTool.R. Each FastQ dataset is tagged with a "fastq::" prefix. --> <command interpreter="Rscript">
16 hairpinTool.R $inputOpt.type
17 #if $inputOpt.type=="fastq":
18 #for $fas in $inputOpt.fastq:
19 fastq::$fas.file
20 #end for
21
22 $inputOpt.hairpin
23 $inputOpt.samples
24
25 #if $inputOpt.positions.option=="yes":
26 $inputOpt.positions.barstart
27 $inputOpt.positions.barend
28 $inputOpt.positions.hpstart
29 $inputOpt.positions.hpend
30 #else:
31 1
32 5
33 37
34 57
35 #end if
36 #else:
37 $inputOpt.counts
38 $inputOpt.anno
39 "$inputOpt.factors"
40 0 0 0
41 #end if
42
43 #if $filterCPM.option=="yes":
44 $filterCPM.cpmReq
45 $filterCPM.sampleReq
46 #else:
47 -Inf
48 -Inf
49 #end if
50
51 $fdr
52 $lfc
53 $workMode.mode
54 $outFile
55 $outFile.files_path
56
57 #if $workMode.mode=="classic":
58 "$workMode.pair1"
59 "$workMode.pair2"
60 #else:
61 "$workMode.contrast"
62 $workMode.roast.option
63 #if $workMode.roast.option=="yes":
64 $workMode.roast.hairpinReq
65 $workMode.roast.select.option
66 "$workMode.roast.select.selection"
67 #else:
68 0
69 0
70 0
71 #end if
72 #end if
73 </command>
74
75 <inputs>
76 <!-- Input source: raw FastQ reads plus annotations, or a pre-computed counts table plus annotations. The command template branches on the value of "type". --> <conditional name="inputOpt">
77 <param name="type" type="select" label="Input File Type">
78 <option value="fastq">FastQ File</option>
79 <option value="counts">Table of Counts</option>
80 </param>
81
82 <when value="fastq">
83 <param name="hairpin" type="data" format="tabular"
84 label="Hairpin Annotation"/>
85
86
87 <param name="samples" type="data" format="tabular"
88 label="Sample Annotation"/>
89
90 <!-- min="1": the FastQ branch needs at least one read file. --> <repeat name="fastq" title="FastQ Files" min="1">
91 <param name="file" type="data" format="fastq" label="FastQ File"/>
92 </repeat>
93
94 <conditional name="positions">
95 <param name="option" type="select"
96 label="Specify Barcode and Hairpin Locations?"
97 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
98 <option value="no" selected="True">No</option>
99 <option value="yes">Yes</option>
100 </param>
101
102 <when value="yes">
103 <param name="barstart" type="integer" value="1"
104 label="Barcode Starting Position"/>
105 <param name="barend" type="integer" value="5"
106 label="Barcode Ending Position"/>
107
108 <param name="hpstart" type="integer" value="37"
109 label="Hairpin Starting Position"/>
110
111 <param name="hpend" type="integer" value="57"
112 label="Hairpin Ending Position"/>
113 </when>
114
115 <when value="no"/>
116 </conditional>
117 </when>
118
119 <when value="counts">
120 <param name="counts" type="data" format="tabular" label="Counts Table"/>
121 <param name="anno" type="data" format="tabular"
122 label="Hairpin Annotation"/>
123 <param name="factors" type="data" format="tabular"
124 label="Sample Annotation"/>
125 </when>
126 </conditional>
127
128 <!-- Optional low-representation filter. When set to "no" the command template passes -Inf for both thresholds. --> <conditional name="filterCPM">
129 <param name="option" type="select" label="Filter Low CPM?"
130 help="Ignore hairpins with very low representation when performing
131 analysis.">
132 <option value="yes">Yes</option>
133 <option value="no">No</option>
134 </param>
135
136 <when value="yes">
137 <!-- No upper bound: counts per million is not a proportion, so thresholds above 1 are valid. --> <param name="cpmReq" type="float" value="0.5" min="0"
138 label="Minimum CPM"/>
139
140 <param name="sampleReq" type="integer" value="1" min="0"
141 label="Minimum Samples"
142 help="Filter out all the hairpins that do not meet the minimum
143 CPM in at least this many samples."/>
144 </when>
145
146 <when value="no"/>
147
148 </conditional>
149
150 <!-- Analysis mode: "classic" takes two group names to compare; "glm" takes contrast equations and can optionally run a gene level analysis with a choice of which genes to plot. --> <conditional name="workMode">
151 <param name="mode" type="select" label="Analysis Type"
152 help="Classic Exact Tests are useful for simple comparisons across
153 two sampling groups. Generalised linear models allow for more
154 complex contrasts and gene level analysis to be made.">
155 <option value="classic">Classic Exact Test</option>
156 <option value="glm">Generalised Linear Model</option>
157 </param>
158
159 <when value="classic">
160 <param name="pair1" type="text" label="Compare" size="40"/>
161 <param name="pair2" type="text" label="To" size="40"
162 help="The analysis will subtract values of this group from those
163 in the group above to establish the difference."/>
164 </when>
165
166 <when value="glm">
167 <param name="contrast" type="text" size="60"
168 label="Contrasts of interest"
169 help="Specify equations defining contrasts to be made. Eg.
170 KD-Control will result in positive fold change if KD has
171 greater expression and negative if Control has greater
172 expression."/>
173
174 <conditional name="roast">
175 <param name="option" type="select"
176 label="Perform Gene Level Analysis?"
177 help="Analyse LogFC tendencies for hairpins belonging
178 to the same gene.">
179 <option value="no">No</option>
180 <option value="yes">Yes</option>
181 </param>
182
183 <when value="yes">
184 <param name="hairpinReq" type="integer" value="2" min="2"
185 label="Minimum Hairpins"
186 help="Only genes with at least this many hairpins will
187 be analysed."/>
188
189 <conditional name="select">
190 <param name="option" type="select"
191 label="Gene Selection Method">
192 <option value="rank">By p-value Rank</option>
193 <option value="geneID">By Gene Identifier</option>
194 </param>
195 <when value="rank">
196 <param name="selection" type="text" size="40" value="1:5"
197 label="Ranks of Top Genes to Plot"
198 help="Genes are ranked in ascending p-value for
199 differential representation, individual ranks can
200 be entered separated by commas or a range separated
201 by a colon."/>
202 </when>
203 <when value="geneID">
204 <param name="selection" type="text" size="80" value=""
205 label="Symbols of Genes to Plot"
206 help="Select genes based on their identifier in the
207 'Gene' column of the hairpin annotation file.
208 Please ensure exact match with the values in input
209 file and separate selections with commas."/>
210 </when>
211 </conditional>
212
213
214 </when>
215
216 <when value="no"/>
217 </conditional>
218 </when>
219 </conditional>
220
221 <!-- Highlighting thresholds for the smear plot; passed to the script as $fdr and $lfc. --> <param name="fdr" type="float" value="0.05" min="0" max="1"
222 label="FDR Threshold"
223 help="All observations below this threshold will be highlighted
224 in the smear plot."/>
225 <param name="lfc" type="float" value="0" min="0"
226 label="Absolute LogFC Threshold"
227 help="In addition to meeting the FDR requirement, the absolute
228 value of the log-fold-change of the observation must be above
229 this threshold to be highlighted."/>
230 </inputs>
231
232 <!-- Single HTML output; the command template passes both $outFile and $outFile.files_path to the script. --> <outputs>
233 <data format="html" name="outFile" label="shRNAseq Analysis"/>
234 </outputs>
235
236 <help>
237 .. class:: infomark
238
239 **What it does**
240
241 Given tables containing information about the hairpins and their associated
242 barcodes, information about the samples and FastQ files containing the hairpin
243 reads, this tool will generate plots and tables for the analysis of differential
244 representation.
245
246 -----
247
248 .. class:: infomark
249
250 **INPUTS**
251
252 **Input File Type:**
253
254 This tool is able to generate counts from raw FastQ files given the
255 information regarding the samples and hairpins. Alternatively, if a table of
256 counts has already been generated, it can be used directly instead.
257
258 **Counts Table (Counts Input):**
259
260 A tab delimited text table of information regarding the counts of hairpins.
261 Should have a column 'ID' to denote the hairpins that counts correspond to. Each
262 additional column should have titles corresponding to the label for the sample.
263
264 Example::
265
266 ID Sample1 Sample2 Sample3
267 Control1 49802 48014 40148
268 Control2 12441 16352 14232
269 Control3 9842 9148 9111
270 Hairpin1 3300 3418 2914
271 Hairpin2 91418 95812 93174
272 Hairpin3 32985 31975 35104
273 Hairpin4 12082 14081 14981
274 Hairpin5 2491 2769 2691
275 Hairpin6 1294 1486 1642
276 Hairpin7 49501 49076 47611
277 ...
278
279 **Hairpin Annotation:**
280
281 A tab delimited text table of information regarding the hairpins. Should have
282 columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
283 with the reads to produce counts and identify which gene the hairpin acts on.
284
285 NOTE: the column names are case sensitive and should be input exactly as they
286 are shown here.
287
288 Example::
289
290 ID Sequences Gene
291 Control1 TCTCGCTTGGGCGAGAGTAAG 2
292 Control2 CCGCCTGAAGTCTCTGATTAA 2
293 Control3 AGGAATTATAATGCTTATCTA 2
294 Hairpin1 AAGGCAGAGACTGACCACCTA 4
295 Hairpin2 GAGCGACCTGGTGTTACTCTA 4
296 Hairpin3 ATGGTGTAAATAGAGCTGTTA 4
297 Hairpin4 CAGCTCATCTTCTGTGAAGAA 4
298 Hairpin5 CAGCTCTGTGGGTCAGAAGAA 4
299 Hairpin6 CCAGGCACAGATCTCAAGATA 4
300 Hairpin7 ATGACAAGAAAGACATCTCAA 7
301 ...
302
303 **Sample Annotation (FastQ Input):**
304
305 A tab delimited text table of information regarding the samples. Should have
306 columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
307 the sample in the reads by its barcode sequence and correctly group replicates
308 for analysis. Additional columns may be inserted for annotation purposes and will
309 not interfere with analysis as long as the necessary columns are present.
310
311 NOTE: the column names are case sensitive and should be input exactly as they
312 are shown here.
313
314 Example::
315
316 ID Sequences group Replicate
317 3 GAAAG Day 2 1
318 6 GAACC Day 10 1
319 9 GAAGA Day 5 GFP neg 1
320 16 GAATT Day 5 GFP pos 1
321 18 GACAC Day 2 2
322 21 GACCA Day 10 2
323 28 GACGT Day 5 GFP neg 2
324 31 GACTG Day 5 GFP pos 2
325 33 GAGAA Day 2 3
326 40 GAGCT Day 10 3
327 ...
328
329 **Specify Barcode and Hairpin Locations (FastQ Input):**
330
331 It is assumed that in the sequencing reads the first 5 bases are the
332 barcodes and that bases 37-57 are the hairpins. If this is not the case then the
333 values of the positions can be changed, however it still requires the barcodes
334 and hairpins to be in a consistent location and in a continuous sequence.
335
336 **Filter Low CPM?:**
337
338 Often in a large screen there may be members with very low counts which are of no
339 interest in the experiment; these may be filtered out to speed up computations.
340 Filtering will be based on counts per million in a required number of samples.
341
342 **Analysis Type:**
343
344 * **Classic Exact Test:** This allows two experimental groups to be compared and
345 p-values for differential representation derived for each hairpin. Simple and
346 fast for straightforward comparisons. In this option you will have the option of
347 "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
348 to produce the comparison.
349
350 * **Generalised Linear Model:** This allows for complex contrasts to be specified
351 and also gene level analysis to be performed. If this option is chosen then
352 contrasts must be explicitly stated in equations and multiple contrasts can be
353 made. In addition there will be the option to analyse hairpins on a per-gene
354 basis to see if hairpins belonging to a particular gene have any overall
355 tendencies for the direction of their log-fold-change.
356
357 **FDR Threshold:**
358 The smear plot in the output will have hairpins highlighted to signify
359 significant differential representation. The significance is determined by
360 controlling the false discovery rate, only those with a FDR lower than the
361 threshold will be highlighted in the plot.
362
363 -----
364
365 **Citations:**
366
367 .. class:: infomark
368
369 limma
370
371 Please cite the paper below for the limma software itself. Please also try
372 to cite the appropriate methodology articles that describe the statistical
373 methods implemented in limma, depending on which limma functions you are
374 using. The methodology articles are listed in Section 2.1 of the limma
375 User's Guide.
376
377 * Smyth, GK (2005). Limma: linear models for microarray data. In:
378 'Bioinformatics and Computational Biology Solutions using R and
379 Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
380 W. Huber (eds), Springer, New York, pages 397-420.
381
382 .. class:: infomark
383
384 edgeR
385
386 Please cite the first paper for the software itself and the other papers for
387 the various original statistical methods implemented in edgeR. See
388 Section 1.2 in the User's Guide for more detail.
389
390 * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
391 package for differential expression analysis of digital gene expression
392 data. Bioinformatics 26, 139-140
393
394 * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
395 differences in tag abundance. Bioinformatics 23, 2881-2887
396
397 * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
398 binomial dispersion, with applications to SAGE data.
399 Biostatistics, 9, 321-332
400
401 * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
402 of multifactor RNA-Seq experiments with respect to biological variation.
403 Nucleic Acids Research 40, 4288-4297
404
405 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
406 .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
407 </help>
408 </tool>
409