comparison mageck_test.xml @ 0:b8da4d41aa1d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 71cef018eec5ee7ff7f3853599c027e80e2637fe
author iuc
date Wed, 14 Feb 2018 06:42:55 -0500
parents
children 5e2a28bee02d
comparison
equal deleted inserted replaced
-1:000000000000 0:b8da4d41aa1d
1 <?xml version="1.0"?>
2 <tool id="mageck_test" name="MAGeCKs test" version="@VERSION@" >
3 <description>- given a table of read counts, perform the sgRNA and gene ranking</description>
4 <macros>
5 <import>mageck_macros.xml</import>
6 </macros>
7 <expand macro="requirements" />
8 <expand macro="version" />
9 <command detect_errors="exit_code"><![CDATA[
10 ## Build the mageck test command line from the tool form values.
11 mageck test
12 -k '$count_table'
13 ## Samples are given either as treatment labels/indexes (-t) or as a single day-0 control label.
14 #if $sampleids.sample_select == "treated":
15 -t '$sampleids.treatment_id'
16 #elif $sampleids.sample_select == "control":
17 --day0-label '$sampleids.day0_label'
18 #end if
19 ## Only pass -c when a non-blank control label/index list was entered.
20 #if str($control_id).strip():
21 -c '$control_id'
22 #end if
23 ## Fixed output prefix: the outputs section collects output.* files from the working directory.
24 -n output
25 ## Boolean toggles expand to the option itself (truevalue) or to nothing (falsevalue).
26 $out.normcounts
27 $out.pdfreport
28 ## Advanced options. Optional dataset paths and free-text values are single-quoted so they reach mageck as one argument.
29 --norm-method $adv.norm_method
30 --gene-test-fdr-threshold $adv.fdr_threshold
31 --adjust-method $adv.adjust_method
32 $adv.var_samples
33 --sort-criteria $adv.sort_criteria
34 --remove-zero $adv.remove_zero
35 --gene-lfc-method $adv.lfc_method
36 #if $adv.control_sgrna:
37 --control-sgrna '$adv.control_sgrna'
38 #end if
39 #if $adv.cnv_norm:
40 --cnv-norm '$adv.cnv_norm'
41 #end if
42 #if $adv.cell_line:
43 --cell-line '$adv.cell_line'
44 #end if
45
46 ]]></command>
47 <inputs>
48 <param name="count_table" argument="--count-table" type="data" format="tabular" label="Counts file" help="A tab-separated file of read counts. See Help below for format" />
49 <conditional name="sampleids">
50 <param name="sample_select" type="select" label="Specify Treated samples or Control"
51 help="You can choose to either specify the treated samples or the control">
52 <option value="treated">Treated samples</option>
53 <option value="control">Control sample</option>
54 </param>
55 <when value="treated">
56 <param name="treatment_id" argument="--treatment-id" type="text" label="Treated Sample Labels (or Indexes)" help="If sample label is provided, the labels must match the labels in the first line of the count table, separated by comma (,); for example, HL60.final,KBM7.final. For sample index, 0,2 means the 1st and 3rd samples are treatment experiments. See Help below for a detailed description." />
57 </when>
58 <when value="control">
59 <param name="day0_label" argument="--day0-label" type="text" optional="true" value="" label="Control Sample Label" help="Specify the label for the control sample. For every other sample label, the module will treat it as a treatment condition and compare with control sample (usually day 0 or plasmid)" />
60 </when>
61 </conditional>
62 <param name="control_id" argument="--control-id" type="text" optional="true" label="Control Sample Labels (or Indexes)" help="If sample label is provided, the labels must match the labels in the first line of the count table, separated by comma (,). Default is all the samples not specified in treatment experiments. See Help below for a detailed description." />
63 <!-- Each toggle in this section gates one optional output through its filter in the outputs section -->
64 <section name="out" title="Output Options">
65 <param name="normcounts" argument="--normcounts-to-file" type="boolean" truevalue="--normcounts-to-file" falsevalue="" checked="false" optional="true" label="Output normalized counts file" help="Default: No" />
66 <param name="pdfreport" argument="--pdf-report" type="boolean" truevalue="--pdf-report" falsevalue="" checked="false" optional="true" label="Output PDF report" help="Generate pdf report of the input file. Default: No" />
67 <param name="rscriptOpt" type="boolean" truevalue="True" falsevalue="" checked="false" optional="true" label="Output R script" help="Output the R script used to generate the plots in the pdf report. Default: No" />
68 <param name="out_log" type="boolean" truevalue="True" falsevalue="" checked="false" label="Output logfile" help="This file includes the logging information during the execution. Default: No" />
69 </section>
70 <!-- Every parameter in this section is forwarded to mageck test by the command template -->
71 <section name="adv" title="Advanced Options">
72 <param name="norm_method" argument="--norm-method" type="select" label="Method for normalization"
73 help="If control is specified, the size factor will be estimated using control sgRNAs specified in --control-sgrna option. Default: Median" >
74 <option value="none">None</option>
75 <option value="median" selected="True">Median</option>
76 <option value="total">Total</option>
77 <option value="control">Control</option>
78 </param>
79 <param name="fdr_threshold" argument="--gene-test-fdr-threshold" type="float" value="0.25" min="0" max="1" label="Gene test FDR-adjusted Threshold" help="FDR threshold for gene test. Default: 0.25"/>
80 <param name="adjust_method" argument="--adjust-method" type="select" label="P-Value Adjustment Method" help="Method for sgRNA-level p-value adjustment, including False Discovery Rate (FDR), Holm's method (Holm), or Pounds's method (Pounds). Default: FDR">
81 <option value="fdr" selected="True">FDR</option>
82 <option value="holm">Holm</option>
83 <option value="pounds">Pounds</option>
84 </param>
85 <param name="var_samples" argument="--variance-from-all-samples" type="boolean" truevalue="--variance-from-all-samples" falsevalue="" checked="false" optional="true" label="Estimate the variance from all samples, instead of from only control samples" help="Use this option only if you believe there are relatively few essential sgRNAs or genes between control and treatment samples" />
86 <expand macro="sort_criteria" />
87 <param name="remove_zero" argument="--remove-zero" type="select" label="Remove zero-count sgRNAs" help="Whether to remove zero-count sgRNAs in control and/or treatment experiments {none,control,treatment,both}. Default: None (do not remove those zero-count sgRNAs)" >
88 <option value="none" selected="True">None</option>
89 <option value="control">Control</option>
90 <option value="treatment">Treatment</option>
91 <option value="both">Both</option>
92 </param>
93 <param name="lfc_method" argument="--gene-lfc-method" type="select" label="Gene Log-Fold Change Method." help="Method to calculate gene log fold changes (LFC) from sgRNA LFCs. Available methods include the median/mean of all sgRNAs (median/mean), or the median/mean sgRNAs that are ranked in front of the alpha cutoff in RRA (alphamedian/alphamean), or the sgRNA that has the second strongest LFC (secondbest). In the alphamedian/alphamean case, the number of sgRNAs correspond to the goodsgrna column in the output, and the gene LFC will be set to 0 if no sgRNA is in front of the alpha cutoff. Default: Median. (new since v0.5.5)">
94 <option value="median" selected="True">Median</option>
95 <option value="alphamedian">Alphamedian</option>
96 <option value="mean">Mean</option>
97 <option value="alphamean">Alphamean</option>
98 <option value="secondbest">Secondbest</option>
99 </param>
100 <param name="control_sgrna" argument="--control-sgrna" type="data" format="tabular" optional="true" label="Control sgRNAs file" help="A list of control sgRNAs for normalization and for generating the null distribution of RRA" />
101 <param name="cnv_norm" argument="--cnv-norm" type="data" format="tabular" optional="true" label="CNV profile file" help="A tab-delimited file containing the CNV status for each gene. See Help below for more information and format." />
102 <param name="cell_line" argument="--cell-line" type="text" optional="true" label="Cell line column" help="Name of the column from the CNV profile file to use. See Help below for more information" />
103 </section>
104 </inputs>
105
106 <outputs> <!-- All outputs are collected from the working directory under the fixed prefix set by "-n output" in the command -->
107 <data name="gene_summary" format="tabular" from_work_dir="output.gene_summary.txt" label="${tool.name} on ${on_string}: Gene Summary" />
108 <data name="sgrna_summary" format="tabular" from_work_dir="output.sgrna_summary.txt" label="${tool.name} on ${on_string}: sgRNA Summary" />
109 <data name="log" format="txt" from_work_dir="output.log" label="${tool.name} on ${on_string}: Log"> <!-- logging information, not a table, hence txt -->
110 <filter>out['out_log'] is True</filter>
111 </data>
112 <data name="normcounts" format="tabular" from_work_dir="output.normalized.txt" label="${tool.name} on ${on_string}: sgRNA Normalized Counts">
113 <filter>out['normcounts'] is True</filter>
114 </data>
115 <data name="pdfreport" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: PDF Report">
116 <filter>out['pdfreport'] is True</filter>
117 </data>
118 <data name="rscript" format="txt" from_work_dir="output.R" label="${tool.name} on ${on_string}: RScript">
119 <filter>out['rscriptOpt'] is True</filter>
120 </data>
121 </outputs>
122 <tests>
123 <test><!-- Default run: treated vs. control labels from the demo count table; only the gene and sgRNA summaries are compared -->
124 <param name="count_table" value="demo/demo1/sample.txt" ftype="tabular" />
125 <param name="treatment_id" value="HL60.final,KBM7.final" />
126 <param name="control_id" value="HL60.initial,KBM7.initial" />
127 <output name="gene_summary" file="out.test.gene_summary.txt"/>
128 <output name="sgrna_summary" file="out.test.sgrna_summary.txt"/>
129 </test>
130 <test><!-- Same comparison with every optional output switched on (log, normalized counts, PDF report, R script); the log and the PDF are compared by size only (sim_size) -->
131 <param name="count_table" value="demo/demo1/sample.txt" ftype="tabular" />
132 <param name="treatment_id" value="HL60.final,KBM7.final" />
133 <param name="control_id" value="HL60.initial,KBM7.initial" />
134 <param name="out_log" value="True" />
135 <param name="normcounts" value="True" />
136 <param name="pdfreport" value="True" />
137 <param name="rscriptOpt" value="True" />
138 <output name="gene_summary" file="out.test.gene_summary.txt"/>
139 <output name="sgrna_summary" file="out.test.sgrna_summary.txt"/>
140 <output name="normcounts" file="out.test.normalized.txt"/>
141 <output name="log" file="out.test.log.txt" compare="sim_size"/>
142 <output name="pdfreport" file="out.test.pdf" compare="sim_size"/>
143 <output name="rscript" file="out.test.R" />
144 </test>
145 </tests>
146
147 <help><![CDATA[
148 .. class:: infomark
149
150 **What it does**
151
152 `Model-based Analysis of Genome-wide CRISPR-Cas9 Knockout` (MAGeCK) is a computational tool to identify important genes from the recent genome-scale CRISPR-Cas9 knockout screens (or GeCKO) technology. MAGeCK can be used for prioritizing single-guide RNAs, genes and pathways in genome-scale CRISPR/Cas9 knockout screens. MAGeCK identifies both positively and negatively selected genes simultaneously and reports robust results across different experimental conditions. MAGeCK is developed and maintained by Wei Li and Han Xu from `Prof. Xiaole Shirley Liu's lab`_ at the Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute and Harvard School of Public Health. MAGeCK has been used to identify functional lncRNAs from screens with close to `100% validation rate`_.
153
154 .. _`Model-based Analysis of Genome-wide CRISPR-Cas9 Knockout`: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
155 .. _`100% validation rate`: https://sourceforge.net/p/mageck/wiki/Home/
156 .. _`Prof. Xiaole Shirley Liu's lab`: http://liulab.dfci.harvard.edu/
157
158 -----
159
160 **mageck test**
161
162 This tests and ranks sgRNAs and genes based on the table provided.
163
164 **Inputs**
165
166 **sgRNA count file**
167
168 The input sgRNA count file must be tab-delimited and list the names of the sgRNA, the gene it is targeting, followed by the read counts in each sample. A header line is optional. For example in the studies of T. Wang et al. Science 2014, there are 4 CRISPR screening samples, and they are labeled as: HL60.initial, KBM7.initial, HL60.final, KBM7.final, see below.
169
170 Example:
171
172 ============== ======== ================ ================ ============== ==============
173 **sgRNA** **gene** **HL60.initial** **KBM7.initial** **HL60.final** **KBM7.final**
174 -------------- -------- ---------------- ---------------- -------------- --------------
175 A1CF_m52595977 A1CF 213 274 883 175
176 A1CF_m52596017 A1CF 294 412 1554 1891
177 A1CF_m52596056 A1CF 421 368 566 759
178 A1CF_m52603842 A1CF 274 243 314 855
179 A1CF_m52603847 A1CF 0 50 145 266
180 ============== ======== ================ ================ ============== ==============
181
182 **Sample Labels**
183
184 In the Treatment and Control inputs above, you can use either Sample Label or Sample Index to specify samples. If sample label is used, the labels MUST match the sample labels in the first line of the count table. For example, "HL60.final,KBM7.final".
185 You can also use sample index to specify samples. The index of the sample is the order it appears in the sgRNA read count file, starting from 0. The index is used in the Treatment and Control inputs. In the example above, there are four samples, and the index of each sample is as follows:
186
187 ============ =======
188 *sample* *index*
189 ------------ -------
190 HL60.initial 0
191 KBM7.initial 1
192 HL60.final 2
193 KBM7.final 3
194 ============ =======
195
196 **Control sgRNA file**
197
198 The optional Control sgRNA file is used to generate null distribution when calculating the p values. If this option is not specified, MAGeCK generates the null distribution of RRA scores by assuming all of the genes in the library are non-essential. This approach is sometimes over-conservative, and you can improve this if you know some genes are not essential. By providing the corresponding sgRNA IDs in this option, MAGeCK will have a better estimation of p values. To use this option, you need to prepare a text file specifying the IDs of control sgRNAs, one line for one sgRNA ID.
199
200 -----
201
202 **Outputs**
203
204 **sgRNA Summary file**
205
206 An example of the sgRNA ranking output is as follows:
207
208 ================ ======== ================= =================== ================ ============== ======= =============== =========== ========= ========= =========== ============== =========== =====================
209 **sgrna** **Gene** **control_count** **treatment_count** **control_mean** **treat_mean** **LFC** **control_var** **adj_var** **score** **p.low** **p.high** **p.twosided** **FDR** **high_in_treatment**
210 ---------------- -------- ----------------- ------------------- ---------------- -------------- ------- --------------- ----------- --------- --------- ----------- -------------- ----------- ---------------------
211 INO80B_m74682554 INO80B 0.0/0.0 1220.15/1476.14 0.810860 1348.15 10.70 0.0 19.0767 308.478 1.0 1.11022e-16 2.22044e-16 1.57651e-14 True
212 NHS_p17705966 NHS 1.62172/3.90887 2327.09/1849.95 2.76529 2088.52 9.54 2.61554 68.2450 252.480 1.0 1.11022e-16 2.22044e-16 1.57651e-14 True
213 ================ ======== ================= =================== ================ ============== ======= =============== =========== ========= ========= =========== ============== =========== =====================
214
215 The contents of each column are as follows:
216
217 * **sgrna** sgRNA ID
218 * **Gene** The targeting gene
219 * **control_count** Normalized read counts in control samples
220 * **treatment_count** Normalized read counts in treatment samples
221 * **control_mean** Mean read counts in control samples
222 * **treat_mean** Mean read counts in treatment samples
223 * **LFC** The log fold change of sgRNA
224 * **control_var** The raw variance in control samples
225 * **adj_var** The adjusted variance in control samples
226 * **score** The score of this sgRNA
227 * **p.low** p-value (lower tail)
228 * **p.high** p-value (higher tail)
229 * **p.twosided** p-value (two sided)
230 * **FDR** false discovery rate
231 * **high_in_treatment** Whether the abundance is higher in treatment samples
232
233
234 **Gene Summary file**
235
236 An example of the gene summary output file is as follows:
237
238 ======= ======= ============= =============== =========== ============ ================= =========== ============= =============== =========== ============ ================= ===========
239 **id** **num** **neg|score** **neg|p-value** **neg|fdr** **neg|rank** **neg|goodsgrna** **neg|lfc** **pos|score** **pos|p-value** **pos|fdr** **pos|rank** **pos|goodsgrna** **pos|lfc**
240 ------- ------- ------------- --------------- ----------- ------------ ----------------- ----------- ------------- --------------- ----------- ------------ ----------------- -----------
241 ESPL1 12 6.4327e-10 7.558e-06 7.9e-05 1 11 -2.35 0.99725 0.99981 0.999992 615 0 -0.07
242 RPL18 12 6.4671e-10 7.558e-06 7.9e-05 2 11 -2.12 0.99799 0.99989 0.999992 620 0 -0.32
243 CDK1 12 2.6439e-09 7.558e-06 7.9e-05 3 12 -1.93 1.0 0.99999 0.999992 655 0 -0.12
244 ======= ======= ============= =============== =========== ============ ================= =========== ============= =============== =========== ============ ================= ===========
245
246 The contents of each column are as follows:
247
248 * **id** Gene ID
249 * **num** The number of targeting sgRNAs for each gene
250 * **neg|score** The RRA lo value of this gene in negative selection
251 * **neg|p-value** The raw p-value (using permutation) of this gene in negative selection
252 * **neg|fdr** The false discovery rate of this gene in negative selection
253 * **neg|rank** The ranking of this gene in negative selection
254 * **neg|goodsgrna** The number of "good" sgRNAs, i.e., sgRNAs whose ranking is below the alpha cutoff (determined by the --gene-test-fdr-threshold option), in negative selection.
255 * **neg|lfc** The log fold change of this gene in negative selection
256 * **pos|score** The number of targeting sgRNAs for each gene in positive selection (usually the same as num.neg)
257 * **pos|score** The RRA lo value of this gene in positive selection
258 * **pos|p-value** The raw p-value of this gene in positive selection
259 * **pos|fdr** The false discovery rate of this gene in positive selection
260 * **pos|rank** The ranking of this gene in positive selection
261 * **pos|goodsgrna** The number of "good" sgRNAs, i.e., sgRNAs whose ranking is below the alpha cutoff (determined by the --gene-test-fdr-threshold option), in positive selection.
262 * **pos|lfc** The log fold change of this gene in positive selection
263
264 Genes are ranked by the p.neg field (by default). If you need a ranking by the p.pos, you can use the --sort-criteria option.
265
266 -----
267
268 **More Information**
269
270 **Overview of the MAGeCK algorithm**
271
272 Briefly, read counts from different samples are first median-normalized to adjust for the effect of library sizes and read count distributions. Then the variance of read counts is estimated by sharing information across features, and a negative binomial (NB) model is used to test whether sgRNA abundance differs significantly between treatments and controls. This approach is similar to those used for differential RNA-Seq analysis. We rank sgRNAs based on P-values calculated from the NB model, and use a modified robust ranking aggregation (RRA) algorithm named α-RRA to identify positively or negatively selected genes. More specifically, α-RRA assumes that if a gene has no effect on selection, then sgRNAs targeting this gene should be uniformly distributed across the ranked list of all the sgRNAs. α-RRA ranks genes by comparing the skew in rankings to the uniform null model, and prioritizes genes whose sgRNA rankings are consistently higher than expected. α-RRA calculates the statistical significance of the skew by permutation, and a detailed description of the algorithm is presented in the Materials and methods section of the `MAGeCK paper`_. Finally, MAGeCK reports positively and negatively selected pathways by applying α-RRA to the rankings of genes in a pathway.
273
274 For more information on using MAGeCK, see the `MAGeCK website here`_.
275
276 .. _`MAGeCK paper`: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
277 .. _`MAGeCK website here`: https://sourceforge.net/p/mageck/wiki/QA/#using-mageck
278
279 ]]></help>
280 <expand macro="citations" />
281 </tool>