comparison mageck_count.xml @ 0:b80c0e046539 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 71cef018eec5ee7ff7f3853599c027e80e2637fe
author iuc
date Wed, 14 Feb 2018 06:42:18 -0500
parents
children 4d72d204dcfa
comparison
equal deleted inserted replaced
-1:000000000000 0:b80c0e046539
1 <?xml version="1.0"?>
2 <tool id="mageck_count" name="MAGeCK count" version="@VERSION@" >
3 <description>- collect sgRNA read counts from read mapping files</description>
4 <macros>
5 <import>mageck_macros.xml</import>
6 </macros>
7 <expand macro="requirements">
8 <requirement type="package" version="9.22">ghostscript</requirement>
9 </expand>
10 <expand macro="version" />
11 <command detect_errors="exit_code"><![CDATA[
12 ## Reads mode: symlink the selected dataset to a fixed name whose extension reflects its datatype.
13 #if str($reads.format_select) == "files":
14
15 #if $reads.sample.is_of_type('fastq.gz', 'fastqsanger.gz'):
16 ln -s '${reads.sample}' 'input.gz' &&
17 #set $infile = 'input.gz'
18 #elif $reads.sample.is_of_type('fastq'):
19 ln -s '${reads.sample}' 'input.fastq' &&
20 #set $infile = 'input.fastq'
21 #elif $reads.sample.is_of_type('bam'):
22 ln -s '${reads.sample}' 'input.bam' &&
23 #set $infile = 'input.bam'
24 #end if
25
26 #end if
27 ## Assemble the mageck count call; the input options depend on the selected input mode.
28 mageck count
29
30 #if str($reads.format_select) == "files":
31 --fastq $infile
32 -l '$reads.sgrna_library_file'
33 #if $reads.sample_label:
34 --sample-label '$reads.sample_label'
35 #end if
36 ## Count-table mode: the library file is optional here, so -l is passed only when a dataset was supplied.
37 #elif str($reads.format_select) == "table":
38 -k '$reads.counts'
39 #if $reads.sgrna_library_file:
40 -l '$reads.sgrna_library_file'
41 #end if
42
43 #end if
44
45 -n output
46 ## --keep-tmp accompanies the PDF report, presumably so the intermediate PDFs remain for the gs merge below (confirm).
47 #if $out.pdfreportOpt:
48 $out.pdfreportOpt
49 --keep-tmp
50 #end if
51
52 $out.unmappedOpt
53
54 #if $adv.trim5:
55 --trim-5 $adv.trim5
56 #end if
57 --norm-method $adv.norm_method
58 #if $adv.control_sgrna:
59 --control-sgrna '$adv.control_sgrna'
60 #end if
61 --sgrna-len $adv.sgrna_len
62 $adv.count_n
63 $adv.reverse_complement
64 $adv.test_run
65
66 #if $adv.gmt_file:
67 --gmt-file '$adv.gmt_file'
68 #end if
69 ## When a PDF report was requested, merge every PDF in the working directory into merged.pdf with Ghostscript.
70 #if $out.pdfreportOpt:
71 &&
72 gs -dBATCH -dNOPAUSE -q -dPDFSETTINGS=/prepress -sDEVICE=pdfwrite -sOutputFile=merged.pdf *.pdf
73 #end if
74 ]]></command>
75 <inputs>
76 <conditional name="reads">
77 <param name="format_select" type="select" label="Reads Files or Count Table?" help="You can choose to input either separate files of reads (one per sample) or a single count table">
78 <option value="files">Separate Reads files</option>
79 <option value="table">Single Count table</option>
80 </param>
81 <when value="files">
82 <param name="sample" argument="--fastq" type="data" format="fastq,fastq.gz,bam" multiple="false" label="Sample reads" help="The input reads must be in FASTQ, FASTQ.GZ or BAM format and all files must be in the same format." />
83 <param name="sgrna_library_file" type="data" argument="--list-seq" format="txt,tabular,tsv,csv" label="sgRNA library file" help="A library file must be provided with three columns containing the sgRNA ID, sequence, and gene it is targeting, see Help below for more information." />
84 <param name="sample_label" argument="--sample-label" type="text" optional="true" value="" label="Sample label" help="Optionally, you can specify a sample label to use in the output file header."/>
85 </when>
86 <when value="table">
87 <param name="counts" argument="-k" type="data" format="tabular" optional="true" label="Counts Table" help="Alternatively, a tab-separated file of read counts can be used as input. See Help below for format" />
88 <param name="sgrna_library_file" type="data" argument="--list-seq" format="txt,tabular,tsv,csv" optional="True" label="sgRNA library file" help="Optionally, a library file can be provided with three columns containing the sgRNA ID, sequence, and gene it is targeting, see Help below for more information." />
89 </when>
90 </conditional>
91
92 <section name="out" title="Output Options">
93 <param name="countsummaryOpt" type="boolean" truevalue="True" falsevalue="" checked="false" optional="true" label="Output summary statistics" help="Output summary statistics of the fastq files. Default: No" />
94 <param name="pdfreportOpt" argument="--pdf-report" type="boolean" truevalue="--pdf-report" falsevalue="" checked="false" optional="true" label="Output PDF report" help="Generate pdf report of the input file. Default: No" />
95 <param name="unmappedOpt" argument="--unmapped-to-file" type="boolean" truevalue="--unmapped-to-file" falsevalue="" checked="false" optional="true" label="Output unmapped reads" help="Save unmapped reads to file. Default: No" />
96 <param name="rscriptOpt" type="boolean" truevalue="True" falsevalue="" checked="false" optional="true" label="Output R script" help="Output the R script used to generate the plots in the pdf report. Default: No" />
97 <param name="logOpt" type="boolean" truevalue="True" falsevalue="" checked="false" label="Output logfile" help="This file includes the logging information, it will list some basic statistics of the dataset at the end" />
98 </section>
99
100 <section name="adv" title="Advanced Options">
101 <param name="gmt_file" argument="--gmt-file" type="data" format="tabular" optional="true" value="" label="Pathway file for QC" help="The pathway file used for QC, in GMT format. By default it will use the GMT file provided by MAGeCK" />
102 <param name="trim5" argument="--trim-5" type="integer" min="0" optional="true" label="5' Trim length" help="Length of trimming the 5' of the reads. Default 0" />
103 <param name="norm_method" argument="--norm-method" type="select" label="Method for normalization" help="Methods include: None (no normalization), Median (median normalization), Total (normalization by total read counts), Control (normalization by control sgRNAs specified by the --control-sgrna option). Default: Median" >
104 <option value="none">None</option>
105 <option value="median" selected="True">Median</option>
106 <option value="total">Total</option>
107 <option value="control">Control</option>
108 </param>
109 <param name="control_sgrna" argument="--control-sgrna" type="data" format="tabular" optional="true" label="Control sgRNAs file" help="A file of control sgRNA IDs for normalization and for generating the null distribution of RRA" />
110 <param name="sgrna_len" argument="--sgrna-len" type="integer" min="0" value="20" optional="true" label="Length of the sgRNA" help="The program will automatically determine the sgRNA length from the library file, so only use this if you turn on the --unmapped-to-file option. Default: 20" />
111 <param name="count_n" argument="--count-n" type="boolean" truevalue="--count-n" falsevalue="" checked="false" optional="true" label="Count sgRNAs with Ns" help="By default, sgRNAs containing Ns will be discarded" />
112 <param name="reverse_complement" argument="--reverse-complement" type="boolean" truevalue="--reverse-complement" falsevalue="" checked="false" optional="true" label="Reverse complement the sequences in library for read mapping" />
113 <param name="test_run" argument="--test-run" type="boolean" truevalue="--test-run" falsevalue="" checked="false" optional="true" label="Test running" help="If this option is on, MAGeCK will only process the first 1M records for each file" />
114 </section>
115 </inputs>
116
117 <outputs>
118 <data name="counts" format="tabular" from_work_dir="*.count.txt" label="${tool.name} on ${on_string}: sgRNA Counts" />
119 <data name="countsummary" format="tabular" from_work_dir="*.countsummary.txt" label="${tool.name} on ${on_string}: sgRNA Count Summary" >
120 <filter>out['countsummaryOpt'] is True</filter>
121 </data>
122 <data name="pdfreport" format="pdf" from_work_dir="merged.pdf" label="${tool.name} on ${on_string}: PDF Report" >
123 <filter>out['pdfreportOpt'] is True</filter>
124 </data>
125 <data name="unmapped" format="tabular" from_work_dir="*.unmapped.txt" label="${tool.name} on ${on_string}: Unmapped" >
126 <filter>out['unmappedOpt'] is True</filter>
127 </data>
128 <data name="log" format="txt" from_work_dir="*.log" label="${tool.name} on ${on_string}: Log" >
129 <filter>out['logOpt'] is True</filter>
130 </data>
131 </outputs>
132
133 <tests>
134 <!-- Ensure fastq works -->
135 <test expect_num_outputs="1">
136 <param name="sgrna_library_file" value="demo/demo2/library.txt" ftype="tabular" />
137 <param name="format_select" value="files" />
138 <param name="sample" value="demo/demo2/test1.fastq" ftype="fastq"/>
139 <output name="counts" file="out.count.fastq.txt"/>
140 </test>
141 <!-- Ensure fastq.gz input works -->
142 <test expect_num_outputs="1">
143 <param name="sgrna_library_file" value="demo/demo2/library.txt" ftype="tabular" />
144 <param name="format_select" value="files" />
145 <param name="sample" value="test1.fastq.gz" ftype="fastq.gz"/>
146 <output name="counts" file="out.count.fastq.txt"/>
147 </test>
148 <!-- Ensure BAM input works -->
149 <test expect_num_outputs="1">
150 <param name="sgrna_library_file" value="demo/demo2/library.txt" ftype="tabular" />
151 <param name="format_select" value="files" />
152 <param name="sample" value="test1.bam" ftype="bam"/>
153 <output name="counts" file="out.count.bam.txt"/>
154 </test>
155 <!-- Ensure optional outputs work -->
156 <test expect_num_outputs="5">
157 <param name="sgrna_library_file" value="demo/demo2/library.txt" ftype="tabular" />
158 <param name="format_select" value="files" />
159 <param name="sample" value="test1.fastq.gz" ftype="fastq.gz"/>
160 <param name="countsummaryOpt" value="True" />
161 <param name="unmappedOpt" value="True" />
162 <param name="pdfreportOpt" value="True" />
163 <param name="rscriptOpt" value="True" />
164 <param name="logOpt" value="True" />
165 <output name="counts" file="out.count.fastq.txt"/>
166 <output name="countsummary" file="out.countsummary.txt" compare="sim_size"/>
167 <output name="log" file="out.count.log.txt" compare="sim_size"/>
168 <output name="unmapped" file="out.count.unmapped.txt" />
169 <output name="pdfreport" file="out.countsummary.pdf" compare="sim_size" />
170 </test>
171 </tests>
172
173 <help><![CDATA[
174 .. class:: infomark
175
176 **What it does**
177
178 Model-based Analysis of Genome-wide CRISPR-Cas9 Knockout (MAGeCK_) is a computational tool to identify important genes from the recent genome-scale CRISPR-Cas9 knockout screens (or GeCKO) technology. MAGeCK can be used for prioritizing single-guide RNAs, genes and pathways in genome-scale CRISPR/Cas9 knockout screens. MAGeCK identifies both positively and negatively selected genes simultaneously and reports robust results across different experimental conditions. MAGeCK is developed and maintained by Wei Li and Han Xu from `Prof. Xiaole Shirley Liu's lab`_ at the Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute and Harvard School of Public Health. MAGeCK has been used to identify functional lncRNAs from screens with close to `100% validation rate`_.
179
180 -----
181
182 **Inputs**
183
184 By default, the MAGeCK count command automatically determines the trimming length of the fastq file.
185
186 **sgRNA library file**
187
188 When starting from FASTQ, FASTQ.GZ or BAM files, MAGeCK needs to know the sgRNA sequences and targeting genes. Such information is provided in the sgRNA library file and can be specified in the tool form above. The sgRNA library file can be provided in .tsv or .csv format. There are three columns in the library file: the sgRNA ID, the sequence, and the gene it is targeting.
189
190 Example:
191
192 ============ ==================== ========
193 **sgRNA ID** **Sequence** **Gene**
194 ------------ -------------------- --------
195 s_10007 TGTTCACAGTATAGTTTGCC CCNA1
196 s_10008 TTCTCCCTAATTGCTTGCTG CCNA1
197 s_10027 ACATGTTGCTTCCCCTTGCA CCNC
198 ============ ==================== ========
199
200 **Control sgRNA file**
201
202 The optional Control sgRNAs file is used to generate the null distribution when calculating the p values. If this option is not specified, MAGeCK generates the null distribution of RRA scores by assuming all of the genes in the library are non-essential, see **More Information** below. This approach is sometimes over-conservative, and you can improve this if you know some genes are not essential. By providing the corresponding sgRNA IDs in this option, MAGeCK will have a better estimation of p values. To use this option, you need to prepare a text file specifying the IDs of control sgRNAs, with one sgRNA ID per line.
203
204 -----
205
206 **Outputs**
207
208 **sgRNA Count file**
209
210 An example of the sgRNA count output file is shown below. This file can be used with **MAGeCK test**.
211
212 Example:
213
214 ============== ======== ================
215 **sgRNA** **Gene** **Sample Label**
216 -------------- -------- ----------------
217 A1CF_m52595977 A1CF 213
218 A1CF_m52596017 A1CF 294
219 A1CF_m52596056 A1CF 421
220 A1CF_m52603842 A1CF 274
221 A1CF_m52603847 A1CF 0
222 ============== ======== ================
223
224
225 **Count Summary**
226
227 MAGeCK can produce a **Count Summary** file containing statistics of the input file (the statistics of fastq file are also in the PDF report). An example count summary file is shown below.
228
229 Example:
230
231 ========== ===== ===== ====== ========== =========== ========== ========= ======== ============ ======================= ========================== ============
232 File Label Reads Mapped Percentage TotalsgRNAs Zerocounts GiniIndex NegSelQC NegSelQCPval NegSelQCPvalPermutation NegSelQCPvalPermutationFDR NegSelQCGene
233 ========== ===== ===== ====== ========== =========== ========== ========= ======== ============ ======================= ========================== ============
234 InputFile1 L1 2500 1453 0.5812 2550 1276 0.5267 0 1 1 1 0.0
235 ========== ===== ===== ====== ========== =========== ========== ========= ======== ============ ======================= ========================== ============
236
237 -----
238
239 **More Information**
240
241 **Overview of the MAGeCK algorithm**
242
243 Briefly, read counts from different samples are first median-normalized to adjust for the effect of library sizes and read count distributions. Then the variance of read counts is estimated by sharing information across features, and a negative binomial (NB) model is used to test whether sgRNA abundance differs significantly between treatments and controls. This approach is similar to those used for differential RNA-Seq analysis. We rank sgRNAs based on P-values calculated from the NB model, and use a modified robust ranking aggregation (RRA) algorithm named α-RRA to identify positively or negatively selected genes. More specifically, α-RRA assumes that if a gene has no effect on selection, then sgRNAs targeting this gene should be uniformly distributed across the ranked list of all the sgRNAs. α-RRA ranks genes by comparing the skew in rankings to the uniform null model, and prioritizes genes whose sgRNA rankings are consistently higher than expected. α-RRA calculates the statistical significance of the skew by permutation, and a detailed description of the algorithm is presented in the Materials and methods section of the MAGeCK paper. Finally, MAGeCK reports positively and negatively selected pathways by applying α-RRA to the rankings of genes in a pathway.
244
245 **MAGeCK FAQs**
246
247 **The 5' trim length option can only trim a fixed length of nucleotides before sgRNA, but what if the trimming length is different in different reads?**
248 MAGeCK can automatically determine the trimming length, even when the length differs between reads within the same fastq file.
249 Alternatively, you can use **cutadapt** to trim the adaptor sequences of variable length before running MAGeCK.
250
251 **How do I get the simple statistics of my input files?**
252 MAGeCK produces a **Count Summary** file containing the statistics of the input files, the statistics are also in the PDF report. The statistics can also be found in the log file for **MAGeCK** count.
253
254 **How do I know the quality of my samples?**
255 For simple QC terms, you can just take a look at the sample statistics. Generally in a good negative selection sample:
256
257 #. the mapped reads should be over 60 percent of the total number of reads
258 #. the number of zero-count sgRNAs should be few (<5%, and preferably <1%). One exception is in positive selection experiments, where the number of zero-count sgRNAs may be much higher, but the percentage of mapped reads should be reasonably high.
259
260 For more information on using MAGeCK, see the `MAGeCK website here`_.
261
262 .. _MAGeCK: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
263 .. _`100% validation rate`: https://sourceforge.net/p/mageck/wiki/Home/
264 .. _`Prof. Xiaole Shirley Liu's lab`: http://liulab.dfci.harvard.edu/
265 .. _`MAGeCK website here`: https://sourceforge.net/p/mageck/wiki/QA/#using-mageck
266
267 ]]></help>
268 <expand macro="citations" />
269 </tool>