comparison genrich.xml @ 0:a41d96fc0b20 draft

planemo upload for repository https://github.com/jsh58/Genrich commit 38aa99ebf650c22a1c4965f6f008882aea7033ba
author iuc
date Mon, 15 Jul 2019 09:43:27 -0400
parents
children db50f51a2952
comparison
equal deleted inserted replaced
-1:000000000000 0:a41d96fc0b20
1 <tool id="genrich" name="Genrich" version="0.5">
2 <description>Detecting sites of genomic enrichment</description>
3 <requirements> <!-- packages requested for the job environment -->
4 <requirement version="0.5" type="package">genrich</requirement>
5 <requirement version="1.9" type="package">samtools</requirement>
6 </requirements>
7 <!-- Galaxy records the output of the command below as the version of the underlying tool. -->
8 <version_command>Genrich --version</version_command>
9
10 <command detect_errors="exit_code"><![CDATA[
11 ## Builds one Genrich call. Everything Genrich prints is collected in a log file that is replayed on stdout afterwards.
12 #set $file_stderr = 'genrich_stderr'
13
14 Genrich
15
16 ###########
17 ## Input ##
18 ###########
19
20 ## Treatment File(s): several files are joined with commas, the separator Genrich expects inside a single -t value
21 #if str($treatment.t_multi_select) == "Yes":
22 -t ${ ','.join( [ "'%s'" % $x for $x in $treatment.input_treatment_file] ) }
23 #else
24 -t '$treatment.input_treatment_file'
25 #end if
26
27 ## Control File(s): same comma-joined form as the treatment files
28 #if str($control.c_select) == "Yes":
29 #if str($control.c_multiple.c_multi_select) == "Yes":
30 -c ${ ','.join( [ "'%s'" % $x for $x in $control.c_multiple.input_control_file] ) }
31 #else
32 -c '$control.c_multiple.input_control_file'
33 #end if
34 #end if
35
36 ####################
37 ## Filter Options ##
38 ####################
39 ## The boolean renders as two flags (-r -R); it must stay unquoted so the shell passes them as separate arguments.
40 #if $filter_options.duplicates:
41 $filter_options.duplicates '${out_dups}'
42 #end if
43
44 #if $filter_options.exclude_chr:
45 -e '$filter_options.exclude_chr'
46 #end if
47
48 #if str($cond_exclude.exclude_select) == "Yes":
49 -E '$cond_exclude.erf'
50 #end if
51
52 -m $filter_options.min_mapq
53 -s $filter_options.alignment_score
54 $filter_options.unpaired
55
56 #if $filter_options.alignment_lengths:
57 -w $filter_options.alignment_lengths
58 #end if
59
60 $filter_options.alignment_lengths2
61
62 ##################
63 ## ATAC Options ##
64 ##################
65
66 $atac_options.atac
67 -d $atac_options.expand_sites
68
69 #########################
70 ## Peakcalling Options ##
71 #########################
72
73 -q $peakcalling_options.max_q
74
75 #if $peakcalling_options.max_p:
76 -p $peakcalling_options.max_p
77 #end if
78
79 -a $peakcalling_options.min_auc
80 -l $peakcalling_options.min_peak_length
81 -g $peakcalling_options.max_dist
82
83 ###################
84 ## Other Options ##
85 ###################
86
87 $other_options.skip_peak_calling
88 -v
89
90 ####################
91 ## Output Options ##
92 ####################
93
94 #if $output_options.bedgraph1:
95 -f '${out_bedgraph1}'
96 #end if
97
98 #if $output_options.bedgraph2:
99 -k '${out_bedgraph2}'
100 #end if
101
102 #if $output_options.bed:
103 -b '${out_bed}'
104 #end if
105
106 -o '${outfile}'
107 ## Send stdout and stderr to the log, replay the log even when Genrich fails, then exit with the status Genrich returned.
108 > $file_stderr 2>&1 ;
109 exit_code_for_galaxy=\$? ;
110 cat $file_stderr ;
111 exit \$exit_code_for_galaxy
112
113 ]]></command>
114 <inputs>
115 <conditional name="treatment"> <!-- one treatment BAM, or several handed together to -t -->
116 <param type="select" name="t_multi_select" label="Are you pooling Treatment Files?" help="For more information, see Help section below">
117 <option selected="True" value="No">No</option>
118 <option value="Yes">Yes</option>
119 </param>
120 <when value="No">
121 <param type="data" name="input_treatment_file" argument="-t" format="qname_sorted.bam" label="Treatment File(s)"/>
122 </when>
123 <when value="Yes">
124 <param type="data" name="input_treatment_file" argument="-t" format="qname_sorted.bam" multiple="true" label="Treatment Files"/>
125 </when>
126 </conditional>
127
128 <conditional name="control"> <!-- control input is optional; the nested conditional picks one file or several -->
129 <param type="select" name="c_select" label="Do you have a Control File?">
130 <option value="Yes">Yes</option>
131 <option selected="True" value="No">No</option>
132 </param>
133 <when value="Yes">
134 <conditional name="c_multiple">
135 <param type="select" name="c_multi_select" label="Are you pooling Control Files?" help="For more information, see Help section below">
136 <option selected="True" value="No">No</option>
137 <option value="Yes">Yes</option>
138 </param>
139 <when value="No">
140 <param type="data" name="input_control_file" argument="-c" format="qname_sorted.bam" label="Control File(s)"/>
141 </when>
142 <when value="Yes">
143 <param type="data" name="input_control_file" argument="-c" format="qname_sorted.bam" multiple="true" label="Control Files"/>
144 </when>
145 </conditional>
146 </when>
147 <when value="No"/>
148 </conditional>
149
150 <!-- Filter Options: optional BED file of genomic regions to exclude (-E) -->
151 <conditional name="cond_exclude">
152 <param type="select" name="exclude_select" label="Do you have a BED file of genomic regions to exclude?" help="Input BED file of genomic regions to exclude.">
153 <option selected="True" value="No">No</option>
154 <option value="Yes">Yes</option>
155 </param>
156 <when value="No"/>
157 <when value="Yes">
158 <param type="data" name="erf" argument="-E" format="bed" label="BED File"/>
159 </when>
160 </conditional>
161
162 <section name="filter_options" title="Filter Options"> <!-- alignment filtering; each parameter maps to one Genrich option -->
163 <param name="duplicates" argument="-r" type="boolean" value="False" truevalue="-r -R" falsevalue="" label="Remove PCR duplicates" help="In this process, it analyzes reads/fragments based on their alignments, in three separate groups (proper pairs, discordant pairs, and singletons), and removes those identified as duplicates from further analysis. One novel feature is that this evaluation takes into account reads/fragments with multiple alignments."/> <!-- truevalue carries two flags: -r turns removal on, -R takes the duplicates log path that the command appends -->
164 <param name="exclude_chr" argument="-e" type="text" optional="True" label="Comma-separated list of chromosomes to exclude" help="All alignments to the given list of chromosomes (reference sequences) are excluded from peak-calling. More details can be found in the tool description.">
165 <sanitizer> <!-- printable characters are allowed except the single quote, because the command wraps this value in single quotes -->
166 <valid initial="string.printable">
167 <remove value="&apos;"/>
168 </valid>
169 </sanitizer>
170 </param>
171 <param name="min_mapq" argument="-m" type="integer" min="0" value="0" label="Minimum MAPQ to keep an alignment." help="All alignments with MAPQ less than the given value are eliminated. This is equivalent to filtering with samtools view -q. This option should not be used if the SAM/BAM lists multiple alignments for some reads/fragments. Instead, filtering should be accomplished via -s. (def. 0)" />
172 <param name="alignment_score" argument="-s" type="float" min="0.0" value="0.0" label="Keep secondary alignments with AS >= bestAS - this value." help="Genrich considers all secondary alignments of multimapping reads, but, by default, it keeps only the alignments whose scores are equal to the best score for the read/fragment. Setting a value such as -s 20 causes Genrich also to keep secondary alignments whose scores are within 20 of the best. (def. 0)" />
173 <param name="unpaired" argument="-y" type="boolean" value="False" truevalue="-y" falsevalue="" label="Keep unpaired alignments." help="Unpaired alignments are kept, just as they appear in the SAM/BAM. (def. false)"/>
174 <param name="alignment_lengths" argument="-w" type="integer" min="1" optional="True" value="" label="Keep unpaired alignments with a certain length." help="Unpaired alignments are kept, with their lengths changed to the given value (from their 5' ends). (def. not defined)" />
175 <param name="alignment_lengths2" argument="-x" type="boolean" value="False" truevalue="-x" falsevalue="" label="Keep unpaired alns, lengths changed to paired average." help="Unpaired alignments are kept, with their lengths changed to the average length of fragments inferred from properly paired alignments (excluding those aligning to skipped chromosomes [-e]). (def. not defined)"/>
176 </section>
177
178 <!-- ATAC Options: ATAC-seq mode switch (-j) and cut-site expansion width (-d) -->
179 <section title="ATAC Options" name="atac_options">
180 <param type="boolean" name="atac" argument="-j" truevalue="-j" falsevalue="" value="False" label="Use ATAC-seq mode." help="Use ATAC-seq mode (def. false)"/>
181 <param type="integer" name="expand_sites" argument="-d" value="100" min="0" label="Expand cut sites." help="Expand cut sites to x bp (def. 100)"/>
182 </section>
183
184 <!-- Peakcalling Options: significance thresholds (-q, -p) and peak shape limits (-a, -l, -g) -->
185 <section title="Peakcalling Options" name="peakcalling_options">
186 <param type="float" name="max_q" argument="-q" value="0.05" min="0.0" max="1.0" label="Maximum q-value." help="Maximum q-value (FDR-adjusted p-value). These parameters establish the statistical threshold below which a base is considered significantly enriched in the experimental sample(s) vs. the control/background. The significance value is automatically converted to a -log10 scale by Genrich. (def. 0.05)"/>
187 <param type="float" name="max_p" argument="-p" optional="True" value="" min="0" max="1.0" label="Maximum p-value." help="When -p is selected, q-values are not calculated (reported as -1). (def. turned off)"/>
188 <param type="float" name="min_auc" argument="-a" value="20.0" min="0" label="Minimum AUC for a peak." help="Minimum AUC for a peak. (def. 20.0)"/>
189 <param type="integer" name="min_peak_length" argument="-l" value="0" min="0" label="Minimum length of a peak." help="With this option, any potential peak whose length is below the specified value is discarded, regardless of its significance. The default of 0 means that no peaks are eliminated on this basis. (def. 0)"/>
190 <param type="integer" name="max_dist" argument="-g" value="100" min="0" label="Maximum distance between signif. sites." help="This parameter sets the maximum distance between sites that achieve significance in order for them to be linked together into the same potential peak. (def. 100)"/>
191 </section>
192
193 <!-- Other Options: currently only the switch that skips peak-calling (-X) -->
194 <section title="Other Options" name="other_options">
195 <param type="boolean" name="skip_peak_calling" argument="-X" truevalue="-X" falsevalue="" value="False" label="Skip peak-calling." help="This is a convenience option for those who are unsure of the peak-calling parameters but do not want to run the full analysis multiple times. Genrich interprets the alignment files (including identifying PCR duplicates) and produces intermediate log files, but does not perform the peak-calling step."/>
196 </section>
197
198 <!-- Output Options: each switch requests one additional optional dataset -->
199 <section title="Output Options" name="output_options">
200 <param type="boolean" name="bedgraph1" argument="-f" value="False" label="Bedgraph-ish p/q Values" help="Output bedgraph-ish file for p/q values."/>
201 <param type="boolean" name="bedgraph2" argument="-k" value="False" label="Bedgraph-ish Pileups" help="Output bedgraph-ish file for pileups and p-values."/>
202 <param type="boolean" name="bed" argument="-b" value="False" label="Bed File" help="Output BED file for reads/fragments/intervals."/>
203 </section>
204 </inputs>
205
206
207 <outputs>
208 <data name="outfile" format="encodepeak" label="${tool.name} on ${on_string}"/>
209 <!-- Optional outputs: the command passes each dataset path straight to Genrich, so no from_work_dir lookup is declared. -->
210 <data name="out_bedgraph1" format="bedgraph" label="${tool.name} on ${on_string}: Bedgraph p/q">
211 <filter>(output_options['bedgraph1'] is True)</filter>
212 </data>
213 <data name="out_bedgraph2" format="bedgraph" label="${tool.name} on ${on_string}: Bedgraph Pileups">
214 <filter>(output_options['bedgraph2'] is True)</filter>
215 </data>
216 <data name="out_bed" format="bed" label="${tool.name} on ${on_string}: Bed reads/fragments/intervals">
217 <filter>(output_options['bed'] is True)</filter>
218 </data>
219 <data name="out_dups" format="txt" label="${tool.name} on ${on_string}: PCR duplicates">
220 <filter>(filter_options['duplicates'] is True)</filter>
221 </data>
222 </outputs>
223 <tests>
224 <!-- ATAC-seq mode on a single treatment BAM, with the bedgraph and BED outputs switched on -->
225 <test expect_num_outputs="4">
226 <param name="input_treatment_file" value="atac_test.bam" ftype="bam"/>
227 <param name="atac" value="True"/>
228 <param name="bedgraph1" value="True"/>
229 <param name="bedgraph2" value="True"/>
230 <param name="bed" value="True"/>
231 <output name="outfile" file="atac_out.encodepeak" ftype="encodepeak"/>
232 <output name="out_bedgraph1" file="atac_out2.bedgraph" ftype="bedgraph"/>
233 <output name="out_bedgraph2" file="atac_out3.bedgraph" ftype="bedgraph" compare="contains" lines_diff="1"/>
234 <output name="out_bed" file="atac_out4.bed" ftype="bed"/>
235 </test>
236 <!-- ChIP-seq treatment BAM together with a control BAM, same optional outputs -->
237 <test expect_num_outputs="4">
238 <param name="c_select" value="Yes"/>
239 <param name="input_treatment_file" value="CTCF_PE_ChIP_chr22.bam" ftype="bam"/>
240 <param name="input_control_file" value="CTCF_PE_CTRL_chr22.bam" ftype="bam"/>
241 <param name="bedgraph1" value="True"/>
242 <param name="bedgraph2" value="True"/>
243 <param name="bed" value="True"/>
244 <output name="outfile" file="CTCF.encodepeak" ftype="encodepeak"/>
245 <output name="out_bedgraph1" file="CTCF1.bedgraph" ftype="bedgraph"/>
246 <output name="out_bedgraph2" file="CTCF2.bedgraph" ftype="bedgraph" compare="contains" lines_diff="1"/>
247 <output name="out_bed" file="CTCF.bed" ftype="bed"/>
248 </test>
249 </tests>
250 <help><![CDATA[
251
252 .. class:: infomark
253
254 **What it does**
255
256 -------------------
257
258 **Genrich** is a peak-caller for genomic enrichment assays (e.g. ChIP-seq, ATAC-seq). It analyzes alignment files generated following the assay and produces a file detailing peaks of significant enrichment.
259
260 ATAC-seq is a method for assessing genomic regions of open chromatin. Since only the ends of the DNA fragments indicate where the transposase enzyme was able to insert into the chromatin, it may not be optimal to interpret alignments as full fragments, as the default analysis mode does. Genrich has an alternative analysis mode for ATAC-seq in which it creates intervals centered on transposase cut sites. The remainder of the peak-calling process (calculating pileups and significance values) is identical to the default analysis mode. Note that the interval lengths (not the fragment lengths) are used to sum the total sequence information for the calculation of control/background pileup values.
261
262 -------------------
263
264 **Inputs**
265
266 -------------------
267
268 Genrich analyzes alignment files in SAM/BAM format. SAM files must have a header.
269 SAM/BAM files for multiple replicates can be specified, comma-separated (or space-separated, in quotes).
270 Multiple SAM/BAM files for a single replicate should be combined in advance via samtools merge.
271 The SAM/BAM files must be sorted by queryname (via samtools sort -n).
272
273
274 -----------
275
276 **Outputs**
277
278 -----------
279
280 As indicated, the output file is in ENCODE narrowPeak format. Here are details of the fields:
281 * 1. chrom Name of the chromosome
282 * 2. chromStart Starting position of the peak (0-based)
283 * 3. chromEnd Ending position of the peak (not inclusive)
284 * 4. name peak_N, where N is the 0-based count
285 * 5. score Average AUC (total AUC / bp) × 1000, rounded to the nearest int (max. 1000)
286 * 6. strand . (no orientation)
287 * 7. signalValue Total area under the curve (AUC)
288 * 8. pValue Summit -log10(p-value)
289 * 9. qValue Summit -log10(q-value), or -1 if not available (e.g. with -p)
290 * 10. peak Summit position (0-based offset from chromStart): the midpoint of the peak interval with the highest significance (the longest interval in case of ties)
291
292 Example:
293 chr1 894446 894988 peak_10 402 . 217.824936 4.344683 1.946031 317
294 chr1 895834 896167 peak_11 343 . 114.331093 4.344683 1.946031 90
295
296 Optional files
297
298 -c Input SAM/BAM file(s) for control sample(s)
299
300 Alignment files for control samples (e.g. input DNA) can be specified, although this is not strictly required.
301 SAM/BAM files for multiple replicates can be listed, comma-separated (or space-separated, in quotes) and in the same order as the experimental files. Missing control files should be indicated with null.
302
303 -f Output bedgraph-ish file for p/q values
304
305 With a single replicate, this log file lists experimental/control pileup values, p- and q-values, and significance (*) for each interval.
306
307 Example:
308 chr1 894435 894436 33.000000 2.477916 3.183460 1.208321
309 chr1 894436 894442 34.000000 2.477916 3.231466 1.241843
310 chr1 894442 894446 35.000000 2.477916 3.278469 1.274561
311 chr1 894446 894447 36.000000 2.477916 3.324516 1.306471 *
312 chr1 894447 894450 39.000000 2.477916 3.457329 1.398035 *
313 chr1 894450 894451 40.000000 2.477916 3.499948 1.427253 *
314 chr1 894451 894460 41.000000 2.477916 3.541798 1.455938 *
315
316 With multiple replicates, this log file lists p-values of each replicate, combined p-value, q-value, and significance for each interval.
317 Note that this file (as well as the -k file, below) is called "bedgraph-ish" because it contains multiple dataValue fields, which isn't strictly allowed in the bedGraph format. However, a simple application of awk can produce the desired bedgraph files for visualization purposes (see an awk reference for a guide to printing specific fields of input records).
318 When peak-calling is skipped (-X), the significance column is not produced.
319
320 -k Output bedgraph-ish file for pileups and p-values
321
322 For each replicate, sequentially, this file lists a header line (# experimental file: <name>; control file: <name>), followed by experimental/control pileups and a p-value for each interval. This is the way to examine pileup values with multiple replicates, since the -f log file does not supply them in that case.
323
324 -b Output BED file for reads/fragments/intervals
325
326 This is an unsorted BED file of the reads/fragments/intervals analyzed. The 4th column gives the read name, number of valid alignments, 'E'xperimental or 'C'ontrol, and sample number (0-based), e.g. SRR5427886.59_2_E_0.
327
328 -R Output file for PCR duplicates (only with -r)
329
330 This log file lists the header of each read/fragment classified as a PCR duplicate, followed by the alignment, the header of the read/fragment it matched, and the alignment type.
331
332 Example:
333 SRR5427886.5958 chr4:185201876-185201975 SRR5427886.4688 paired
334 SRR5427886.1826 chr12:34372610,+;chr1:91852878,- SRR5427886.2040 discordant
335 SRR5427886.10866 chr14:53438632,+ SRR5427886.4746 single
336
337 The duplicates from multiple input files are separated by a comment line listing the next filename, such as # experimental file #0: SRR5427886.bam.
338 This file can be used to filter the original SAM/BAM file, using a simple script such as getReads.py, for example.
339
340
341 --------------------
342
343 **More Information**
344
345 --------------------
346
347 See the excellent `Genrich documentation`_
348
349 .. _`Genrich documentation`: https://github.com/jsh58/Genrich
350
351
352 --------------------
353
354 **Galaxy Wrapper Development**
355
356 --------------------
357
358 Author: Florian Heyl <heylf@informatik.uni-freiburg.de>
359
360
361 ]]></help>
362 <citations> <!-- Genrich is cited through its GitHub repository, as a BibTeX @misc entry -->
363 <citation type="bibtex">
364 @misc{genrich,
365 title = {Genrich},
366 url = {https://github.com/jsh58/Genrich},
367 urldate = {2019-07-15},
368 author = {John M. Gaspar},
369 year = {2018},
370 }
371 </citation>
372 </citations>
373 </tool>