comparison homer_findMotifsGenome.xml @ 0:ec974e69e0b5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/homer commit e49d856e0181edc6120220a1b819cba2466a4289"
author iuc
date Sun, 08 Aug 2021 11:02:42 +0000
parents
children 3126da33847c
comparison
equal deleted inserted replaced
-1:000000000000 0:ec974e69e0b5
1 <tool id="homer_findMotifsGenome" name="findMotifsGenome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05" license="MIT">
2 <description/>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="xrefs"/>
7 <expand macro="requirements"/>
8 <command detect_errors="exit_code"><![CDATA[
9 ## Local name for the peak file: every character other than word characters, '-' and whitespace becomes '_' (taken from fastqc):
10 #import re
11 #import os
12 #set input_name = re.sub('[^\w\-\s]', '_', str($input.element_identifier))
13 ln -s '${input}' '${input_name}' &&
14 #set output = $input_name + '_motif'
15 ## Link the genome FASTA into the working directory; its local name depends on the chosen source:
16 #if str( $genome.source ) == "installed":
17 #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.all_fasta_source.fields.value)) + '.fa'
18 ln -s '$genome.all_fasta_source.fields.path' '$genome_file' &&
19 #elif str( $genome.source ) == "preparsed":
20 #set genome_file = os.path.split(str($genome.homer_preparse_source.fields.path_fasta))[-1]
21 ln -s '$genome.homer_preparse_source.fields.path_fasta' '$genome_file' &&
22 #elif str( $genome.source ) == "history":
23 #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.fasta.name)) + '.fa'
24 ln -s '$genome.fasta' '$genome_file' &&
25 #end if
26 ## Command; the three positional arguments are peak file, genome FASTA and output folder:
27 findMotifsGenome.pl
28 '${input_name}' '$genome_file' '${output}'
29 ## Masking and region size options:
30 #if str( $genome.source ) == "preparsed":
31 -preparsedDir '$genome.homer_preparse_source.fields.path'
32 #if str( $genome.homer_preparse_source.fields.mask ) == 'True':
33 -mask
34 #end if
35 #if str( $genome.choose_center.center ) == "centered":
36 -size '$genome.homer_preparse_source.fields.size'
37 #else
38 #set sizee = int($genome.choose_center.sizes) + int($genome.homer_preparse_source.fields.size)
39 -size '$genome.choose_center.sizes','${sizee}'
40 #end if
41 #else:
42 #if $genome.mask
43 -mask
44 #end if
45 #if str( $genome.fixed_size.size_fixed ) == "given":
46 -size given
47 #else:
48 #if str( $genome.fixed_size.choose_center.center ) == "centered":
49 -size '$genome.fixed_size.size'
50 #else
51 #set sizee = int($genome.fixed_size.choose_center.sizes) + int($genome.fixed_size.size)
52 -size '$genome.fixed_size.choose_center.sizes','${sizee}'
53 #end if
54 #end if
55 #end if
56 ## Basic options (the boolean parameters expand to their flag or to nothing):
57 -len '$len' -S $S -mis $mis
58 $norevopp $nomotif $rna
59 ## Known motif options / visualization:
60 -mset $motif_options.mset
61 $motif_options.basic $motif_options.bits $motif_options.nocheck
62 #if $motif_options.mcheck:
63 -mcheck '$motif_options.mcheck'
64 #end if
65 $motif_options.noknown
66 #if $motif_options.mknown:
67 -mknown '$motif_options.mknown'
68 #end if
69 $motif_options.nofacts $motif_options.seqlogo
70 ## Advanced options:
71 $advanced.norm $advanced.h
72 #if str($advanced.N):
73 -N $advanced.N
74 #end if
75 -local $advanced.local -redundant $advanced.redundant -maxN $advanced.maxN
76 ## -maskMotif and -opt are multiple="true" inputs: emit every selected file as its own quoted argument
77 #if $advanced.maskMotif:
78 -maskMotif
79 #for $motif_file in $advanced.maskMotif:
80 '$motif_file'
81 #end for
82 #end if
83 #if $advanced.opt:
84 -opt
85 #for $motif_file in $advanced.opt:
86 '$motif_file'
87 #end for
88 #end if
89 $advanced.rand
90 #if $advanced.ref:
91 -ref '$advanced.ref'
92 #end if
93 $advanced.oligo
94 #if $advanced.fdr:
95 -fdr $advanced.fdr
96 #end if
97 #if str( $advanced.homer12.version ) == "homer2":
98 -nlen '$advanced.homer12.nlen'
99 -nmax '$advanced.homer12.nmax'
100 $advanced.homer12.neutral
101 ## -olen is an optional integer input; it is only passed when a value was entered
102 #if str( $advanced.homer12.olen ):
103 -olen '$advanced.homer12.olen'
104 #end if
105 -e '$advanced.homer12.e'
106 $advanced.homer12.quickMask
107 -minlp '$advanced.homer12.minlp'
108 #elif str( $advanced.homer12.version ) == "homer1":
109 ## -homer1 is the flag the help text documents for forcing the original homer; -depth alone does not select it
110 -homer1 -depth '$advanced.homer12.depth'
111 #end if
112 #if not $nomotif:
113 && cp '${output}'/homerResults.html outputHomer.html
114 && cp -r '${output}' '${html_homer_file.files_path}'
115 #end if
116 #if not $motif_options.noknown:
117 && cp '${output}'/knownResults.html outputKnown.html
118 && cp -r '${output}' '${html_file.files_path}'
119 #end if
120 ]]></command>
121 <inputs>
122 <param name="input" type="data" format="bed,encodepeak,tabular" label="Peak file"/>
123 <conditional name="genome"> <!-- selects where the reference FASTA comes from; the command template reads the chosen branch through $genome.* -->
124 <param name="source" type="select" label="Will you select a reference genome from your history or use an installed genome?">
125 <option value="preparsed">Preparsed (fasta is available and has been preparsed to specific size)</option>
126 <option value="installed">Installed (fasta is available but will be preparsed at run time)</option>
127 <option value="history">From History (fasta will be preparsed at run time)</option>
128 </param>
129 <when value="preparsed"> <!-- entries come from the homer_preparse data table, limited to rows whose version column equals @IDX_VERSION@ -->
130 <param name="homer_preparse_source" type="select" label="Preparsed FASTA">
131 <options from_data_table="homer_preparse">
132 <filter type="sort_by" column="2"/>
133 <filter type="static_value" column="version" value="@IDX_VERSION@"/>
134 <validator type="no_options" message="No preparsed genomes are available"/>
135 </options>
136 </param>
137 <expand macro="choose_center"/>
138 </when>
139 <when value="installed"> <!-- entries come from the all_fasta data table -->
140 <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
141 <options from_data_table="all_fasta">
142 <filter type="sort_by" column="2"/>
143 <validator type="no_options" message="No references are available"/>
144 </options>
145 </param>
146 <expand macro="mask_size"/>
147 </when>
148 <when value="history"> <!-- FASTA dataset taken from the user's history -->
149 <param name="fasta" type="data" format="fasta" label="Select reference genome"/>
150 <expand macro="mask_size"/>
151 </when>
152 </conditional>
153 <param argument="-len" type="text" value="8,10,12" label="comma-separated motif lengths" help="values greater 12 may cause the program to run out of memory - in these cases decrease the number of sequences analyzed (-N), or try analyzing shorter sequence regions (i.e. -size 100)">
154 <validator type="regex" message="motif lengths must be comma-separated integers without space">^(\d+,)*(\d+)$</validator> <!-- accepts only digit groups joined by single commas, e.g. 8,10,12 -->
155 </param>
156 <param argument="-S" type="integer" min="1" value="25" label="Number of motifs to find"/>
157 <param argument="-mis" type="integer" min="0" value="2" label="Number of mismatches during global optimisation"/>
158 <param argument="-norevopp" type="boolean" truevalue="-norevopp" falsevalue="" checked="false" label="Don't search reverse strand for motifs"/> <!-- boolean flags: truevalue is the literal flag text and falsevalue is empty, so the command template inserts them as-is -->
159 <param argument="-nomotif" type="boolean" truevalue="-nomotif" falsevalue="" checked="false" label="Don't search for de novo motif enrichment"/> <!-- also read by the de novo output's filter and by the copy step at the end of the command -->
160 <param argument="-rna" type="boolean" truevalue="-rna" falsevalue="" checked="false" label="output RNA motif logos and compare to RNA motif database" help="automatically sets -norevopp"/>
161 <section name="motif_options" title="Known Motif Options/Visualization" expanded="False"> <!-- options controlling comparison against known motifs and logo rendering; read in the command template as $motif_options.* -->
162 <param argument="-mset" type="select" label="Check against motif collects">
163 <option value="auto" selected="True">automatic</option>
164 <option value="vertebrates">vertebrates</option>
165 <option value="insects">insects</option>
166 <option value="worms">worms</option>
167 <option value="plants">plants</option>
168 <option value="yeast">yeast</option>
169 <option value="all">all</option>
170 </param>
171 <param argument="-basic" type="boolean" truevalue="-basic" falsevalue="" checked="false" label="Just visualize de novo motifs, don't check similarity with known motifs"/>
172 <param argument="-bits" type="boolean" truevalue="-bits" falsevalue="" checked="false" label="Scale sequence logos by information content" help="default: doesn't scale"/>
173 <param argument="-nocheck" type="boolean" truevalue="-nocheck" falsevalue="" checked="false" label="Don't search for de novo vs. known motif similarity"/>
174 <param argument="-mcheck" type="data" optional="true" format="txt" label="known motifs to check against de novo motifs"/>
175 <param argument="-noknown" type="boolean" truevalue="-noknown" falsevalue="" checked="false" label="Don't search for known motif enrichment"/> <!-- also read by the known-motif output's filter and by the copy step at the end of the command -->
176 <param argument="-mknown" type="data" optional="true" format="txt" label="Known motifs to check for enrichment"/>
177 <param argument="-nofacts" type="boolean" truevalue="-nofacts" falsevalue="" checked="false" label="Omit humor"/>
178 <param argument="-seqlogo" type="boolean" truevalue="-seqlogo" falsevalue="" checked="false" label="Use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now"/>
179 </section>
180 <section name="advanced" title="Advanced options" expanded="false"> <!-- normalization, background and engine-specific options; read in the command template as $advanced.* -->
181 <param name="norm" type="select" label="Sequence normalization options:"> <!-- the option values are the literal command-line flags -->
182 <option value="-gc" selected="true">use GC% for sequence content normalization</option>
183 <option value="-cpg">use CpG% instead of GC% for sequence content normalization</option>
184 <option value="-noweight">no CG correction</option>
185 </param>
186 <param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Use hypergeometric for p-values, binomial is default"/>
187 <param argument="-N" type="integer" min="0" value="" optional="true" label="Number of sequences to use for motif finding, default=max(50k, 2x input)"/>
188 <param argument="-local" type="integer" min="0" value="0" label="local background size in bp for each side of regions" help="0 means no local background."/>
189 <param argument="-redundant" type="float" min="0" max="2" value="2" label="Remove redundant sequences matching greater than # fraction, i.e. -redundant 0.5"/>
190 <param argument="-maxN" type="float" min="0" max="1" value="0.7" label="maximum percentage of N's in sequence to consider for motif finding"/>
191 <param argument="-maskMotif" type="data" format="txt" multiple="true" optional="true" label="motifs to mask before motif finding"/>
192 <param argument="-opt" type="data" format="txt" multiple="true" optional="true" label="motifs to optimize or change length of"/>
193 <param argument="-rand" type="boolean" truevalue="-rand" falsevalue="" checked="false" label="randomize target and background sequences labels"/>
194 <param argument="-ref" optional="true" type="data" format="tabular,bed,encodepeak" label="use file for target and background - first argument is list of peak ids for targets"/>
195 <param argument="-oligo" type="boolean" truevalue="-oligo" falsevalue="" checked="false" label="Perform analysis of individual oligo enrichment"/>
196 <param argument="-fdr" type="integer" min="0" value="" label="Number of randomizations to calculate empirical FDR for de novo discovery" optional="true"/>
197 <conditional name="homer12"> <!-- engine choice; each branch exposes only the options of that engine -->
198 <param name="version" type="select" label="Which homer version do you want to use">
199 <option value="homer2" selected="true">homer2 (default)</option>
200 <option value="homer1">homer1 (to force the use of the original homer)</option>
201 </param>
202 <when value="homer2">
203 <param argument="-nlen" type="integer" min="0" value="3" label="length of lower-order oligos to normalize in background"/>
204 <param argument="-nmax" type="integer" min="0" value="160" label="Max normalization iterations"/>
205 <param argument="-neutral" type="boolean" truevalue="-neutral" falsevalue="" checked="false" label="weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc."/>
206 <param argument="-olen" type="integer" min="0" value="" optional="true" label="lower-order oligo normalization for oligo table, use if -nlen isn't working well"/>
207 <param argument="-e" type="float" min="0" max="1" value="0" label="Maximum expected motif instance per bp in random sequence" help="HOMER's documented default is 0.01; the value entered here is always passed to the program"/> <!-- NOTE(review): the preset value 0 differs from the 0.01 default quoted in the help section; confirm it is intended -->
208 <param argument="-quickMask" type="boolean" truevalue="-quickMask" falsevalue="" checked="false" label="skip full masking after finding motifs, similar to original homer"/>
209 <param argument="-minlp" type="float" value="-10" label="stop looking for motifs when seed logp score gets above this number"/>
210 </when>
211 <when value="homer1">
212 <param argument="-depth" type="select" label="time spent on local optimization default">
213 <option value="low">low</option>
214 <option value="med" selected="true">med</option>
215 <option value="high">high</option>
216 <option value="allnight">allnight</option>
217 </param>
218 </when>
219 </conditional>
220 </section>
221 </inputs>
222 <outputs> <!-- both HTML reports are collected from files the command copies into the working directory -->
223 <data format="html" name="html_file" from_work_dir="outputKnown.html" label="${tool.name} on ${on_string}: Known motifs">
224 <filter>motif_options['noknown'] is False</filter> <!-- only produced when -noknown is off, matching the condition under which the command creates outputKnown.html -->
225 </data>
226 <data format="html" name="html_homer_file" from_work_dir="outputHomer.html" label="${tool.name} on ${on_string}: De novo motifs">
227 <filter>nomotif is False</filter> <!-- only produced when -nomotif is off, matching the condition under which the command creates outputHomer.html -->
228 </data>
229 </outputs>
230 <tests>
231 <test expect_num_outputs="2"> <!-- installed-genome branch with default options: both reports are expected -->
232 <param name="input" value="fake_phix_peaks.bed"/>
233 <conditional name="genome">
234 <param name="source" value="installed"/>
235 <param name="all_fasta_source" value="phiX174"/>
236 </conditional>
237 <output name="html_file" file="motif_test1/knownResults.html" ftype="html" lines_diff="2"/> <!-- compared against a stored report, allowing two differing lines -->
238 <output name="html_homer_file">
239 <assert_contents>
240 <has_text text="fake_phix_peaks_bed_motif/ - Homer de novo Motif Results"/>
241 <has_text text="Total target sequences = 1"/>
242 <has_text text="Jaspar"/>
243 </assert_contents>
244 </output>
245 </test>
246 <test expect_num_outputs="2"> <!-- history-genome branch with default options: both reports are expected -->
247 <param name="input" value="CTCF_peaks_shifted.bed"/>
248 <conditional name="genome">
249 <param name="source" value="history"/>
250 <param name="fasta" value="chr2_subset.fa"/>
251 </conditional>
252 <output name="html_file">
253 <assert_contents>
254 <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
255 <has_text text="Total Target Sequences = 40"/>
256 <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
257 </assert_contents>
258 </output>
259 <output name="html_homer_file">
260 <assert_contents>
261 <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
262 <has_text text="Total target sequences = 40"/>
263 <has_text_matching expression="CTCF\(Zf\)|CTCF/MA|BORIS|CTCFL"/> <!-- parentheses escaped so the first alternative matches the literal text CTCF(Zf) instead of acting as a regex group -->
264 </assert_contents>
265 </output>
266 </test>
267 <test expect_num_outputs="2"> <!-- history-genome branch with repeat masking switched on: both reports are expected -->
268 <param name="input" value="CTCF_peaks_shifted.bed"/>
269 <conditional name="genome">
270 <param name="source" value="history"/>
271 <param name="fasta" value="chr2_subset.fa"/>
272 <param name="mask" value="true"/> <!-- mask belongs to the genome conditional (the command reads it as $genome.mask), so it is set inside it -->
273 </conditional>
274 <output name="html_file">
275 <assert_contents>
276 <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
277 <has_text text="Total Target Sequences = 34"/>
278 <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
279 </assert_contents>
280 </output>
281 <output name="html_homer_file">
282 <assert_contents>
283 <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
284 <has_text text="Total target sequences = 34"/>
285 <has_text_matching expression="CTCF\(Zf\)|CTCF/MA|BORIS|CTCFL"/> <!-- parentheses escaped so the first alternative matches the literal text CTCF(Zf) instead of acting as a regex group -->
286 </assert_contents>
287 </output>
288 </test>
289 <test expect_num_outputs="1"> <!-- -nomotif is set, so the de novo report is filtered out and only the known-motif report remains -->
290 <param name="input" value="CTCF_peaks_shifted.bed"/>
291 <conditional name="genome">
292 <param name="source" value="history"/>
293 <param name="fasta" value="chr2_subset.fa"/>
294 </conditional>
295 <section name="motif_options">
296 <param name="mset" value="plants"/> <!-- restricts the known-motif check to the plants collection -->
297 </section>
298 <param name="nomotif" value="true"/>
299 <output name="html_file">
300 <assert_contents>
301 <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
302 <has_text text="Total Target Sequences = 40"/>
303 <has_text text="RAP26"/>
304 </assert_contents>
305 </output>
306 </test>
307 </tests>
308 <help><![CDATA[
309
310 .. class:: infomark
311
312 This is a wrapper for findMotifsGenome.pl from HOMER but not all options are included.
313
314 Program will find de novo and known motifs in regions in the genome.
315
316 Usage::
317
318 findMotifsGenome.pl <pos file> <genome> <output directory> [additional options]
319
320 Example::
321
322 findMotifsGenome.pl peaks.txt mm8r peakAnalysis -size 200 -len 8
323
324 Possible Genomes::
325
326 -- or --
327 Custom: provide the path to genome FASTA files (directory or single file)
328 Heads up: will create the directory "preparsed/" in same location.
329
330 Basic options::
331
332 -mask (mask repeats/lower case sequence, can also add 'r' to genome, i.e. mm9r)
333 -bg <background position file> (genomic positions to be used as background, default=automatic)
334 removes background positions overlapping with target positions unless -keepOverlappingBg is used
335 -chopify (chop up large background regions to the avg size of target regions)
336 -len <#>[,<#>,<#>...] (motif length, default=8,10,12) [NOTE: values greater 12 may cause the program
337 to run out of memory - in these cases decrease the number of sequences analyzed (-N),
338 or try analyzing shorter sequence regions (i.e. -size 100)]
339 -size <#> (fragment size to use for motif finding, default=200)
340 -size <#,#> (i.e. -size -100,50 will get sequences from -100 to +50 relative from center)
341 -size given (uses the exact regions you give it)
342 -S <#> (Number of motifs to optimize, default: 25)
343 -mis <#> (global optimization: searches for strings with # mismatches, default: 2)
344 -norevopp (don't search reverse strand for motifs)
345 -nomotif (don't search for de novo motif enrichment)
346 -rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)
347
348 Scanning sequence for motifs::
349
350 -find <motif file> (This will cause the program to only scan for motifs)
351
352 Known Motif Options/Visualization::
353
354 -mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collects, default: auto)
355 -basic (just visualize de novo motifs, don't check similarity with known motifs)
356 -bits (scale sequence logos by information content, default: doesn't scale)
357 -nocheck (don't search for de novo vs. known motif similarity)
358 -mcheck <motif file> (known motifs to check against de novo motifs,
359 -float (allow adjustment of the degeneracy threshold for known motifs to improve p-value[dangerous])
360 -noknown (don't search for known motif enrichment, default: -known)
361 -mknown <motif file> (known motifs to check for enrichment,
362 -nofacts (omit humor)
363 -seqlogo (use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now)
364
365 Sequence normalization options::
366
367 -gc (use GC% for sequence content normalization, now the default)
368 -cpg (use CpG% instead of GC% for sequence content normalization)
369 -noweight (no CG correction)
370 Also -nlen <#>, -olen <#>, see homer2 section below.
371
372 Advanced options::
373
374 -h (use hypergeometric for p-values, binomial is default)
375 -N <#> (Number of sequences to use for motif finding, default=max(50k, 2x input)
376 -local <#> (use local background, # of equal size regions around peaks to use i.e. 2)
377 -redundant <#> (Remove redundant sequences matching greater than # percent, i.e. -redundant 0.5)
378 -maxN <#> (maximum percentage of N's in sequence to consider for motif finding, default: 0.7)
379 -maskMotif <motif file1> [motif file 2]... (motifs to mask before motif finding)
380 -opt <motif file1> [motif file 2]... (motifs to optimize or change length of)
381 -rand (randomize target and background sequences labels)
382 -ref <peak file> (use file for target and background - first argument is list of peak ids for targets)
383 -oligo (perform analysis of individual oligo enrichment)
384 -dumpFasta (Dump fasta files for target and background sequences for use with other programs)
385 -preparse (force new background files to be created)
386 -preparsedDir <directory> (location to search for preparsed file and/or place new files)
387 -keepFiles (keep temporary files)
388 -fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)
389
390 homer2 specific options::
391
392 -homer2 (use homer2 instead of original homer, default)
393 -nlen <#> (length of lower-order oligos to normalize in background, default: -nlen 3)
394 -nmax <#> (Max normalization iterations, default: 160)
395 -neutral (weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc.)
396 -olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)
397 -p <#> (Number of processors to use, default: 1)
398 -e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)
399 -cache <#> (size in MB for statistics cache, default: 500)
400 -quickMask (skip full masking after finding motifs, similar to original homer)
401 -minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)
402
403 Original homer specific options::
404
405 -homer1 (to force the use of the original homer)
406 -depth [low|med|high|allnight] (time spent on local optimization default: med)
407
408
409 ]]></help>
410 <expand macro="citation"/>
411 </tool>