diff homer_findMotifsGenome.xml @ 0:ec974e69e0b5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/homer commit e49d856e0181edc6120220a1b819cba2466a4289"
author iuc
date Sun, 08 Aug 2021 11:02:42 +0000
parents
children 3126da33847c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/homer_findMotifsGenome.xml	Sun Aug 08 11:02:42 2021 +0000
@@ -0,0 +1,411 @@
+<tool id="homer_findMotifsGenome" name="findMotifsGenome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05" license="MIT">
+    <description/>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+## Taken from fastqc:
+#import re
+#import os
+#set input_name = re.sub('[^\w\-\s]', '_', str($input.element_identifier))
+ln -s '${input}' '${input_name}' &&
+#set output = $input_name + '_motif'
+## Process the genome:
+#if str( $genome.source ) == "installed":
+    #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.all_fasta_source.fields.value)) + '.fa'
+    ln -s '$genome.all_fasta_source.fields.path' '$genome_file' &&
+#elif str( $genome.source ) == "preparsed":
+    #set genome_file = os.path.split(str($genome.homer_preparse_source.fields.path_fasta))[-1]
+    ln -s '$genome.homer_preparse_source.fields.path_fasta' '$genome_file' &&
+#elif str( $genome.source ) == "history":
+    #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.fasta.name)) + '.fa'
+    ln -s '$genome.fasta' '$genome_file' &&
+#end if
+## Command:
+findMotifsGenome.pl
+## Peak:
+'${input_name}'
+## Genome:
+'$genome_file'
+## Ouptut folder:
+'${output}'
+## Options
+#if str( $genome.source ) == "preparsed":
+    -preparsedDir '$genome.homer_preparse_source.fields.path'
+    #if str( $genome.homer_preparse_source.fields.mask ) == 'True':
+        -mask
+    #end if
+    #if str( $genome.choose_center.center ) == "centered":
+        -size '$genome.homer_preparse_source.fields.size'
+    #else
+        #set sizee = int($genome.choose_center.sizes) + int($genome.homer_preparse_source.fields.size)
+        -size '$genome.choose_center.sizes','${sizee}'
+    #end if
+#else:
+    #if $genome.mask
+        -mask
+    #end if
+    #if $genome.fixed_size.size_fixed == "given":
+        -size given
+    #else:
+        #if str( $genome.fixed_size.choose_center.center ) == "centered":
+            -size '$genome.fixed_size.size'
+        #else
+            #set sizee = int($genome.fixed_size.choose_center.sizes) + int($genome.fixed_size.size)
+            -size '$genome.fixed_size.choose_center.sizes','${sizee}'
+        #end if
+    #end if
+#end if
+-len '$len'
+-S $S
+-mis $mis
+$norevopp
+$nomotif
+$rna
+-mset $motif_options.mset
+$motif_options.basic
+$motif_options.bits
+$motif_options.nocheck
+#if $motif_options.mcheck:
+    -mcheck '$motif_options.mcheck'
+#end if
+$motif_options.noknown
+#if $motif_options.mknown:
+    -mknown '$motif_options.mknown'
+#end if
+$motif_options.nofacts
+$motif_options.seqlogo
+$advanced.norm
+$advanced.h
+#if str($advanced.N):
+    -N $advanced.N
+#end if
+-local $advanced.local
+-redundant $advanced.redundant
+-maxN $advanced.maxN
+#if $advanced.maskMotif:
+    -maskMotif '$advanced.maskMotif'
+#end if
+#if $advanced.opt:
+    -opt '$advanced.opt'
+#end if
+$advanced.rand
+#if $advanced.ref:
+    -ref '$advanced.ref'
+#end if
+$advanced.oligo
+#if $advanced.fdr:
+    -fdr $advanced.fdr
+#end if
+#if str( $advanced.homer12.version ) == "homer2":
+    -nlen '$advanced.homer12.nlen'
+    -nmax '$advanced.homer12.nmax'
+    $advanced.homer12.neutral
+    -e '$advanced.homer12.e'
+    $advanced.homer12.quickMask
+    -minlp '$advanced.homer12.minlp'
+#elif str( $advanced.homer12.version ) == "homer1":
+    -depth '$advanced.homer12.depth'
+#end if
+#if not $nomotif:
+    && cp '${output}'/homerResults.html outputHomer.html
+    && cp -r '${output}' '${html_homer_file.files_path}'
+#end if
+#if not $motif_options.noknown:
+    && cp '${output}'/knownResults.html outputKnown.html
+    && cp -r '${output}' '${html_file.files_path}'
+#end if
+        ]]></command>
+    <inputs>
+        <param name="input" type="data" format="bed,encodepeak,tabular" label="Peak file"/>
+        <conditional name="genome">
+            <param name="source" type="select" label="Will you select a reference genome from your history or use a installed genome?">
+                <option value="preparsed">Preparsed (fasta is available and has been preparsed to specific size)</option>
+                <option value="installed">Installed (fasta is available but will be preparsed as run time)</option>
+                <option value="history">From History (fasta will be preparsed at run time)</option>
+            </param>
+            <when value="preparsed">
+                <param name="homer_preparse_source" type="select" label="Preparsed FASTA">
+                    <options from_data_table="homer_preparse">
+                        <filter type="sort_by" column="2"/>
+                        <filter type="static_value" column="version" value="@IDX_VERSION@"/>
+                        <validator type="no_options" message="No preparsed genomes are available"/>
+                    </options>
+                </param>
+                <expand macro="choose_center"/>
+            </when>
+            <when value="installed">
+                <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No references are available"/>
+                    </options>
+                </param>
+                <expand macro="mask_size"/>
+            </when>
+            <when value="history">
+                <param name="fasta" type="data" format="fasta" label="Select reference genome"/>
+                <expand macro="mask_size"/>
+            </when>
+        </conditional>
+        <param argument="-len" type="text" value="8,10,12" label="comma-separated motif lengths" help="values greater 12 may cause the program to run out of memory - in these cases decrease the number of sequences analyzed (-N), or try analyzing shorter sequence regions (i.e. -size 100)">
+            <validator type="regex" message="motif lengths must be comma-separated integers without space">^(\d+,)*(\d+)$</validator>
+        </param>
+        <param argument="-S" type="integer" min="1" value="25" label="Number of motifs to find"/>
+        <param argument="-mis" type="integer" min="0" value="2" label="Number of mismatches during global optimisation"/>
+        <param argument="-norevopp" type="boolean" truevalue="-norevopp" falsevalue="" checked="false" label="Don't search reverse strand for motifs"/>
+        <param argument="-nomotif" type="boolean" truevalue="-nomotif" falsevalue="" checked="false" label="Don't search for de novo motif enrichment"/>
+        <param argument="-rna" type="boolean" truevalue="-rna" falsevalue="" checked="false" label="output RNA motif logos and compare to RNA motif database" help="automatically sets -norevopp"/>
+        <section name="motif_options" title="Known Motif Options/Visualization" expanded="False">
+            <param argument="-mset" type="select" label="Check against motif collects">
+                <option value="auto" selected="True">automatic</option>
+                <option value="vertebrates">vertebrates</option>
+                <option value="insects">insects</option>
+                <option value="worms">worms</option>
+                <option value="plants">plants</option>
+                <option value="yeast">yeast</option>
+                <option value="all">all</option>
+            </param>
+            <param argument="-basic" type="boolean" truevalue="-basic" falsevalue="" checked="false" label="Just visualize de novo motifs, don't check similarity with known motifs"/>
+            <param argument="-bits" type="boolean" truevalue="-bits" falsevalue="" checked="false" label="Scale sequence logos by information content" help="TODO"/>
+            <param argument="-nocheck" type="boolean" truevalue="-nocheck" falsevalue="" checked="false" label="Don't search for de novo vs. known motif similarity"/>
+            <param argument="-mcheck" type="data" optional="true" format="txt" label="known motifs to check against de novo motifs"/>
+            <param argument="-noknown" type="boolean" truevalue="-noknown" falsevalue="" checked="false" label="Don't search for known motif enrichment"/>
+            <param argument="-mknown" type="data" optional="true" format="txt" label="Known motifs to check for enrichment"/>
+            <param argument="-nofacts" type="boolean" truevalue="-nofacts" falsevalue="" checked="false" label="Omit humor"/>
+            <param argument="-seqlogo" type="boolean" truevalue="-seqlogo" falsevalue="" checked="false" label="Use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now"/>
+        </section>
+        <section name="advanced" title="Advanced options" expanded="false">
+            <param name="norm" type="select" label="Sequence normalization options:">
+                <option value="-gc" selected="true">use GC% for sequence content normalization</option>
+                <option value="-cpg">use CpG% instead of GC% for sequence content normalization</option>
+                <option value="-noweight">no CG correction</option>
+            </param>
+            <param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Use hypergeometric for p-values, binomial is default"/>
+            <param argument="-N" type="integer" min="0" value="" optional="true" label="Number of sequences to use for motif finding, default=max(50k, 2x input)"/>
+            <param argument="-local" type="integer" min="0" value="0" label="local background size in bp for each side of regions" help="0 means no local background."/>
+            <param argument="-redundant" type="float" min="0" max="2" value="2" label="Remove redundant sequences matching greater than # fraction, i.e. -redundant 0.5"/>
+            <param argument="-maxN" type="float" min="0" max="1" value="0.7" label="maximum percentage of N's in sequence to consider for motif finding"/>
+            <param argument="-maskMotif" type="data" format="txt" multiple="true" optional="true" label="motifs to mask before motif finding"/>
+            <param argument="-opt" type="data" format="txt" multiple="true" optional="true" label="motifs to optimize or change length of"/>
+            <param argument="-rand" type="boolean" truevalue="-rand" falsevalue="" checked="false" label="randomize target and background sequences labels"/>
+            <param argument="-ref" optional="true" type="data" format="tabular,bed,encodepeak" label="use file for target and background - first argument is list of peak ids for targets"/>
+            <param argument="-oligo" type="boolean" truevalue="-oligo" falsevalue="" checked="false" label="Perform analysis of individual oligo enrichment"/>
+            <param argument="-fdr" type="integer" min="0" value="" label="Number of randomizations to calculate empirical FDR for de novo discovery" optional="true"/>
+            <conditional name="homer12">
+                <param name="version" type="select" label="Which homer version do you want to use">
+                    <option value="homer2" selected="true">homer2 (default)</option>
+                    <option value="homer1">homer1 (to force the use of the original homer)</option>
+                </param>
+                <when value="homer2">
+                    <param argument="-nlen" type="integer" min="0" value="3" label="length of lower-order oligos to normalize in background"/>
+                    <param argument="-nmax" type="integer" min="0" value="160" label="Max normalization iterations"/>
+                    <param argument="-neutral" type="boolean" truevalue="-neutral" falsevalue="" checked="false" label="weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc."/>
+                    <param argument="-olen" type="integer" min="0" value="" optional="true" label="lower-order oligo normalization for oligo table, use if -nlen isn't working well"/>
+                    <param argument="-e" type="float" min="0" max="1" value="0" label="" help="Maximum expected motif instance per bp in random sequence"/>
+                    <param argument="-quickMask" type="boolean" truevalue="-quickMask" falsevalue="" checked="false" label="skip full masking after finding motifs, similar to original homer"/>
+                    <param argument="-minlp" type="float" value="-10" label="stop looking for motifs when seed logp score gets above this number"/>
+                </when>
+                <when value="homer1">
+                    <param argument="-depth" type="select" label="time spent on local optimization default">
+                        <option value="low">low</option>
+                        <option value="med" selected="true">med</option>
+                        <option value="high">high</option>
+                        <option value="allnight">allnight</option>
+                    </param>
+                </when>
+            </conditional>
+        </section>
+    </inputs>
+    <outputs>
+        <data format="html" name="html_file" from_work_dir="outputKnown.html" label="${tool.name} on ${on_string}: Known motifs">
+            <filter>motif_options['noknown'] is False</filter>
+        </data>
+        <data format="html" name="html_homer_file" from_work_dir="outputHomer.html" label="${tool.name} on ${on_string}: De novo motifs">
+            <filter>nomotif is False</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2">
+            <param name="input" value="fake_phix_peaks.bed"/>
+            <conditional name="genome">
+                <param name="source" value="installed"/>
+                <param name="all_fasta_source" value="phiX174"/>
+            </conditional>
+            <output name="html_file" file="motif_test1/knownResults.html" ftype="html" lines_diff="2"/>
+            <output name="html_homer_file">
+                <assert_contents>
+                    <has_text text="fake_phix_peaks_bed_motif/ - Homer de novo Motif Results"/>
+                    <has_text text="Total target sequences = 1"/>
+                    <has_text text="Jaspar"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" value="CTCF_peaks_shifted.bed"/>
+            <conditional name="genome">
+                <param name="source" value="history"/>
+                <param name="fasta" value="chr2_subset.fa"/>
+            </conditional>
+            <output name="html_file">
+                <assert_contents>
+                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
+                    <has_text text="Total Target Sequences = 40"/>
+                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
+                </assert_contents>
+            </output>
+            <output name="html_homer_file">
+                <assert_contents>
+                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
+                    <has_text text="Total target sequences = 40"/>
+                    <has_text_matching expression="CTCF(Zf)|CTCF/MA|BORIS|CTCFL"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" value="CTCF_peaks_shifted.bed"/>
+            <param name="mask" value="true"/>
+            <conditional name="genome">
+                <param name="source" value="history"/>
+                <param name="fasta" value="chr2_subset.fa"/>
+            </conditional>
+            <output name="html_file">
+                <assert_contents>
+                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
+                    <has_text text="Total Target Sequences = 34"/>
+                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
+                </assert_contents>
+            </output>
+            <output name="html_homer_file">
+                <assert_contents>
+                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
+                    <has_text text="Total target sequences = 34"/>
+                    <has_text_matching expression="CTCF(Zf)|CTCF/MA|BORIS|CTCFL"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="CTCF_peaks_shifted.bed"/>
+            <conditional name="genome">
+                <param name="source" value="history"/>
+                <param name="fasta" value="chr2_subset.fa"/>
+            </conditional>
+            <section name="motif_options">
+                <param name="mset" value="plants"/>
+            </section>
+            <param name="nomotif" value="true"/>
+            <output name="html_file">
+                <assert_contents>
+                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
+                    <has_text text="Total Target Sequences = 40"/>
+                    <has_text text="RAP26"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+    This is a wrapper for findMotifsGenome.pl from HOMER but not all options are included.
+
+    Program will find de novo and known motifs in regions in the genome.
+
+Usage::
+
+    findMotifsGenome.pl <pos file> <genome> <output directory> [additional options]
+
+Example::
+
+    findMotifsGenome.pl peaks.txt mm8r peakAnalysis -size 200 -len 8
+
+Possible Genomes::
+
+                -- or --
+        Custom: provide the path to genome FASTA files (directory or single file)
+                Heads up: will create the directory "preparsed/" in same location.
+
+Basic options::
+
+        -mask (mask repeats/lower case sequence, can also add 'r' to genome, i.e. mm9r)
+        -bg <background position file> (genomic positions to be used as background, default=automatic)
+                removes background positions overlapping with target positions unless -keepOverlappingBg is used
+                -chopify (chop up large background regions to the avg size of target regions)
+        -len <#>[,<#>,<#>...] (motif length, default=8,10,12) [NOTE: values greater 12 may cause the program
+                to run out of memory - in these cases decrease the number of sequences analyzed (-N),
+                or try analyzing shorter sequence regions (i.e. -size 100)]
+        -size <#> (fragment size to use for motif finding, default=200)
+                -size <#,#> (i.e. -size -100,50 will get sequences from -100 to +50 relative from center)
+                -size given (uses the exact regions you give it)
+        -S <#> (Number of motifs to optimize, default: 25)
+        -mis <#> (global optimization: searches for strings with # mismatches, default: 2)
+        -norevopp (don't search reverse strand for motifs)
+        -nomotif (don't search for de novo motif enrichment)
+        -rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)
+
+Scanning sequence for motifs::
+
+        -find <motif file> (This will cause the program to only scan for motifs)
+
+Known Motif Options/Visualization::
+
+        -mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collects, default: auto)
+        -basic (just visualize de novo motifs, don't check similarity with known motifs)
+        -bits (scale sequence logos by information content, default: doesn't scale)
+        -nocheck (don't search for de novo vs. known motif similarity)
+        -mcheck <motif file> (known motifs to check against de novo motifs,
+        -float (allow adjustment of the degeneracy threshold for known motifs to improve p-value[dangerous])
+        -noknown (don't search for known motif enrichment, default: -known)
+        -mknown <motif file> (known motifs to check for enrichment,
+        -nofacts (omit humor)
+        -seqlogo (use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now)
+
+Sequence normalization options::
+
+        -gc (use GC% for sequence content normalization, now the default)
+        -cpg (use CpG% instead of GC% for sequence content normalization)
+        -noweight (no CG correction)
+        Also -nlen <#>, -olen <#>, see homer2 section below.
+
+Advanced options::
+
+        -h (use hypergeometric for p-values, binomial is default)
+        -N <#> (Number of sequences to use for motif finding, default=max(50k, 2x input)
+        -local <#> (use local background, # of equal size regions around peaks to use i.e. 2)
+        -redundant <#> (Remove redundant sequences matching greater than # percent, i.e. -redundant 0.5)
+        -maxN <#> (maximum percentage of N's in sequence to consider for motif finding, default: 0.7)
+        -maskMotif <motif file1> [motif file 2]... (motifs to mask before motif finding)
+        -opt <motif file1> [motif file 2]... (motifs to optimize or change length of)
+        -rand (randomize target and background sequences labels)
+        -ref <peak file> (use file for target and background - first argument is list of peak ids for targets)
+        -oligo (perform analysis of individual oligo enrichment)
+        -dumpFasta (Dump fasta files for target and background sequences for use with other programs)
+        -preparse (force new background files to be created)
+        -preparsedDir <directory> (location to search for preparsed file and/or place new files)
+        -keepFiles (keep temporary files)
+        -fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)
+
+homer2 specific options::
+
+        -homer2 (use homer2 instead of original homer, default)
+        -nlen <#> (length of lower-order oligos to normalize in background, default: -nlen 3)
+                -nmax <#> (Max normalization iterations, default: 160)
+                -neutral (weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc.)
+        -olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)
+        -p <#> (Number of processors to use, default: 1)
+        -e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)
+        -cache <#> (size in MB for statistics cache, default: 500)
+        -quickMask (skip full masking after finding motifs, similar to original homer)
+        -minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)
+
+Original homer specific options::
+
+        -homer1 (to force the use of the original homer)
+        -depth [low|med|high|allnight] (time spent on local optimization default: med)
+
+
+    ]]></help>
+    <expand macro="citation"/>
+</tool>