Mercurial > repos > iuc > homer_findmotifsgenome
diff homer_findMotifsGenome.xml @ 0:ec974e69e0b5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/homer commit e49d856e0181edc6120220a1b819cba2466a4289"
author | iuc |
---|---|
date | Sun, 08 Aug 2021 11:02:42 +0000 |
parents | |
children | 3126da33847c |
line wrap: on
line diff
<!--
    Galaxy wrapper for HOMER's findMotifsGenome.pl.

    The command template symlinks the peak file and the chosen genome FASTA
    into the working directory under sanitised names, runs findMotifsGenome.pl
    with the selected options, then copies the result HTML pages (and the whole
    HOMER output folder, for linked resources) to the Galaxy outputs.
-->
<tool id="homer_findMotifsGenome" name="findMotifsGenome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05" license="MIT">
    <description/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
## Taken from fastqc:
## Build a file name from the element identifier in which every character that
## is not a word character, hyphen or whitespace is replaced by an underscore.
#import re
#import os
#set input_name = re.sub('[^\w\-\s]', '_', str($input.element_identifier))
ln -s '${input}' '${input_name}' &&
## HOMER writes its results into this folder (third positional argument).
#set output = $input_name + '_motif'
## Process the genome:
## Whatever the source, the FASTA is linked into the working directory.
#if str( $genome.source ) == "installed":
    #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.all_fasta_source.fields.value)) + '.fa'
    ln -s '$genome.all_fasta_source.fields.path' '$genome_file' &&
#elif str( $genome.source ) == "preparsed":
    ## Keep the original base name of the preparsed FASTA.
    #set genome_file = os.path.split(str($genome.homer_preparse_source.fields.path_fasta))[-1]
    ln -s '$genome.homer_preparse_source.fields.path_fasta' '$genome_file' &&
#elif str( $genome.source ) == "history":
    #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.fasta.name)) + '.fa'
    ln -s '$genome.fasta' '$genome_file' &&
#end if
## Command:
findMotifsGenome.pl
## Peak:
'${input_name}'
## Genome:
'$genome_file'
## Output folder:
'${output}'
## Options
#if str( $genome.source ) == "preparsed":
    ## Mask setting and region size come from the data table entry.
    -preparsedDir '$genome.homer_preparse_source.fields.path'
    #if str( $genome.homer_preparse_source.fields.mask ) == 'True':
        -mask
    #end if
    #if str( $genome.choose_center.center ) == "centered":
        -size '$genome.homer_preparse_source.fields.size'
    #else
        ## Region passed as start,end where end = start + size.
        #set sizee = int($genome.choose_center.sizes) + int($genome.homer_preparse_source.fields.size)
        -size '$genome.choose_center.sizes','${sizee}'
    #end if
#else:
    #if $genome.mask
        -mask
    #end if
    #if $genome.fixed_size.size_fixed == "given":
        -size given
    #else:
        #if str( $genome.fixed_size.choose_center.center ) == "centered":
            -size '$genome.fixed_size.size'
        #else
            ## Region passed as start,end where end = start + size.
            #set sizee = int($genome.fixed_size.choose_center.sizes) + int($genome.fixed_size.size)
            -size '$genome.fixed_size.choose_center.sizes','${sizee}'
        #end if
    #end if
#end if
-len '$len'
-S $S
-mis $mis
$norevopp
$nomotif
$rna
-mset $motif_options.mset
$motif_options.basic
$motif_options.bits
$motif_options.nocheck
#if $motif_options.mcheck:
    -mcheck '$motif_options.mcheck'
#end if
$motif_options.noknown
#if $motif_options.mknown:
    -mknown '$motif_options.mknown'
#end if
$motif_options.nofacts
$motif_options.seqlogo
$advanced.norm
$advanced.h
## Optional integer: an unset value renders as an empty string.
#if str($advanced.N):
    -N $advanced.N
#end if
-local $advanced.local
-redundant $advanced.redundant
-maxN $advanced.maxN
## Multiple datasets are passed as separate, individually quoted arguments.
#if $advanced.maskMotif:
    -maskMotif
    #for $mask_motif_file in $advanced.maskMotif:
        '$mask_motif_file'
    #end for
#end if
#if $advanced.opt:
    -opt
    #for $opt_motif_file in $advanced.opt:
        '$opt_motif_file'
    #end for
#end if
$advanced.rand
#if $advanced.ref:
    -ref '$advanced.ref'
#end if
$advanced.oligo
#if $advanced.fdr:
    -fdr $advanced.fdr
#end if
#if str( $advanced.homer12.version ) == "homer2":
    -nlen '$advanced.homer12.nlen'
    -nmax '$advanced.homer12.nmax'
    $advanced.homer12.neutral
    ## Optional integer: only forwarded when the user supplied a value.
    #if str( $advanced.homer12.olen ):
        -olen $advanced.homer12.olen
    #end if
    -e '$advanced.homer12.e'
    $advanced.homer12.quickMask
    -minlp '$advanced.homer12.minlp'
#elif str( $advanced.homer12.version ) == "homer1":
    -depth '$advanced.homer12.depth'
#end if
## Collect results: the HTML page becomes the dataset, the whole output folder
## is copied next to it so that relative links in the page keep working.
#if not $nomotif:
    && cp '${output}'/homerResults.html outputHomer.html
    && cp -r '${output}' '${html_homer_file.files_path}'
#end if
#if not $motif_options.noknown:
    && cp '${output}'/knownResults.html outputKnown.html
    && cp -r '${output}' '${html_file.files_path}'
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="bed,encodepeak,tabular" label="Peak file"/>
        <!-- Where the reference genome comes from; each branch provides its own size/mask parameters. -->
        <conditional name="genome">
            <param name="source" type="select" label="Will you select a reference genome from your history or use an installed genome?">
                <option value="preparsed">Preparsed (fasta is available and has been preparsed to specific size)</option>
                <option value="installed">Installed (fasta is available but will be preparsed at run time)</option>
                <option value="history">From History (fasta will be preparsed at run time)</option>
            </param>
            <when value="preparsed">
                <param name="homer_preparse_source" type="select" label="Preparsed FASTA">
                    <options from_data_table="homer_preparse">
                        <filter type="sort_by" column="2"/>
                        <filter type="static_value" column="version" value="@IDX_VERSION@"/>
                        <validator type="no_options" message="No preparsed genomes are available"/>
                    </options>
                </param>
                <expand macro="choose_center"/>
            </when>
            <when value="installed">
                <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
                    <options from_data_table="all_fasta">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No references are available"/>
                    </options>
                </param>
                <expand macro="mask_size"/>
            </when>
            <when value="history">
                <param name="fasta" type="data" format="fasta" label="Select reference genome"/>
                <expand macro="mask_size"/>
            </when>
        </conditional>
        <param argument="-len" type="text" value="8,10,12" label="comma-separated motif lengths" help="values greater 12 may cause the program to run out of memory - in these cases decrease the number of sequences analyzed (-N), or try analyzing shorter sequence regions (i.e. -size 100)">
            <validator type="regex" message="motif lengths must be comma-separated integers without space">^(\d+,)*(\d+)$</validator>
        </param>
        <param argument="-S" type="integer" min="1" value="25" label="Number of motifs to find"/>
        <param argument="-mis" type="integer" min="0" value="2" label="Number of mismatches during global optimisation"/>
        <param argument="-norevopp" type="boolean" truevalue="-norevopp" falsevalue="" checked="false" label="Don't search reverse strand for motifs"/>
        <param argument="-nomotif" type="boolean" truevalue="-nomotif" falsevalue="" checked="false" label="Don't search for de novo motif enrichment"/>
        <param argument="-rna" type="boolean" truevalue="-rna" falsevalue="" checked="false" label="output RNA motif logos and compare to RNA motif database" help="automatically sets -norevopp"/>
        <section name="motif_options" title="Known Motif Options/Visualization" expanded="False">
            <param argument="-mset" type="select" label="Check against motif collects">
                <option value="auto" selected="True">automatic</option>
                <option value="vertebrates">vertebrates</option>
                <option value="insects">insects</option>
                <option value="worms">worms</option>
                <option value="plants">plants</option>
                <option value="yeast">yeast</option>
                <option value="all">all</option>
            </param>
            <param argument="-basic" type="boolean" truevalue="-basic" falsevalue="" checked="false" label="Just visualize de novo motifs, don't check similarity with known motifs"/>
            <param argument="-bits" type="boolean" truevalue="-bits" falsevalue="" checked="false" label="Scale sequence logos by information content" help="Default: logos are not scaled"/>
            <param argument="-nocheck" type="boolean" truevalue="-nocheck" falsevalue="" checked="false" label="Don't search for de novo vs. known motif similarity"/>
            <param argument="-mcheck" type="data" optional="true" format="txt" label="known motifs to check against de novo motifs"/>
            <param argument="-noknown" type="boolean" truevalue="-noknown" falsevalue="" checked="false" label="Don't search for known motif enrichment"/>
            <param argument="-mknown" type="data" optional="true" format="txt" label="Known motifs to check for enrichment"/>
            <param argument="-nofacts" type="boolean" truevalue="-nofacts" falsevalue="" checked="false" label="Omit humor"/>
            <param argument="-seqlogo" type="boolean" truevalue="-seqlogo" falsevalue="" checked="false" label="Use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now"/>
        </section>
        <section name="advanced" title="Advanced options" expanded="false">
            <!-- The selected option value is itself the command line flag. -->
            <param name="norm" type="select" label="Sequence normalization options:">
                <option value="-gc" selected="true">use GC% for sequence content normalization</option>
                <option value="-cpg">use CpG% instead of GC% for sequence content normalization</option>
                <option value="-noweight">no CG correction</option>
            </param>
            <param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Use hypergeometric for p-values, binomial is default"/>
            <param argument="-N" type="integer" min="0" value="" optional="true" label="Number of sequences to use for motif finding, default=max(50k, 2x input)"/>
            <param argument="-local" type="integer" min="0" value="0" label="local background size in bp for each side of regions" help="0 means no local background."/>
            <param argument="-redundant" type="float" min="0" max="2" value="2" label="Remove redundant sequences matching greater than # fraction, i.e. -redundant 0.5"/>
            <param argument="-maxN" type="float" min="0" max="1" value="0.7" label="maximum percentage of N's in sequence to consider for motif finding"/>
            <param argument="-maskMotif" type="data" format="txt" multiple="true" optional="true" label="motifs to mask before motif finding"/>
            <param argument="-opt" type="data" format="txt" multiple="true" optional="true" label="motifs to optimize or change length of"/>
            <param argument="-rand" type="boolean" truevalue="-rand" falsevalue="" checked="false" label="randomize target and background sequences labels"/>
            <param argument="-ref" optional="true" type="data" format="tabular,bed,encodepeak" label="use file for target and background - first argument is list of peak ids for targets"/>
            <param argument="-oligo" type="boolean" truevalue="-oligo" falsevalue="" checked="false" label="Perform analysis of individual oligo enrichment"/>
            <param argument="-fdr" type="integer" min="0" value="" label="Number of randomizations to calculate empirical FDR for de novo discovery" optional="true"/>
            <conditional name="homer12">
                <param name="version" type="select" label="Which homer version do you want to use">
                    <option value="homer2" selected="true">homer2 (default)</option>
                    <option value="homer1">homer1 (to force the use of the original homer)</option>
                </param>
                <when value="homer2">
                    <param argument="-nlen" type="integer" min="0" value="3" label="length of lower-order oligos to normalize in background"/>
                    <param argument="-nmax" type="integer" min="0" value="160" label="Max normalization iterations"/>
                    <param argument="-neutral" type="boolean" truevalue="-neutral" falsevalue="" checked="false" label="weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc."/>
                    <param argument="-olen" type="integer" min="0" value="" optional="true" label="lower-order oligo normalization for oligo table, use if -nlen isn't working well"/>
                    <!-- NOTE(review): the help section below quotes a HOMER default of 0.01 for this option while the wrapper always passes 0 unless changed; confirm which default is intended. -->
                    <param argument="-e" type="float" min="0" max="1" value="0" label="Maximum expected motif instance per bp in random sequence"/>
                    <param argument="-quickMask" type="boolean" truevalue="-quickMask" falsevalue="" checked="false" label="skip full masking after finding motifs, similar to original homer"/>
                    <param argument="-minlp" type="float" value="-10" label="stop looking for motifs when seed logp score gets above this number"/>
                </when>
                <when value="homer1">
                    <param argument="-depth" type="select" label="time spent on local optimization default">
                        <option value="low">low</option>
                        <option value="med" selected="true">med</option>
                        <option value="high">high</option>
                        <option value="allnight">allnight</option>
                    </param>
                </when>
            </conditional>
        </section>
    </inputs>
    <outputs>
        <!-- Each report is only created when the matching analysis has not been switched off. -->
        <data format="html" name="html_file" from_work_dir="outputKnown.html" label="${tool.name} on ${on_string}: Known motifs">
            <filter>motif_options['noknown'] is False</filter>
        </data>
        <data format="html" name="html_homer_file" from_work_dir="outputHomer.html" label="${tool.name} on ${on_string}: De novo motifs">
            <filter>nomotif is False</filter>
        </data>
    </outputs>
    <tests>
        <!-- Installed genome, default options. -->
        <test expect_num_outputs="2">
            <param name="input" value="fake_phix_peaks.bed"/>
            <conditional name="genome">
                <param name="source" value="installed"/>
                <param name="all_fasta_source" value="phiX174"/>
            </conditional>
            <output name="html_file" file="motif_test1/knownResults.html" ftype="html" lines_diff="2"/>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="fake_phix_peaks_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 1"/>
                    <has_text text="Jaspar"/>
                </assert_contents>
            </output>
        </test>
        <!-- Genome from history, default options. -->
        <test expect_num_outputs="2">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa"/>
            </conditional>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 40"/>
                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
                </assert_contents>
            </output>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 40"/>
                    <!-- Parentheses are escaped so that the literal text CTCF(Zf) is matched. -->
                    <has_text_matching expression="CTCF\(Zf\)|CTCF/MA|BORIS|CTCFL"/>
                </assert_contents>
            </output>
        </test>
        <!-- Genome from history with repeat masking enabled. -->
        <test expect_num_outputs="2">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa"/>
                <!-- The command reads this value as $genome.mask, so it is set inside the conditional. -->
                <param name="mask" value="true"/>
            </conditional>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 34"/>
                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
                </assert_contents>
            </output>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 34"/>
                    <has_text_matching expression="CTCF\(Zf\)|CTCF/MA|BORIS|CTCFL"/>
                </assert_contents>
            </output>
        </test>
        <!-- De novo search disabled: only the known motif report is produced. -->
        <test expect_num_outputs="1">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa"/>
            </conditional>
            <section name="motif_options">
                <param name="mset" value="plants"/>
            </section>
            <param name="nomotif" value="true"/>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 40"/>
                    <has_text text="RAP26"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

    This is a wrapper for findMotifsGenome.pl from HOMER but not all options are included.

    Program will find de novo and known motifs in regions in the genome.

Usage::

    findMotifsGenome.pl <pos file> <genome> <output directory> [additional options]

Example::

    findMotifsGenome.pl peaks.txt mm8r peakAnalysis -size 200 -len 8

Possible Genomes::

    -- or --
    Custom: provide the path to genome FASTA files (directory or single file)
    Heads up: will create the directory "preparsed/" in same location.

Basic options::

    -mask (mask repeats/lower case sequence, can also add 'r' to genome, i.e. mm9r)
    -bg <background position file> (genomic positions to be used as background, default=automatic)
        removes background positions overlapping with target positions unless -keepOverlappingBg is used
    -chopify (chop up large background regions to the avg size of target regions)
    -len <#>[,<#>,<#>...] (motif length, default=8,10,12) [NOTE: values greater 12 may cause the program
        to run out of memory - in these cases decrease the number of sequences analyzed (-N),
        or try analyzing shorter sequence regions (i.e. -size 100)]
    -size <#> (fragment size to use for motif finding, default=200)
    -size <#,#> (i.e. -size -100,50 will get sequences from -100 to +50 relative from center)
    -size given (uses the exact regions you give it)
    -S <#> (Number of motifs to optimize, default: 25)
    -mis <#> (global optimization: searches for strings with # mismatches, default: 2)
    -norevopp (don't search reverse strand for motifs)
    -nomotif (don't search for de novo motif enrichment)
    -rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)

Scanning sequence for motifs::

    -find <motif file> (This will cause the program to only scan for motifs)

Known Motif Options/Visualization::

    -mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collects, default: auto)
    -basic (just visualize de novo motifs, don't check similarity with known motifs)
    -bits (scale sequence logos by information content, default: doesn't scale)
    -nocheck (don't search for de novo vs. known motif similarity)
    -mcheck <motif file> (known motifs to check against de novo motifs)
    -float (allow adjustment of the degeneracy threshold for known motifs to improve p-value[dangerous])
    -noknown (don't search for known motif enrichment, default: -known)
    -mknown <motif file> (known motifs to check for enrichment)
    -nofacts (omit humor)
    -seqlogo (use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now)

Sequence normalization options::

    -gc (use GC% for sequence content normalization, now the default)
    -cpg (use CpG% instead of GC% for sequence content normalization)
    -noweight (no CG correction)
    Also -nlen <#>, -olen <#>, see homer2 section below.

Advanced options::

    -h (use hypergeometric for p-values, binomial is default)
    -N <#> (Number of sequences to use for motif finding, default=max(50k, 2x input))
    -local <#> (use local background, # of equal size regions around peaks to use i.e. 2)
    -redundant <#> (Remove redundant sequences matching greater than # percent, i.e. -redundant 0.5)
    -maxN <#> (maximum percentage of N's in sequence to consider for motif finding, default: 0.7)
    -maskMotif <motif file1> [motif file 2]... (motifs to mask before motif finding)
    -opt <motif file1> [motif file 2]... (motifs to optimize or change length of)
    -rand (randomize target and background sequences labels)
    -ref <peak file> (use file for target and background - first argument is list of peak ids for targets)
    -oligo (perform analysis of individual oligo enrichment)
    -dumpFasta (Dump fasta files for target and background sequences for use with other programs)
    -preparse (force new background files to be created)
    -preparsedDir <directory> (location to search for preparsed file and/or place new files)
    -keepFiles (keep temporary files)
    -fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)

homer2 specific options::

    -homer2 (use homer2 instead of original homer, default)
    -nlen <#> (length of lower-order oligos to normalize in background, default: -nlen 3)
    -nmax <#> (Max normalization iterations, default: 160)
    -neutral (weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc.)
    -olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)
    -p <#> (Number of processors to use, default: 1)
    -e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)
    -cache <#> (size in MB for statistics cache, default: 500)
    -quickMask (skip full masking after finding motifs, similar to original homer)
    -minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)

Original homer specific options::

    -homer1 (to force the use of the original homer)
    -depth [low|med|high|allnight] (time spent on local optimization default: med)


    ]]></help>
    <expand macro="citation"/>
</tool>