Mercurial > repos > iuc > homer_findmotifsgenome
view homer_findMotifsGenome.xml @ 4:081139509124 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/homer commit e6639e130c5bf50f489dd09e38a1cbc712280fff
author | iuc |
---|---|
date | Fri, 07 Apr 2023 15:01:41 +0000 |
parents | 4fe92af4542b |
children |
line wrap: on
line source
<tool id="homer_findMotifsGenome" name="findMotifsGenome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
    <description/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <!--
        Command layout:
          1. symlink the peak file and the genome FASTA under sanitised names,
          2. call findMotifsGenome.pl <peaks> <genome> <output dir> [options],
          3. copy the HTML reports and motif folders to the locations the
             <outputs> section collects them from.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Taken from fastqc: derive a shell-safe name from the element identifier
        #import re
        #import os
        #set input_name = re.sub('[^\w\-\s]', '_', str($input.element_identifier))
        ln -s '${input}' '${input_name}' &&
        ## HOMER writes all of its results into this folder
        #set output = $input_name + '_motif'

        ## Process the genome:
        #if str( $genome.source ) == "installed":
            #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.all_fasta_source.fields.value)) + '.fa'
            ln -s '$genome.all_fasta_source.fields.path' '$genome_file' &&
        #elif str( $genome.source ) == "preparsed":
            #set genome_file = os.path.split(str($genome.homer_preparse_source.fields.path_fasta))[-1]
            ln -s '$genome.homer_preparse_source.fields.path_fasta' '$genome_file' &&
        #elif str( $genome.source ) == "history":
            #set genome_file = re.sub('[^\w\-\s]', '_', str($genome.fasta.name)) + '.fa'
            ln -s '$genome.fasta' '$genome_file' &&
        #end if

        ## Command:
        findMotifsGenome.pl
        ## Peak:
        '${input_name}'
        ## Genome:
        '$genome_file'
        ## Output folder:
        '${output}'
        ## Options
        #if str( $genome.source ) == "preparsed":
            ## Mask and size are dictated by the preparsed data table entry
            -preparsedDir '$genome.homer_preparse_source.fields.path'
            #if str( $genome.homer_preparse_source.fields.mask ) == 'True':
                -mask
            #end if
            #if str( $genome.choose_center.center ) == "centered":
                -size '$genome.homer_preparse_source.fields.size'
            #else
                ## -size <start>,<end>: end = start offset + region size
                #set sizee = int($genome.choose_center.sizes) + int($genome.homer_preparse_source.fields.size)
                -size '$genome.choose_center.sizes','${sizee}'
            #end if
        #else:
            #if $genome.mask
                -mask
            #end if
            #if $genome.fixed_size.size_fixed == "given":
                -size given
            #else:
                #if str( $genome.fixed_size.choose_center.center ) == "centered":
                    -size '$genome.fixed_size.size'
                #else
                    ## -size <start>,<end>: end = start offset + region size
                    #set sizee = int($genome.fixed_size.choose_center.sizes) + int($genome.fixed_size.size)
                    -size '$genome.fixed_size.choose_center.sizes','${sizee}'
                #end if
            #end if
        #end if

        ## Every background mode except "none" provides a background peak file
        #if str( $background.use ) != "none":
            -bg '$background.bg'
        #end if
        #if str( $background.use ) == "withbg":
            -keepOverlappingBg
        #elif str( $background.use ) == "chopify":
            -chopify
        #end if

        -len '$len'
        -S $S
        -mis $mis
        $norevopp
        $nomotif
        $rna

        -mset $motif_options.mset
        $motif_options.basic
        $motif_options.bits
        $motif_options.nocheck
        #if $motif_options.mcheck:
            -mcheck '$motif_options.mcheck'
        #end if
        $motif_options.noknown
        #if $motif_options.mknown:
            -mknown '$motif_options.mknown'
        #end if
        $motif_options.nofacts
        $motif_options.seqlogo

        $advanced.norm
        $advanced.h
        ## str() so that an explicit 0 is still passed on
        #if str($advanced.N):
            -N $advanced.N
        #end if
        -local $advanced.local
        -redundant $advanced.redundant
        -maxN $advanced.maxN
        ## -maskMotif and -opt accept several files as separate arguments; a
        ## multiple-dataset parameter would otherwise render as one comma-joined string
        #if $advanced.maskMotif:
            -maskMotif
            #for $motif_file in $advanced.maskMotif:
                '$motif_file'
            #end for
        #end if
        #if $advanced.opt:
            -opt
            #for $motif_file in $advanced.opt:
                '$motif_file'
            #end for
        #end if
        $advanced.rand
        #if $advanced.ref:
            -ref '$advanced.ref'
        #end if
        $advanced.oligo
        #if $advanced.fdr:
            -fdr $advanced.fdr
        #end if
        #if str( $advanced.homer12.version ) == "homer2":
            -nlen '$advanced.homer12.nlen'
            -nmax '$advanced.homer12.nmax'
            $advanced.homer12.neutral
            ## optional parameter: only forwarded when the user filled it in
            #if str($advanced.homer12.olen):
                -olen $advanced.homer12.olen
            #end if
            -e '$advanced.homer12.e'
            $advanced.homer12.quickMask
            -minlp '$advanced.homer12.minlp'
        #elif str( $advanced.homer12.version ) == "homer1":
            -homer1
            -depth '$advanced.homer12.depth'
        #end if

        ## Collect results.
        ## De novo results only exist when -nomotif is not set, whatever the background mode
        #if not $nomotif:
            && cp '${output}/homerResults.html' outputHomer.html
            && cp -r '${output}' '${html_homer_file.files_path}'
            && cp -r '${output}/homerResults' homerResults
        #end if
        ## Known motif results are only exposed as outputs when no background is used
        #if str( $background.use ) == "none" and not $motif_options.noknown:
            && cp '${output}/knownResults.html' outputKnown.html
            && cp -r '${output}' '${html_file.files_path}'
            && cp -r '${output}/knownResults' knownResults
        #end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="bed,encodepeak,tabular" label="Peak file"/>
<conditional name="genome">
            <!-- Where the reference genome comes from; each branch adds its own size/mask controls via macros -->
            <param name="source" type="select" label="Will you select a reference genome from your history or use an installed genome?">
                <option value="preparsed">Preparsed (fasta is available and has been preparsed to specific size)</option>
                <option value="installed">Installed (fasta is available but will be preparsed at run time)</option>
                <option value="history">From History (fasta will be preparsed at run time)</option>
            </param>
            <when value="preparsed">
                <param name="homer_preparse_source" type="select" label="Preparsed FASTA">
                    <options from_data_table="homer_preparse">
                        <filter type="sort_by" column="2"/>
                        <filter type="static_value" column="version" value="@IDX_VERSION@"/>
                        <validator type="no_options" message="No preparsed genomes are available"/>
                    </options>
                </param>
                <expand macro="choose_center"/>
            </when>
            <when value="installed">
                <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
                    <options from_data_table="all_fasta">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No references are available"/>
                    </options>
                </param>
                <expand macro="mask_size"/>
            </when>
            <when value="history">
                <param name="fasta" type="data" format="fasta" label="Select reference genome"/>
                <expand macro="mask_size"/>
            </when>
        </conditional>
        <!-- Background model: every mode except "none" requires a background peak file (-bg) -->
        <conditional name="background">
            <param name="use" type="select" label="Will you use a background model?">
                <option value="none" selected="true">No background model</option>
                <option value="model">Background model with regions removed from target Peak file</option>
                <option value="withbg">Background model with regions not removed from target Peak file</option>
                <option value="chopify">Chop large background model regions to the average size of target Peak file regions</option>
            </param>
            <when value="none"></when>
            <when value="model">
                <param argument="-bg" type="data" format="bed,encodepeak,tabular" label="Background Peak file"/>
            </when>
            <when value="withbg">
                <param argument="-bg" type="data" format="bed,encodepeak,tabular" label="Background Peak file"/>
            </when>
            <when value="chopify">
                <param argument="-bg" type="data" format="bed,encodepeak,tabular" label="Background Peak file"/>
            </when>
        </conditional>
        <!-- Basic options -->
        <param argument="-len" type="text" value="8,10,12" label="comma-separated motif lengths" help="values greater 12 may cause the program to run out of memory - in these cases decrease the number of sequences analyzed (-N), or try analyzing shorter sequence regions (i.e. -size 100)">
            <validator type="regex" message="motif lengths must be comma-separated integers without space">^(\d+,)*(\d+)$</validator>
        </param>
        <param argument="-S" type="integer" min="1" value="25" label="Number of motifs to find"/>
        <param argument="-mis" type="integer" min="0" value="2" label="Number of mismatches during global optimisation"/>
        <param argument="-norevopp" type="boolean" truevalue="-norevopp" falsevalue="" checked="false" label="Don't search reverse strand for motifs"/>
        <param argument="-nomotif" type="boolean" truevalue="-nomotif" falsevalue="" checked="false" label="Don't search for de novo motif enrichment"/>
        <param argument="-rna" type="boolean" truevalue="-rna" falsevalue="" checked="false" label="output RNA motif logos and compare to RNA motif database" help="automatically sets -norevopp"/>
        <section name="motif_options" title="Known Motif Options/Visualization" expanded="False">
            <param argument="-mset" type="select" label="Check against motif collects">
                <option value="auto" selected="True">automatic</option>
                <option value="vertebrates">vertebrates</option>
                <option value="insects">insects</option>
                <option value="worms">worms</option>
                <option value="plants">plants</option>
                <option value="yeast">yeast</option>
                <option value="all">all</option>
            </param>
            <param argument="-basic" type="boolean" truevalue="-basic" falsevalue="" checked="false" label="Just visualize de novo motifs, don't check similarity with known motifs"/>
            <param argument="-bits" type="boolean" truevalue="-bits" falsevalue="" checked="false" label="Scale sequence logos by information content" help="By default logos are not scaled"/>
            <param argument="-nocheck" type="boolean" truevalue="-nocheck" falsevalue="" checked="false" label="Don't search for de novo vs. known motif similarity"/>
            <param argument="-mcheck" type="data" optional="true" format="txt" label="known motifs to check against de novo motifs"/>
            <param argument="-noknown" type="boolean" truevalue="-noknown" falsevalue="" checked="false" label="Don't search for known motif enrichment"/>
            <param argument="-mknown" type="data" optional="true" format="txt" label="Known motifs to check for enrichment"/>
            <param argument="-nofacts" type="boolean" truevalue="-nofacts" falsevalue="" checked="false" label="Omit humor"/>
            <param argument="-seqlogo" type="boolean" truevalue="-seqlogo" falsevalue="" checked="false" label="Use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now"/>
        </section>
        <section name="advanced" title="Advanced options" expanded="false">
            <!-- The option value is the literal command-line flag -->
            <param name="norm" type="select" label="Sequence normalization options:">
                <option value="-gc" selected="true">use GC% for sequence content normalization</option>
                <option value="-cpg">use CpG% instead of GC% for sequence content normalization</option>
                <option value="-noweight">no CG correction</option>
            </param>
            <param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Use hypergeometric for p-values, binomial is default"/>
            <param argument="-N" type="integer" min="0" value="" optional="true" label="Number of sequences to use for motif finding, default=max(50k, 2x input)"/>
            <param argument="-local" type="integer" min="0" value="0" label="local background size in bp for each side of regions" help="0 means no local background."/>
            <param argument="-redundant" type="float" min="0" max="2" value="2" label="Remove redundant sequences matching greater than # fraction, i.e. -redundant 0.5"/>
            <param argument="-maxN" type="float" min="0" max="1" value="0.7" label="maximum percentage of N's in sequence to consider for motif finding"/>
            <param argument="-maskMotif" type="data" format="txt" multiple="true" optional="true" label="motifs to mask before motif finding"/>
            <param argument="-opt" type="data" format="txt" multiple="true" optional="true" label="motifs to optimize or change length of"/>
            <param argument="-rand" type="boolean" truevalue="-rand" falsevalue="" checked="false" label="randomize target and background sequences labels"/>
            <param argument="-ref" optional="true" type="data" format="tabular,bed,encodepeak" label="use file for target and background - first argument is list of peak ids for targets"/>
            <param argument="-oligo" type="boolean" truevalue="-oligo" falsevalue="" checked="false" label="Perform analysis of individual oligo enrichment"/>
            <param argument="-fdr" type="integer" min="0" value="" label="Number of randomizations to calculate empirical FDR for de novo discovery" optional="true"/>
            <conditional name="homer12">
                <param name="version" type="select" label="Which homer version do you want to use">
                    <option value="homer2" selected="true">homer2 (default)</option>
                    <option value="homer1">homer1 (to force the use of the original homer)</option>
                </param>
                <when value="homer2">
                    <param argument="-nlen" type="integer" min="0" value="3" label="length of lower-order oligos to normalize in background"/>
                    <param argument="-nmax" type="integer" min="0" value="160" label="Max normalization iterations"/>
                    <param argument="-neutral" type="boolean" truevalue="-neutral" falsevalue="" checked="false" label="weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc."/>
                    <param argument="-olen" type="integer" min="0" value="" optional="true" label="lower-order oligo normalization for oligo table, use if -nlen isn't working well"/>
                    <param argument="-e" type="float" min="0" max="1" value="0" label="Maximum expected motif instance per bp in random sequence" help="HOMER's own default is 0.01"/>
                    <param argument="-quickMask" type="boolean" truevalue="-quickMask" falsevalue="" checked="false" label="skip full masking after finding motifs, similar to original homer"/>
                    <param argument="-minlp" type="float" value="-10" label="stop looking for motifs when seed logp score gets above this number"/>
                </when>
                <when value="homer1">
                    <param argument="-depth" type="select" label="time spent on local optimization default">
                        <option value="low">low</option>
                        <option value="med" selected="true">med</option>
                        <option value="high">high</option>
                        <option value="allnight">allnight</option>
                    </param>
                </when>
            </conditional>
        </section>
    </inputs>
    <outputs>
        <!-- Known motif outputs: only when known motif search is enabled and no background model is used -->
        <data format="html" name="html_file" from_work_dir="outputKnown.html" label="${tool.name} on ${on_string}: Known motifs">
            <filter>motif_options['noknown'] is False</filter>
            <filter>background['use'] == "none"</filter>
        </data>
        <collection name="output_collection_known" type="list" label="${tool.name} on ${on_string}: Known motifs files">
            <discover_datasets directory="knownResults" pattern="(?P&lt;designation&gt;.+)\.motif" format="txt" visible="false"/>
            <filter>motif_options['noknown'] is False</filter>
            <filter>background['use'] == "none"</filter>
        </collection>
        <!-- De novo motif outputs: only when de novo search is enabled -->
        <data format="html" name="html_homer_file" from_work_dir="outputHomer.html" label="${tool.name} on ${on_string}: De novo motifs">
            <filter>nomotif is False</filter>
        </data>
        <collection name="output_collection_de_novo" type="list" label="${tool.name} on ${on_string}: De novo motifs files">
            <discover_datasets directory="homerResults" pattern="(?P&lt;designation&gt;.+)\.motif" format="txt" visible="false"/>
            <filter>nomotif is False</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Installed genome, default options -->
        <test expect_num_outputs="4">
            <param name="input" value="fake_phix_peaks.bed"/>
            <conditional name="genome">
                <param name="source" value="installed"/>
                <param name="all_fasta_source" value="phiX174"/>
            </conditional>
            <output name="html_file" file="motif_test1/knownResults.html" ftype="html" lines_diff="2"/>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="fake_phix_peaks_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 1"/>
                    <!-- Disabled: the reported matches are too unpredictable -->
                    <!-- <has_text text="Jaspar"/> -->
                </assert_contents>
            </output>
            <output_collection name="output_collection_known" type="list" count="0">
            </output_collection>
            <output_collection name="output_collection_de_novo" type="list">
                <element name="motif1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Genome from history, default options -->
        <test expect_num_outputs="4">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa.gz"/>
            </conditional>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 24"/>
                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
                </assert_contents>
            </output>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 24"/>
                    <!-- parentheses are escaped so that the literal text "CTCF(Zf)" is matched -->
                    <has_text_matching expression="CTCF\(Zf\)|CTCF/MA|BORIS|CTCFL"/>
                </assert_contents>
            </output>
            <output_collection name="output_collection_known" type="list">
                <element name="known1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_collection_de_novo" type="list">
                <element name="motif1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Genome from history with -mask -->
        <test expect_num_outputs="4">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa.gz"/>
                <!-- mask belongs to the genome conditional (it is read as $genome.mask in the command) -->
                <param name="mask" value="true"/>
            </conditional>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 18"/>
                    <has_text text="CTCF(Zf)/CD4+-CTCF-ChIP-Seq(Barski_et_al.)/Homer"/>
                </assert_contents>
            </output>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif/ - Homer de novo Motif Results"/>
                    <has_text text="Total target sequences = 18"/>
                    <!-- Disabled: the reported matches are too unpredictable -->
                    <!-- <has_text_matching expression="CTCF(Zf)|CTCF/MA|BORIS|CTCFL|NFATC2"/> -->
                </assert_contents>
            </output>
            <output_collection name="output_collection_known" type="list">
                <element name="known1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_collection_de_novo" type="list">
                <element name="motif1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Known motifs only (-nomotif) with the plants motif set -->
        <test expect_num_outputs="2">
            <param name="input" value="CTCF_peaks_shifted.bed"/>
            <conditional name="genome">
                <param name="source" value="history"/>
                <param name="fasta" value="chr2_subset.fa.gz"/>
            </conditional>
            <section name="motif_options">
                <param name="mset" value="plants"/>
            </section>
            <param name="nomotif" value="true"/>
            <output name="html_file">
                <assert_contents>
                    <has_text text="CTCF_peaks_shifted_bed_motif - Homer Known Motif Enrichment Results"/>
                    <has_text text="Total Target Sequences = 24"/>
                </assert_contents>
            </output>
            <output_collection name="output_collection_known" type="list" count="0">
            </output_collection>
        </test>
        <!-- background tests -->
        <test expect_num_outputs="2">
            <param name="input" value="fake_phix_peaks.bed"/>
            <conditional name="genome">
                <param name="source" value="installed"/>
                <param name="all_fasta_source" value="phiX174"/>
            </conditional>
            <conditional name="background">
                <param name="use" value="withbg"/>
                <param name="bg" value="fake_phix_peaks.subset.bed"/>
            </conditional>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="HOXB13/MA0901.2/Jaspar(0.948)"/>
                    <has_text text="Yeast"/>
                </assert_contents>
            </output>
            <output_collection name="output_collection_de_novo" type="list">
                <element name="motif1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="2">
            <param name="input" value="fake_phix_peaks.bed"/>
            <conditional name="genome">
                <param name="source" value="installed"/>
                <param name="all_fasta_source" value="phiX174"/>
            </conditional>
            <conditional name="background">
                <param name="use" value="chopify"/>
                <param name="bg" value="fake_phix_peaks.subset.bed"/>
            </conditional>
            <output name="html_homer_file">
                <assert_contents>
                    <has_text text="CCA"/>
                    <has_text text="YAP5"/>
                </assert_contents>
            </output>
            <output_collection name="output_collection_de_novo" type="list">
                <element name="motif1">
                    <assert_contents>
                        <has_text text="&gt;"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

This is a wrapper for findMotifsGenome.pl from HOMER but not all options are included.

Program will find de novo and known motifs in regions in the genome.

Usage::

    findMotifsGenome.pl <pos file> <genome> <output directory> [additional options]

Example::

    findMotifsGenome.pl peaks.txt mm8r peakAnalysis -size 200 -len 8

Possible Genomes::

    -- or --
    Custom: provide the path to genome FASTA files (directory or single file)
    Heads up: will create the directory "preparsed/" in same location.

Basic options::

    -mask (mask repeats/lower case sequence, can also add 'r' to genome, i.e. mm9r)
    -bg <background position file> (genomic positions to be used as background, default=automatic)
        removes background positions overlapping with target positions unless -keepOverlappingBg is used
    -chopify (chop up large background regions to the avg size of target regions)
    -len <#>[,<#>,<#>...] (motif length, default=8,10,12)
        [NOTE: values greater 12 may cause the program to run out of memory - in these cases
        decrease the number of sequences analyzed (-N), or try analyzing shorter sequence
        regions (i.e. -size 100)]
    -size <#> (fragment size to use for motif finding, default=200)
    -size <#,#> (i.e. -size -100,50 will get sequences from -100 to +50 relative from center)
    -size given (uses the exact regions you give it)
    -S <#> (Number of motifs to optimize, default: 25)
    -mis <#> (global optimization: searches for strings with # mismatches, default: 2)
    -norevopp (don't search reverse strand for motifs)
    -nomotif (don't search for de novo motif enrichment)
    -rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)

Scanning sequence for motifs::

    -find <motif file> (This will cause the program to only scan for motifs)

Known Motif Options/Visualization::

    -mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collects, default: auto)
    -basic (just visualize de novo motifs, don't check similarity with known motifs)
    -bits (scale sequence logos by information content, default: doesn't scale)
    -nocheck (don't search for de novo vs. known motif similarity)
    -mcheck <motif file> (known motifs to check against de novo motifs)
    -float (allow adjustment of the degeneracy threshold for known motifs to improve p-value[dangerous])
    -noknown (don't search for known motif enrichment, default: -known)
    -mknown <motif file> (known motifs to check for enrichment)
    -nofacts (omit humor)
    -seqlogo (use weblogo/seqlogo/ghostscript to generate logos, default uses SVG now)

Sequence normalization options::

    -gc (use GC% for sequence content normalization, now the default)
    -cpg (use CpG% instead of GC% for sequence content normalization)
    -noweight (no CG correction)
    Also -nlen <#>, -olen <#>, see homer2 section below.

Advanced options::

    -h (use hypergeometric for p-values, binomial is default)
    -N <#> (Number of sequences to use for motif finding, default=max(50k, 2x input))
    -local <#> (use local background, # of equal size regions around peaks to use i.e. 2)
    -redundant <#> (Remove redundant sequences matching greater than # percent, i.e. -redundant 0.5)
    -maxN <#> (maximum percentage of N's in sequence to consider for motif finding, default: 0.7)
    -maskMotif <motif file1> [motif file 2]... (motifs to mask before motif finding)
    -opt <motif file1> [motif file 2]... (motifs to optimize or change length of)
    -rand (randomize target and background sequences labels)
    -ref <peak file> (use file for target and background - first argument is list of peak ids for targets)
    -oligo (perform analysis of individual oligo enrichment)
    -dumpFasta (Dump fasta files for target and background sequences for use with other programs)
    -preparse (force new background files to be created)
    -preparsedDir <directory> (location to search for preparsed file and/or place new files)
    -keepFiles (keep temporary files)
    -fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)

homer2 specific options::

    -homer2 (use homer2 instead of original homer, default)
    -nlen <#> (length of lower-order oligos to normalize in background, default: -nlen 3)
    -nmax <#> (Max normalization iterations, default: 160)
    -neutral (weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc.)
    -olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)
    -p <#> (Number of processors to use, default: 1)
    -e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)
    -cache <#> (size in MB for statistics cache, default: 500)
    -quickMask (skip full masking after finding motifs, similar to original homer)
    -minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)

Original homer specific options::

    -homer1 (to force the use of the original homer)
    -depth [low|med|high|allnight] (time spent on local optimization default: med)

    ]]></help>
    <expand macro="citation"/>
</tool>