diff annotatePeaks.xml @ 28:f0b5827b6051 draft default tip

Uploaded
author kevyin
date Thu, 20 Dec 2012 18:28:03 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotatePeaks.xml	Thu Dec 20 18:28:03 2012 -0500
@@ -0,0 +1,164 @@
+<tool id="homer_annotatePeaks" name="homer_annotatePeaks" version="0.0.5">
+    <requirements>
+        <requirement type="package" version="4.1">homer</requirement>
+    </requirements>
+    <description></description>
+    <!--<version_command></version_command>-->
+    <command>
+        annotatePeaks.pl $input_bed $genome_selector 1&gt; $out_annotated
+        2&gt; $out_log || echo "Error running annotatePeaks." >&amp;2
+    </command>
+    <inputs>
+        <param format="tabular,bed" name="input_bed" type="data" label="Homer peaks OR BED format"/>
+        <param name="genome_selector" type="select" label="Genome version">
+            <option value="hg19" selected="true">hg19</option>
+        </param>
+        <param type="text" name="options" label="Extra options" value="" help="See link below for more options">
+          <sanitizer>
+            <valid initial="string.printable">
+             <remove value="&apos;"/>
+             <remove value="/"/>
+            </valid>
+            <mapping initial="none">
+              <add source="&apos;" target="__sq__"/>
+            </mapping>
+          </sanitizer>
+        </param>
+    </inputs>
+    <outputs>
+        <!--<data format="html" name="html_outfile" label="index" />-->
+        <!--<data format="html" hidden="True" name="html_outfile" label="index.html" />-->
+        <data format="csv" name="out_annotated" label="${tool.name} on #echo os.path.splitext(str($input_bed.name))[0]#_genome_${genome_selector}" />
+        <data format="txt" name="out_log" label="${tool.name} on #echo os.path.splitext(str($input_bed.name))[0]#_genome_${genome_selector}.log" />
+    </outputs>
+    <tests>
+        <test>
+            <!--<param name="input_file" value="extract_genomic_dna.fa" />-->
+            <!--<output name="html_file" file="sample_output.html" ftype="html" />-->
+        </test>
+    </tests>
+
+    <help>
+
+        .. class:: infomark
+
+        **Homer annoatePeaks**
+
+        More information on accepted formats and options
+
+        http://biowhat.ucsd.edu/homer/ngs/annotation.html
+
+        TIP: use homer_bed2pos and homer_pos2bed to convert between the homer peak positions and the BED format.
+
+**Parameter list**
+
+Command line options (not all of them are supported)::
+
+	Usage: annotatePeaks.pl &lt;peak file | tss&gt; &lt;genome version&gt;  [additional options...]
+
+	Available Genomes (required argument): (name,org,directory,default promoter set)
+			-- or --
+		Custom: provide the path to genome FASTA files (directory or single file)
+
+	User defined annotation files (default is UCSC refGene annotation):
+		annotatePeaks.pl accepts GTF (gene transfer formatted) files to annotate positions relative
+		to custom annotations, such as those from de novo transcript discovery or Gencode.
+		-gtf &lt;gtf format file&gt; (-gff and -gff3 can work for those files, but GTF is better)
+
+	Peak vs. tss/tts/rna mode (works with custom GTF file):
+		If the first argument is &quot;tss&quot; (i.e. annotatePeaks.pl tss hg18 ...) then a TSS centric
+		analysis will be carried out.  Tag counts and motifs will be found relative to the TSS.
+		(no position file needed) [&quot;tts&quot; now works too - e.g. 3&apos; end of gene]
+		[&quot;rna&quot; specifies gene bodies, will automaticall set &quot;-size given&quot;]
+		NOTE: The default TSS peak size is 4000 bp, i.e. +/- 2kb (change with -size option)
+		-list &lt;gene id list&gt; (subset of genes to perform analysis [unigene, gene id, accession,
+			 probe, etc.], default = all promoters)
+		-cTSS &lt;promoter position file i.e. peak file&gt; (should be centered on TSS)
+
+	Primary Annotation Options:
+		-mask (Masked repeats, can also add &apos;r&apos; to end of genome name)
+		-m &lt;motif file 1&gt; [motif file 2] ... (list of motifs to find in peaks)
+			-mscore (reports the highest log-odds score within the peak)
+			-nmotifs (reports the number of motifs per peak)
+			-mdist (reports distance to closest motif)
+			-mfasta &lt;filename&gt; (reports sites in a fasta file - for building new motifs)
+			-fm &lt;motif file 1&gt; [motif file 2] (list of motifs to filter from above)
+			-rmrevopp &lt;#&gt; (only count sites found within &lt;#&gt; on both strands once, i.e. palindromic)
+			-matrix &lt;prefix&gt; (outputs a motif co-occurrence files:
+				prefix.count.matrix.txt - number of peaks with motif co-occurrence
+				prefix.ratio.matrix.txt - ratio of observed vs. expected  co-occurrence
+				prefix.logPvalue.matrix.txt - co-occurrence enrichment
+				prefix.stats.txt - table of pair-wise motif co-occurrence statistics
+				additional options:
+				-matrixMinDist &lt;#&gt; (minimum distance between motif pairs - to avoid overlap)
+				-matrixMaxDist &lt;#&gt; (maximum distance between motif pairs)
+			-mbed &lt;filename&gt; (Output motif positions to a BED file to load at UCSC (or -mpeak))
+			-mlogic &lt;filename&gt; (will output stats on common motif orientations)
+		-d &lt;tag directory 1&gt; [tag directory 2] ... (list of experiment directories to show
+			tag counts for) NOTE: -dfile &lt;file&gt; where file is a list of directories in first column
+		-bedGraph &lt;bedGraph file 1&gt; [bedGraph file 2] ... (read coverage counts from bedGraph files)
+		-wig &lt;wiggle file 1&gt; [wiggle file 2] ... (read coverage counts from wiggle files)
+		-p &lt;peak file&gt; [peak file 2] ... (to find nearest peaks)
+			-pdist to report only distance (-pdist2 gives directional distance)
+			-pcount to report number of peaks within region
+		-vcf &lt;VCF file&gt; (annotate peaks with genetic variation infomation, one col per individual)
+			-editDistance (Computes the # bp changes relative to reference)
+			-individuals &lt;name1&gt; [name2] ... (restrict analysis to these individuals)
+		-gene &lt;data file&gt; ... (Adds additional data to result based on the closest gene.
+			This is useful for adding gene expression data.  The file must have a header,
+			and the first column must be a GeneID, Accession number, etc.  If the peak
+			cannot be mapped to data in the file then the entry will be left empty.
+		-go &lt;output directory&gt; (perform GO analysis using genes near peaks)
+		-genomeOntology &lt;output directory&gt; (perform genomeOntology analysis on peaks)
+			-gsize &lt;#&gt; (Genome size for genomeOntology analysis, default: 2e9)
+
+	Annotation vs. Histogram mode:
+		-hist &lt;bin size in bp&gt; (i.e 1, 2, 5, 10, 20, 50, 100 etc.)
+		The -hist option can be used to generate histograms of position dependent features relative
+		to the center of peaks.  This is primarily meant to be used with -d and -m options to map
+		distribution of motifs and ChIP-Seq tags.  For ChIP-Seq peaks for a Transcription factor
+		you might want to use the -center option (below) to center peaks on the known motif
+		** If using &quot;-size given&quot;, histogram will be scaled to each region (i.e. 0-100%), with
+		the -hist parameter being the number of bins to divide each region into.
+			Histogram Mode specific Options:
+			-nuc (calculated mononucleotide frequencies at each position,
+				Will report by default if extracting sequence for other purposes like motifs)
+			-di (calculated dinucleotide frequencies at each position)
+			-histNorm &lt;#&gt; (normalize the total tag count for each region to 1, where &lt;#&gt; is the
+				minimum tag total per region - use to avoid tag spikes from low coverage
+			-ghist (outputs profiles for each gene, for peak shape clustering)
+			-rm &lt;#&gt; (remove occurrences of same motif that occur within # bp)
+
+	Peak Centering: (other options are ignored)
+		-center &lt;motif file&gt; (This will re-center peaks on the specified motif, or remove peak
+			if there is no motif in the peak.  ONLY recentering will be performed, and all other
+			options will be ignored.  This will output a new peak file that can then be reanalyzed
+			to reveal fine-grain structure in peaks (It is advised to use -size &lt; 200) with this
+			to keep peaks from moving too far (-mirror flips the position)
+		-multi (returns genomic positions of all sites instead of just the closest to center)
+
+	Advanced Options:
+		-len &lt;#&gt; / -fragLength &lt;#&gt; (Fragment length, default=auto, might want to set to 0 for RNA)
+		-size &lt;#&gt; (Peak size[from center of peak], default=inferred from peak file)
+			-size #,# (i.e. -size -10,50 count tags from -10 bp to +50 bp from center)
+			-size &quot;given&quot; (count tags etc. using the actual regions - for variable length regions)
+		-log (output tag counts as log2(x+1+rand) values - for scatter plots)
+		-sqrt (output tag counts as sqrt(x+rand) values - for scatter plots)
+		-strand &lt;+|-|both&gt; (Count tags on specific strands relative to peak, default: both)
+		-pc &lt;#&gt; (maximum number of tags to count per bp, default=0 [no maximum])
+		-cons (Retrieve conservation information for peaks/sites)
+		-CpG (Calculate CpG/GC content)
+		-ratio (process tag values as ratios - i.e. chip-seq, or mCpG/CpG)
+		-nfr (report nuclesome free region scores instead of tag counts, also -nfrSize &lt;#&gt;)
+		-norevopp (do not search for motifs on the opposite strand [works with -center too])
+		-noadj (do not adjust the tag counts based on total tags sequenced)
+		-norm &lt;#&gt; (normalize tags to this tag count, default=1e7, 0=average tag count in all directories)
+		-pdist (only report distance to nearest peak using -p, not peak name)
+		-map &lt;mapping file&gt; (mapping between peak IDs and promoter IDs, overrides closest assignment)
+		-noann, -nogene (skip genome annotation step, skip TSS annotation)
+		-homer1/-homer2 (by default, the new version of homer [-homer2] is used for finding motifs)
+
+
+    </help>
+</tool>
+