Mercurial > repos > kevyin > homer
diff annotatePeaks.xml @ 28:f0b5827b6051 draft default tip
Uploaded
author | kevyin |
---|---|
date | Thu, 20 Dec 2012 18:28:03 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/annotatePeaks.xml Thu Dec 20 18:28:03 2012 -0500 @@ -0,0 +1,164 @@ +<tool id="homer_annotatePeaks" name="homer_annotatePeaks" version="0.0.5"> + <requirements> + <requirement type="package" version="4.1">homer</requirement> + </requirements> + <description></description> + <!--<version_command></version_command>--> + <command> + annotatePeaks.pl $input_bed $genome_selector 1> $out_annotated + 2> $out_log || echo "Error running annotatePeaks." >&2 + </command> + <inputs> + <param format="tabular,bed" name="input_bed" type="data" label="Homer peaks OR BED format"/> + <param name="genome_selector" type="select" label="Genome version"> + <option value="hg19" selected="true">hg19</option> + </param> + <param type="text" name="options" label="Extra options" value="" help="See link below for more options"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + <remove value="/"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + </inputs> + <outputs> + <!--<data format="html" name="html_outfile" label="index" />--> + <!--<data format="html" hidden="True" name="html_outfile" label="index.html" />--> + <data format="csv" name="out_annotated" label="${tool.name} on #echo os.path.splitext(str($input_bed.name))[0]#_genome_${genome_selector}" /> + <data format="txt" name="out_log" label="${tool.name} on #echo os.path.splitext(str($input_bed.name))[0]#_genome_${genome_selector}.log" /> + </outputs> + <tests> + <test> + <!--<param name="input_file" value="extract_genomic_dna.fa" />--> + <!--<output name="html_file" file="sample_output.html" ftype="html" />--> + </test> + </tests> + + <help> + + .. class:: infomark + + **Homer annoatePeaks** + + More information on accepted formats and options + + http://biowhat.ucsd.edu/homer/ngs/annotation.html + + TIP: use homer_bed2pos and homer_pos2bed to convert between the homer peak positions and the BED format. + +**Parameter list** + +Command line options (not all of them are supported):: + + Usage: annotatePeaks.pl <peak file | tss> <genome version> [additional options...] + + Available Genomes (required argument): (name,org,directory,default promoter set) + -- or -- + Custom: provide the path to genome FASTA files (directory or single file) + + User defined annotation files (default is UCSC refGene annotation): + annotatePeaks.pl accepts GTF (gene transfer formatted) files to annotate positions relative + to custom annotations, such as those from de novo transcript discovery or Gencode. + -gtf <gtf format file> (-gff and -gff3 can work for those files, but GTF is better) + + Peak vs. tss/tts/rna mode (works with custom GTF file): + If the first argument is "tss" (i.e. annotatePeaks.pl tss hg18 ...) then a TSS centric + analysis will be carried out. Tag counts and motifs will be found relative to the TSS. + (no position file needed) ["tts" now works too - e.g. 3' end of gene] + ["rna" specifies gene bodies, will automaticall set "-size given"] + NOTE: The default TSS peak size is 4000 bp, i.e. +/- 2kb (change with -size option) + -list <gene id list> (subset of genes to perform analysis [unigene, gene id, accession, + probe, etc.], default = all promoters) + -cTSS <promoter position file i.e. peak file> (should be centered on TSS) + + Primary Annotation Options: + -mask (Masked repeats, can also add 'r' to end of genome name) + -m <motif file 1> [motif file 2] ... (list of motifs to find in peaks) + -mscore (reports the highest log-odds score within the peak) + -nmotifs (reports the number of motifs per peak) + -mdist (reports distance to closest motif) + -mfasta <filename> (reports sites in a fasta file - for building new motifs) + -fm <motif file 1> [motif file 2] (list of motifs to filter from above) + -rmrevopp <#> (only count sites found within <#> on both strands once, i.e. palindromic) + -matrix <prefix> (outputs a motif co-occurrence files: + prefix.count.matrix.txt - number of peaks with motif co-occurrence + prefix.ratio.matrix.txt - ratio of observed vs. expected co-occurrence + prefix.logPvalue.matrix.txt - co-occurrence enrichment + prefix.stats.txt - table of pair-wise motif co-occurrence statistics + additional options: + -matrixMinDist <#> (minimum distance between motif pairs - to avoid overlap) + -matrixMaxDist <#> (maximum distance between motif pairs) + -mbed <filename> (Output motif positions to a BED file to load at UCSC (or -mpeak)) + -mlogic <filename> (will output stats on common motif orientations) + -d <tag directory 1> [tag directory 2] ... (list of experiment directories to show + tag counts for) NOTE: -dfile <file> where file is a list of directories in first column + -bedGraph <bedGraph file 1> [bedGraph file 2] ... (read coverage counts from bedGraph files) + -wig <wiggle file 1> [wiggle file 2] ... (read coverage counts from wiggle files) + -p <peak file> [peak file 2] ... (to find nearest peaks) + -pdist to report only distance (-pdist2 gives directional distance) + -pcount to report number of peaks within region + -vcf <VCF file> (annotate peaks with genetic variation infomation, one col per individual) + -editDistance (Computes the # bp changes relative to reference) + -individuals <name1> [name2] ... (restrict analysis to these individuals) + -gene <data file> ... (Adds additional data to result based on the closest gene. + This is useful for adding gene expression data. The file must have a header, + and the first column must be a GeneID, Accession number, etc. If the peak + cannot be mapped to data in the file then the entry will be left empty. + -go <output directory> (perform GO analysis using genes near peaks) + -genomeOntology <output directory> (perform genomeOntology analysis on peaks) + -gsize <#> (Genome size for genomeOntology analysis, default: 2e9) + + Annotation vs. Histogram mode: + -hist <bin size in bp> (i.e 1, 2, 5, 10, 20, 50, 100 etc.) + The -hist option can be used to generate histograms of position dependent features relative + to the center of peaks. This is primarily meant to be used with -d and -m options to map + distribution of motifs and ChIP-Seq tags. For ChIP-Seq peaks for a Transcription factor + you might want to use the -center option (below) to center peaks on the known motif + ** If using "-size given", histogram will be scaled to each region (i.e. 0-100%), with + the -hist parameter being the number of bins to divide each region into. + Histogram Mode specific Options: + -nuc (calculated mononucleotide frequencies at each position, + Will report by default if extracting sequence for other purposes like motifs) + -di (calculated dinucleotide frequencies at each position) + -histNorm <#> (normalize the total tag count for each region to 1, where <#> is the + minimum tag total per region - use to avoid tag spikes from low coverage + -ghist (outputs profiles for each gene, for peak shape clustering) + -rm <#> (remove occurrences of same motif that occur within # bp) + + Peak Centering: (other options are ignored) + -center <motif file> (This will re-center peaks on the specified motif, or remove peak + if there is no motif in the peak. ONLY recentering will be performed, and all other + options will be ignored. This will output a new peak file that can then be reanalyzed + to reveal fine-grain structure in peaks (It is advised to use -size < 200) with this + to keep peaks from moving too far (-mirror flips the position) + -multi (returns genomic positions of all sites instead of just the closest to center) + + Advanced Options: + -len <#> / -fragLength <#> (Fragment length, default=auto, might want to set to 0 for RNA) + -size <#> (Peak size[from center of peak], default=inferred from peak file) + -size #,# (i.e. -size -10,50 count tags from -10 bp to +50 bp from center) + -size "given" (count tags etc. using the actual regions - for variable length regions) + -log (output tag counts as log2(x+1+rand) values - for scatter plots) + -sqrt (output tag counts as sqrt(x+rand) values - for scatter plots) + -strand <+|-|both> (Count tags on specific strands relative to peak, default: both) + -pc <#> (maximum number of tags to count per bp, default=0 [no maximum]) + -cons (Retrieve conservation information for peaks/sites) + -CpG (Calculate CpG/GC content) + -ratio (process tag values as ratios - i.e. chip-seq, or mCpG/CpG) + -nfr (report nuclesome free region scores instead of tag counts, also -nfrSize <#>) + -norevopp (do not search for motifs on the opposite strand [works with -center too]) + -noadj (do not adjust the tag counts based on total tags sequenced) + -norm <#> (normalize tags to this tag count, default=1e7, 0=average tag count in all directories) + -pdist (only report distance to nearest peak using -p, not peak name) + -map <mapping file> (mapping between peak IDs and promoter IDs, overrides closest assignment) + -noann, -nogene (skip genome annotation step, skip TSS annotation) + -homer1/-homer2 (by default, the new version of homer [-homer2] is used for finding motifs) + + + </help> +</tool> +