comparison genetrack.xml @ 0:25cd59a002d9 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genetrack commit e96df94dba60050fa28aaf55b5bb095717a5f260
author iuc
date Tue, 22 Dec 2015 17:03:27 -0500
parents
children df7ac50ade5d
comparison
equal deleted inserted replaced
-1:000000000000 0:25cd59a002d9
1 <?xml version="1.0"?>
2 <tool id="genetrack" name="GeneTrack" version="@WRAPPER_VERSION@.0">
3 <description>peak predictor</description>
4 <macros>
5 <import>genetrack_macros.xml</import>
6 </macros>
7 <expand macro="requirements" />
<!-- Command template: runs genetrack.py from the tool directory with the selected input format, one input option per selected dataset (dataset path followed by its hid), and the sigma, exclusion, up_width, down_width and filter values from the form. The script path and per-dataset values are single-quoted so that spaces or shell metacharacters in paths cannot split or alter the command line. -->
8 <command>
9 python '$__tool_directory__/genetrack.py'
10 --input_format $input_format_cond.input_format
11 #for $i in $input_format_cond.input:
12 --input '${i}' '${i.hid}'
13 #end for
14 --sigma $sigma
15 --exclusion $exclusion
16 --up_width $up_width
17 --down_width $down_width
18 --filter $filter
19 </command>
<!-- Tool form: an input-format selector with a matching multi-dataset input, followed by five integer parameters that the command template passes straight through to genetrack.py. -->
20 <inputs>
<!-- Both branches declare a parameter named "input" (differing only in accepted format), so the command template can read $input_format_cond.input regardless of which format is selected. -->
21 <conditional name="input_format_cond">
22 <param name="input_format" type="select" label="Format of files for conversion">
23 <option value="scidx" selected="True">ScIdx</option>
24 <option value="gff">Gff</option>
25 </param>
26 <when value="scidx">
27 <param name="input" type="data" format="scidx" multiple="True" label="Predict peaks on" />
28 </when>
29 <when value="gff">
30 <param name="input" type="data" format="gff" multiple="True" label="Predict peaks on" />
31 </when>
32 </conditional>
<!-- sigma and exclusion must be at least 1; up_width, down_width and filter may be 0. -->
33 <param name="sigma" type="integer" value="5" min="1" label="Sigma to use when smoothing reads" help="Higher values increase computation but produce more smoothing." />
34 <param name="exclusion" type="integer" value="20" min="1" label="Peak exclusion zone" help="Exclusion zone around each peak that prevents others from being called." />
35 <param name="up_width" type="integer" value="10" min="0" label="Exclusion zone of upstream called peaks" />
36 <param name="down_width" type="integer" value="10" min="0" label="Exclusion zone of downstream called peaks" />
37 <param name="filter" type="integer" value="1" min="0" label="Absolute read filter" help="Removes peaks with lower peak height." />
38 </inputs>
<!-- Results are gathered into a single list collection. Every file found in the job's "output" directory becomes one gff element, with the whole file name used as the element designation; the discovered datasets are marked not visible. -->
39 <outputs>
40 <collection name="genetrack_output" type="list" label="Genetrack results on ${on_string}">
41 <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
42 </collection>
43 </outputs>
<!-- Three functional tests sharing the same parameter values (sigma 5, exclusion 20, up_width 10, down_width 10, filter 3) and differing only in the input: a gff file, a scidx file, and an unsorted gff file. Each expects one collection element named s5e20u10d10F3_on_data_1. NOTE(review): that name appears to encode the parameter values and the input hid; confirm against genetrack.py. -->
44 <tests>
<!-- gff input -->
45 <test>
46 <param name="input" value="genetrack_input2.gff" ftype="gff" />
47 <param name="input_format" value="gff" />
48 <param name="sigma" value="5" />
49 <param name="exclusion" value="20" />
50 <param name="up_width" value="10" />
51 <param name="down_width" value="10" />
52 <param name="filter" value="3" />
53 <output_collection name="genetrack_output" type="list">
54 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output2.gff" ftype="gff" />
55 </output_collection>
56 </test>
<!-- scidx input -->
57 <test>
58 <param name="input" value="genetrack_input3.scidx" ftype="scidx" />
59 <param name="input_format" value="scidx" />
60 <param name="sigma" value="5" />
61 <param name="exclusion" value="20" />
62 <param name="up_width" value="10" />
63 <param name="down_width" value="10" />
64 <param name="filter" value="3" />
65 <output_collection name="genetrack_output" type="list">
66 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output3.gff" ftype="gff" />
67 </output_collection>
68 </test>
<!-- unsorted gff input -->
69 <test>
70 <param name="input" value="genetrack_input_unsorted4.gff" ftype="gff" />
71 <param name="input_format" value="gff" />
72 <param name="sigma" value="5" />
73 <param name="exclusion" value="20" />
74 <param name="up_width" value="10" />
75 <param name="down_width" value="10" />
76 <param name="filter" value="3" />
77 <output_collection name="genetrack_output" type="list">
78 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output4.gff" ftype="gff" />
79 </output_collection>
80 </test>
81 </tests>
82 <help>
83 **What it does**
84
85 GeneTrack separately identifies peaks on the forward “+” (W) and reverse “-” (C) strands. The way that GeneTrack
86 works is to replace each tag with a probabilistic distribution of occurrences for that tag at and around its mapped
87 genomic coordinate. The distance decay of the probabilistic distribution is set by adjusting the value of the
88 tool's **Sigma to use when smoothing reads** parameter. GeneTrack then sums the distribution over all mapped
89 tags. This results in a smooth continuous trace that can be globally broadened or tightened by adjusting the
90 sigma value. GeneTrack starts with the highest smoothed peak first, treating each strand separately if indicated
91 by the data, then sets up an exclusion zone (centered over the peak) defined by the value of the **Peak exclusion
92 zone** parameter (see figure). The exclusion zone prevents any secondary peaks from being called on the same strand
93 within that exclusion zone. In rare cases, it may be desirable to set different exclusion zones upstream (more 5’)
94 versus downstream (more 3’) of the peak.
95
96 .. image:: $PATH_TO_IMAGES/genetrack.png
97
98 GeneTrack continues through the data in order of peak height, until no other peaks are found, and in principle will
99 call a peak at a single isolated tag, if no filter is set using the tool's **Absolute read filter** parameter. A
100 filter value of 1 means that it will stop calling peaks when the tag count in the peak hits 1 (so single tag peaks
101 will be excluded in this case). GeneTrack outputs **chrom** (chromosome number), **strand** (+/W or -/C strand),
102 **start** (lower coordinate of exclusion zone), **end** (higher coordinate of exclusion zone), and **value** (peak
103 height). GeneTrack's GFF output reports the start (lower coordinate) and end (higher coordinate) of the exclusion
104 zone.
105
106 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein plus
107 a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
108 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
109
110 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding
111 site size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors
112 mapped by ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically
113 varied between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be
114 desirable if closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that
115 contribute to a binding event to be excluded, because they may not be located sufficiently close to the main peak.
116 If alternative (mutually exclusive) binding is expected for two overlapping sites, and these sites are to be
117 independently recorded, then an empirically determined smaller exclusion zone width is set. Thus the value of sigma
118 is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
119
120 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on
121 only a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might
122 be improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized
123 action of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
124
125 -----
126
127 **Options**
128
129 * **Sigma to use when smoothing reads** - Smooths clusters of tags via a Gaussian distribution.
130 * **Peak exclusion zone** - Exclusion zone around each peak, eliminating all other peaks on the same strand that are within a ± bp distance of the peak.
131 * **Exclusion zone of upstream called peaks** - Defines the exclusion zone centered over peaks upstream of a peak.
132 * **Exclusion zone of downstream called peaks** - Defines the exclusion zone centered over peaks downstream of a peak.
133 * **Absolute read filter** - Restricts output to only peaks with larger peak height.
134
135 -----
136
137 **Output gff Columns**
138
139 1. Chromosome
140 2. Script
141 3. Placeholder (no meaning)
142 4. Start of peak exclusion zone (-e 20)
143 5. End of peak exclusion zone
144 6. Tag sum (not peak height or area under curve, which LionDB provides)
145 7. Strand
146 8. Placeholder (no meaning)
147 9. Attributes (standard deviation of reads located within exclusion zone) = fuzziness of peak
148
149 -----
150
151 **Considerations**
152
153 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein
154 plus a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
155 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
156
157 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding site
158 size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors mapped by
159 ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically varied
160 between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be desirable if
161 closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that contribute to a binding
162 event to be excluded, because they may not be located sufficiently close to the main peak. If alternative (mutually
163 exclusive) binding is expected for two overlapping sites, and these sites are to be independently recorded, then an
164 empirically determined smaller exclusion zone width is set. Thus, the value of sigma is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
165
166 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on only
167 a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might be
168 improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized action
169 of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
170
171 </help>
172 <expand macro="citations" />
173 </tool>