comparison hicFindTADs.xml @ 8:b05f292d220c draft

planemo upload for repository https://github.com/maxplanck-ie/HiCExplorer/tree/master/galaxy/wrapper/ commit eec0a4d5a7c5ba4ec0fbd2ead8280c3d143bb9d8
author iuc
date Fri, 27 Apr 2018 03:39:27 -0400
parents db2cc9e1ff76
children 6b7987d22eab
comparison
equal deleted inserted replaced
7:43332e51a712 8:b05f292d220c
1 <tool id="hicexplorer_hicfindtads" name="@BINARY@" version="@WRAPPER_VERSION@.0"> 1 <tool id="hicexplorer_hicfindtads" name="@BINARY@" version="@WRAPPER_VERSION@.0">
2 <description>find minimum cuts that correspond to boundaries</description> 2 <description>identify TAD boundaries by computing the degree of separation of each Hi-C matrix bin</description>
3 <macros> 3 <macros>
4 <token name="@BINARY@">hicFindTADs</token> 4 <token name="@BINARY@">hicFindTADs</token>
5 <import>macros.xml</import> 5 <import>macros.xml</import>
6 </macros> 6 </macros>
7 <expand macro="requirements" /> 7 <expand macro="requirements" />
8 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 hicFindTADs 9 hicFindTADs
10 --matrix '$matrix_h5_cooler' 10 --matrix '$matrix_h5_cooler'
11 11
12 --delta $delta 12 --delta $delta
13 13
14 #if $minBoundaryDistance: 14 #if $minBoundaryDistance:
15 --minBoundaryDistance $minBoundaryDistance 15 --minBoundaryDistance $minBoundaryDistance
16 #end if 16 #end if
22 --threshold $multiple_comparison_conditional.threshold 22 --threshold $multiple_comparison_conditional.threshold
23 #elif $multiple_comparison_conditional.multiple_comparison_selector == 'bonferroni': 23 #elif $multiple_comparison_conditional.multiple_comparison_selector == 'bonferroni':
24 --correctForMultipleTesting bonferroni 24 --correctForMultipleTesting bonferroni
25 --threshold $multiple_comparison_conditional.threshold 25 --threshold $multiple_comparison_conditional.threshold
26 #else: 26 #else:
27 --multipleComparisons None 27 --multipleComparisons None
28 #end if 28 #end if
29 29
30 --numberOfProcessors @THREADS@ 30 --numberOfProcessors @THREADS@
31 --outPrefix galaxy_tad_prefix 31 --outPrefix galaxy_tad_prefix
32 ]]></command> 32 ]]></command>
68 values: 0.001, 0.01, 0.03, 0.05, 0.1. Other single or multiple values 68 values: 0.001, 0.01, 0.03, 0.05, 0.1. Other single or multiple values
69 can be given."/> 69 can be given."/>
70 70
71 <param argument="--minBoundaryDistance" type="integer" value="" optional="True" 71 <param argument="--minBoundaryDistance" type="integer" value="" optional="True"
72 label="Minimum distance between boundaries (in bp)." 72 label="Minimum distance between boundaries (in bp)."
73 help="This parameter can be used to reduce spurious boundaries caused by noise. "/> 73 help="This parameter can be used to reduce spurious boundaries caused by noise. "/>
74 74
75 </inputs> 75 </inputs>
76 <outputs> 76 <outputs>
77 77
78 <data name="boundaries" from_work_dir="galaxy_tad_prefix_boundaries.bed" format="bed" 78 <data name="boundaries" from_work_dir="galaxy_tad_prefix_boundaries.bed" format="bed"
79 label="${tool.name} on ${on_string}: Boundary positions" /> 79 label="${tool.name} on ${on_string}: Boundary positions" />
80 80
81 <data name="score" from_work_dir="galaxy_tad_prefix_score.bedgraph" format="bedgraph" 81 <data name="score" from_work_dir="galaxy_tad_prefix_score.bedgraph" format="bedgraph"
82 label="${tool.name} on ${on_string}: Matrix with multi-scale TAD scores" /> 82 label="${tool.name} on ${on_string}: Matrix with multi-scale TAD scores" />
83 <data name="domains" from_work_dir="galaxy_tad_prefix_domains.bed" format="bed" 83 <data name="domains" from_work_dir="galaxy_tad_prefix_domains.bed" format="bed"
84 label="${tool.name} on ${on_string}: TAD domains" /> 84 label="${tool.name} on ${on_string}: TAD domains" />
85 <data name="boundaries_bin" from_work_dir="galaxy_tad_prefix_boundaries.gff" 85 <data name="boundaries_bin" from_work_dir="galaxy_tad_prefix_boundaries.gff"
86 format="gff" label="${tool.name} on ${on_string}: Boundary information plus score" /> 86 format="gff" label="${tool.name} on ${on_string}: Boundary information plus score" />
87 87
88 <data name="tad_score" from_work_dir="galaxy_tad_prefix_tad_score.bm" 88 <data name="tad_score" from_work_dir="galaxy_tad_prefix_tad_score.bm"
89 format="bedgraph" label="${tool.name} on ${on_string}: TAD information in bm file" /> 89 format="bedgraph" label="${tool.name} on ${on_string}: TAD information in bm file" />
90 90
91 <data name="matrix_output" from_work_dir="galaxy_tad_prefix_zscore_matrix.h5" 91 <data name="matrix_output" from_work_dir="galaxy_tad_prefix_zscore_matrix.h5"
92 format="h5" label="${tool.name} on ${on_string}: Z-score matrix in h5" /> 92 format="h5" label="${tool.name} on ${on_string}: Z-score matrix in h5" />
106 <output name="boundaries" file="find_TADs/multiFDR_boundaries.bed" ftype="bed" compare="sim_size" delta="35000" /> 106 <output name="boundaries" file="find_TADs/multiFDR_boundaries.bed" ftype="bed" compare="sim_size" delta="35000" />
107 <output name="boundaries_bin" file="find_TADs/multiFDR_boundaries.gff" ftype="gff" compare="sim_size" delta="35000" /> 107 <output name="boundaries_bin" file="find_TADs/multiFDR_boundaries.gff" ftype="gff" compare="sim_size" delta="35000" />
108 <output name="domains" file="find_TADs/multiFDR_domains.bed" ftype="bed" compare="sim_size" delta="35000" /> 108 <output name="domains" file="find_TADs/multiFDR_domains.bed" ftype="bed" compare="sim_size" delta="35000" />
109 <output name="score" file="find_TADs/multiFDR_score.bedgraph" ftype="bedgraph" compare="sim_size" delta="35000" /> 109 <output name="score" file="find_TADs/multiFDR_score.bedgraph" ftype="bedgraph" compare="sim_size" delta="35000" />
110 <output name="tad_score" file="find_TADs/multiFDR_tad_score.bm" ftype="bedgraph" compare="sim_size" delta="35000" /> 110 <output name="tad_score" file="find_TADs/multiFDR_tad_score.bm" ftype="bedgraph" compare="sim_size" delta="35000" />
111 <output name="matrix_output" file="find_TADs/multiFDR_zscore_matrix.h5" ftype="h5" compare="sim_size" delta="50000" />
112 </test> 111 </test>
113 </tests> 112 </tests>
114 <help><![CDATA[ 113 <help><![CDATA[
115 Calculate TADs 114 Calculate Topologically Associating Domains
116 ============== 115 ===========================================
117 116
118 Topological associated domains (TADs) are regions on the DNA which tend to interact within the region a lot, but not outside their boundaries. More information_. 117 Topological domains (TADs) are large, mainly self-interacting domains. Chromatin interactions occur with higher frequency within a TAD than between TADs. More information_.
119 118
120 Calculation 119 _________________
121 ------------
122 ``hicFindTADs`` computes the TAD regions in two steps: In a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:
123 120
124 “The absolute value of z represents the distance between the raw score and the population mean in 121 Usage
125 units of the standard deviation. z is negative when the raw score is below the mean, positive when above.” 122 -----
126 [Source_]. 123
124 This tool must be used on unmerged matrices (restriction enzyme resolution) produced by ``hicBuildMatrix`` and corrected by ``hicCorrectMatrix``.
125
126 _________________
127
128 Computation details
129 -------------------
130
131 **hicFindTADs** computes the TAD regions in two steps: in a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:
132
133 “The absolute value of z represents the distance between the raw score and the population mean in
134 units of the standard deviation. z is negative when the raw score is below the mean, positive when above.”
135 [Source_].
127 136
128 .. image:: $PATH_TO_IMAGES/z-score.svg 137 .. image:: $PATH_TO_IMAGES/z-score.svg
129 :width: 100 138 :width: 100
130 139
131 `Source of image <https://wikimedia.org/api/rest_v1/media/math/render/svg/5ceed701c4042bb34618535c9a902ca1a937a351>`_ 140 `Source of image <https://wikimedia.org/api/rest_v1/media/math/render/svg/5ceed701c4042bb34618535c9a902ca1a937a351>`_
132 141
133 In our case the distribution describes the counts per bin of a genomic distance. In a second step the local minima of the TAD-separation score are evaluated with respect to the surrounding bins to assign a p-value. Two multiple testing corrections can be applied to filter the results: `Bonferroni <https://en.wikipedia.org/wiki/Bonferroni_correction>`_ or the `false discovery rate <https://en.wikipedia.org/wiki/False_discovery_rate>`_. 142 In our case the distribution describes the counts per bin of a genomic distance. In a second step the local minima of the TAD-separation score are evaluated with respect to the surrounding bins to assign a p-value. Two multiple testing corrections can be applied to filter the results: `Bonferroni <https://en.wikipedia.org/wiki/Bonferroni_correction>`_ or the `false discovery rate <https://en.wikipedia.org/wiki/False_discovery_rate>`_.
134 143
135 144 _________________
136 Input
137 -----
138
139 Parameters
140 __________
141 - contact matrix to compute the TADs on
142 - minimum window length
143 - maximum window length
144 - step size
145 - multiple testing correction
146 - minimum threshold
147 - minimum distance
148
149
150 hicFindTADs tries to identify sensible parameters but those can be changed to identify a more stringent set of boundaries.
151 145
152 Output 146 Output
153 ------ 147 ------
154 148
155 - Boundary positions as a bed file 149 **hicFindTADs** produces multiple outputs:
156 - Matrix with multi-scale TAD scores as a bedgraph
157 - TAD domains as a bed file
158 - Boundary information plus score as gff
159 - TAD information in bm file
160 - Z-score matrix in h5
161 150
162 The calculated TAD regions can be plotted with ``hicPlotTADs``. 151 - TAD boundaries positions as a BED file and TAD separation score.
152 - TAD boundaries positions with delta, p-value and TAD separation score as GFF.
153 - TAD domains as a BED file.
154 - TAD separation score as bigwig (bw), bedgraph and numpy array (npz) format. These files can be used to plot the so-called TAD insulation score or TAD separation score along the genome or at specific regions. This score is much more reliable across samples than the number of TADs or the TADs width that can vary depending on the sequencing depth because of the lack of information at certain bins, and depending on the parameters used with this tool.
155 - Matrix with multi-scale TAD scores as a bed-matrix (bm) file that can be plotted inside ``hicPlotTADs`` to nicely display TAD insulation score alongside Hi-C heatmap and other datasets.
156 - Z-score matrix in h5 format that is useful to quickly test the --thresholdComparisons, --delta and --correctForMultipleTesting parameters by using the --TAD_sep_score_prefix option pointing to this zscore_matrix.h5 file (will be added in a future update).
163 157
158 _________________
164 159
165 .. image:: $PATH_TO_IMAGES/master_TADs_plot.png 160 Usage hints
166 :width: 80 % 161 -----------
167 162
163 It is mandatory to test multiple parameters of TAD calling with **hicFindTADs** before making conclusions about the number of TADs in a given sample or before comparing TAD calling between multiple conditions. In order to compare numerous TAD calling parameters at once, it is recommended to use ``hicPlotTADs``. Below you can find a plot where multiple TAD calling parameters are displayed for *Drosophila melanogaster* embryos:
168 164
165 .. image:: $PATH_TO_IMAGES/hicFindTADs_TAD_calling_comparison.png
166 :width: 65 %
167
168 We can see that the fourth set of **hicFindTADs** parameters with a threshold of 0.001 gives the best results in terms of TAD calling compared to the corrected Hi-C counts distribution and compared to the enrichment of H3K36me3, which is known to be enriched at TAD boundaries in *Drosophila melanogaster*.
169
170 _________________
169 171
170 For more information about HiCExplorer please consider our documentation on readthedocs.io_ 172 For more information about HiCExplorer please consider our documentation on readthedocs.io_
171 173
172 .. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html 174 .. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
173 .. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score 175 .. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score
174 .. _information: https://en.wikipedia.org/wiki/Topologically_associating_domain 176 .. _information: https://en.wikipedia.org/wiki/Topologically_associating_domain
175 ]]></help> 177 ]]></help>
176 <expand macro="citations" /> 178 <expand macro="citations" />
177 </tool> 179 </tool>