diff hicFindTADs.xml @ 8:b05f292d220c draft

planemo upload for repository https://github.com/maxplanck-ie/HiCExplorer/tree/master/galaxy/wrapper/ commit eec0a4d5a7c5ba4ec0fbd2ead8280c3d143bb9d8
author iuc
date Fri, 27 Apr 2018 03:39:27 -0400
parents db2cc9e1ff76
children 6b7987d22eab
line wrap: on
line diff
--- a/hicFindTADs.xml	Wed Mar 07 03:46:54 2018 -0500
+++ b/hicFindTADs.xml	Fri Apr 27 03:39:27 2018 -0400
@@ -1,5 +1,5 @@
 <tool id="hicexplorer_hicfindtads" name="@BINARY@" version="@WRAPPER_VERSION@.0">
-    <description>find minimum cuts that correspond to boundaries</description>
+    <description>identify TAD boundaries by computing the degree of separation of each Hi-C matrix bin</description>
     <macros>
         <token name="@BINARY@">hicFindTADs</token>
         <import>macros.xml</import>
@@ -8,7 +8,7 @@
     <command detect_errors="exit_code"><![CDATA[
         hicFindTADs
                 --matrix '$matrix_h5_cooler'
-               
+
                 --delta $delta
 
                 #if $minBoundaryDistance:
@@ -24,7 +24,7 @@
                     --correctForMultipleTesting bonferroni
                     --threshold $multiple_comparison_conditional.threshold
                 #else:
-                    --multipleComparisons None             
+                    --multipleComparisons None
                 #end if
 
                 --numberOfProcessors @THREADS@
@@ -70,21 +70,21 @@
 
         <param argument="--minBoundaryDistance" type="integer" value="" optional="True"
                 label="Minimum distance between boundaries (in bp)."
-                help="This parameter can be used to reduce spurious boundaries caused by noise. "/>   
+                help="This parameter can be used to reduce spurious boundaries caused by noise. "/>
 
     </inputs>
     <outputs>
-    
+
         <data name="boundaries" from_work_dir="galaxy_tad_prefix_boundaries.bed" format="bed"
             label="${tool.name} on ${on_string}: Boundary positions" />
-        
+
         <data name="score" from_work_dir="galaxy_tad_prefix_score.bedgraph" format="bedgraph"
             label="${tool.name} on ${on_string}: Matrix with multi-scale TAD scores" />
         <data name="domains" from_work_dir="galaxy_tad_prefix_domains.bed" format="bed"
             label="${tool.name} on ${on_string}: TAD domains" />
         <data name="boundaries_bin" from_work_dir="galaxy_tad_prefix_boundaries.gff"
             format="gff" label="${tool.name} on ${on_string}: Boundary information plus score" />
-        
+
         <data name="tad_score" from_work_dir="galaxy_tad_prefix_tad_score.bm"
             format="bedgraph" label="${tool.name} on ${on_string}: TAD information in bm file" />
 
@@ -108,69 +108,71 @@
             <output name="domains" file="find_TADs/multiFDR_domains.bed" ftype="bed" compare="sim_size" delta="35000" />
             <output name="score" file="find_TADs/multiFDR_score.bedgraph" ftype="bedgraph" compare="sim_size" delta="35000" />
             <output name="tad_score" file="find_TADs/multiFDR_tad_score.bm" ftype="bedgraph" compare="sim_size" delta="35000" />
-            <output name="matrix_output" file="find_TADs/multiFDR_zscore_matrix.h5" ftype="h5" compare="sim_size" delta="50000" />
         </test>
     </tests>
     <help><![CDATA[
-Calculate TADs
-==============
+Calculate Topologically Associating Domains
+===========================================
+
+Topological domains (TADs) are large, mainly self-interacting domains. Chromatin interactions occur with higher frequency within a TAD than between TADs. More information_.
 
-Topological associated domains (TADs) are regions on the DNA which tend to interact within the region a lot, but not outside their boundaries. More information_.
+_________________
+
+Usage
+-----
 
-Calculation
-------------
-``hicFindTADs`` computes the TAD regions in two steps: In a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:
+This tool must be used on unmerged matrices (restriction enzyme resolution) produced by ``hicBuildMatrix`` and corrected by ``hicCorrectMatrix``.
+
+_________________
 
-“The absolute value of z represents the distance between the raw score and the population mean in 
-units of the standard deviation. z is negative when the raw score is below the mean, positive when above.” 
-[Source_].
+Computation details
+-------------------
+
+**hicFindTADs** computes the TAD regions in two steps: in a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:
+
+  “The absolute value of z represents the distance between the raw score and the population mean in
+  units of the standard deviation. z is negative when the raw score is below the mean, positive when above.”
+  [Source_].
 
 .. image:: $PATH_TO_IMAGES/z-score.svg
    :width: 100
-   
+
 `Source of image <https://wikimedia.org/api/rest_v1/media/math/render/svg/5ceed701c4042bb34618535c9a902ca1a937a351>`_
 
 In our case the distribution describes the counts per bin of a genomic distance. In a second step the local minima of the TAD-separation score is evaluated with respect to the surrounding bins to assign a p-value. Two multiple testing corrections can be applied to filter the results: `Bonferroni <https://en.wikipedia.org/wiki/Bonferroni_correction>`_ or the `false discovery rate <https://en.wikipedia.org/wiki/False_discovery_rate>`_.
 
-
-Input
------
-
-Parameters
-__________
-- contact matrix to compute the TADs on
-- minimum window length
-- maximum window length
-- step size
-- multiple testing correction
-- minimum threshold
-- minimum distance
-
-
-hicFindTADs tries to identify sensible parameters but those can be change to identify more stringent set of boundaries.
+_________________
 
 Output
 ------
 
-- Boundary positions as a bed file
-- Matrix with multi-scale TAD scores as a bedgraph 
-- TAD domains as a bed file
-- Boundary information plus score as gff
-- TAD information in bm file
-- Z-score matrix in h5
+**hicFindTADs** produces multiple outputs:
+
+- TAD boundaries positions as a BED file and TAD separation score.
+- TAD boundaries positions with delta, p-value and TAD separation score as GFF.
+- TAD domains as a BED file.
+- TAD separation score as bigwig (bw), bedgraph and numpy array (npz) format. These files can be used to plot the so-called TAD insulation score or TAD separation score along the genome or at specific regions. This score is much more reliable across samples than the number of TADs or the TAD width that can vary depending on the sequencing depth because of the lack of information at certain bins, and depending on the parameters used with this tool.
+- Matrix with multi-scale TAD scores as a bed-matrix (bm) file that can be plotted inside ``hicPlotTADs`` to nicely display TAD insulation score alongside Hi-C heatmap and other datasets.
+- Z-score matrix in h5 format that is useful to quickly test the --thresholdComparisons, --delta and --correctForMultipleTesting parameters by using the --TAD_sep_score_prefix option pointing to this zscore_matrix.h5 file (will be added in a future update).
+
+_________________
 
-The calulated TAD regions can be plotted with ``hicPlotTADs``.
+Usage hints
+-----------
 
+It is mandatory to test multiple parameters of TAD calling with **hicFindTADs** before making conclusions about the number of TADs in a given sample or before comparing TAD calling between multiple conditions. In order to compare numerous TAD calling parameters at once, it is recommended to use ``hicPlotTADs``. Below you can find a plot where multiple TAD calling parameters are displayed for *Drosophila melanogaster* embryos:
 
-.. image:: $PATH_TO_IMAGES/master_TADs_plot.png
-   :width: 80 %
+.. image:: $PATH_TO_IMAGES/hicFindTADs_TAD_calling_comparison.png
+   :width: 65 %
 
+We can see that the fourth set of **hicFindTADs** parameters with a threshold of 0.001 gives the best results in terms of TAD calling compared to the corrected Hi-C counts distribution and compared to the enrichment of H3K36me3, which is known to be enriched at TAD boundaries in *Drosophila melanogaster*.
 
+_________________
 
 For more information about HiCExplorer please consider our documentation on readthedocs.io_
 
 .. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
-.. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score 
+.. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score
 .. _information: https://en.wikipedia.org/wiki/Topologically_associating_domain_
 ]]></help>
     <expand macro="citations" />