view hicFindTADs.xml @ 18:d404277cc87a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hicexplorer commit 2a0943e78bdc8ebb13f181399206a9eea37ed78f"
author iuc
date Tue, 16 Mar 2021 14:59:18 +0000
parents 1d9b575fe97d
children 5746d61f0a91
line wrap: on
line source

<tool id="hicexplorer_hicfindtads" name="@BINARY@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>identify TAD boundaries by computing the degree of separation of each Hi-C matrix bin</description>
    <macros>
        <!-- @BINARY@ is substituted into the tool name attribute and the command template of this file. -->
        <token name="@BINARY@">hicFindTADs</token>
        <!-- NOTE(review): the requirements, citations and matrix_h5_cooler_macro macros and the
             @TOOL_VERSION@, @VERSION_SUFFIX@ and @THREADS@ tokens are used here but not defined in
             this file; presumably they come from macros.xml. Confirm against that file. -->
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Link the input matrix to a name that ends in its datatype extension
        ## (presumably the binary picks the file format from the extension; confirm).
        ln -s '$matrix_h5_cooler' 'matrix.$matrix_h5_cooler.ext' &&

        #if $precomputedZscore_conditional.precomputedZscore_selector == 'precomputed':
            ## The link names share the "prefix" value handed to TAD_sep_score_prefix below.
            ln -s '$precomputedZscore_conditional.scoreFile' 'prefix_tad_score.bm' &&
            ln -s '$precomputedZscore_conditional.zscoreMatrix' 'prefix_zscore_matrix.h5' &&
        #end if
        @BINARY@
            --matrix 'matrix.$matrix_h5_cooler.ext'
            ## delta is an optional input: emit the flag only when a value was entered,
            ## otherwise the flag would be left without its argument.
            #if str($delta):
            --delta $delta
            #end if
            #if $minBoundaryDistance:
            --minBoundaryDistance $minBoundaryDistance
            #end if

            #if $precomputedZscore_conditional.precomputedZscore_selector == 'scratch':
                --minDepth $precomputedZscore_conditional.minDepth
                --maxDepth $precomputedZscore_conditional.maxDepth
                --step $precomputedZscore_conditional.step
            #elif $precomputedZscore_conditional.precomputedZscore_selector == 'precomputed':
                --TAD_sep_score_prefix prefix
            #end if
            ## All three selector values are passed through the same option; the threshold
            ## only exists for the two real corrections.
            #if $multiple_comparison_conditional.multiple_comparison_selector == 'fdr':
                --correctForMultipleTesting fdr
                --threshold $multiple_comparison_conditional.threshold
            #elif $multiple_comparison_conditional.multiple_comparison_selector == 'bonferroni':
                --correctForMultipleTesting bonferroni
                --threshold $multiple_comparison_conditional.threshold
            #else:
                --correctForMultipleTesting None
            #end if

            --numberOfProcessors @THREADS@
            --outPrefix galaxy_tad_prefix
            #if $chromosomes:
                ## Quote every name on its own and join with a plain space so that each
                ## chromosome reaches the binary as a separate shell word.
                --chromosomes #echo " ".join([ "'%s'" % $chrom.chromosome for $chrom in $chromosomes ])#
            #end if
        ## When computed from scratch, move the z-score matrix to the fixed name "matrix"
        ## that the matrix_output dataset collects from the working directory.
        #if $precomputedZscore_conditional.precomputedZscore_selector == 'scratch':
            #if $matrix_h5_cooler.ext == 'cool':
                && mv galaxy_tad_prefix*.cool matrix
            #elif $matrix_h5_cooler.ext == 'h5':
                && mv galaxy_tad_prefix*.h5 matrix
            #end if
        #end if
    ]]>    </command>
    <inputs>
        <!-- Input Hi-C matrix; the parameter itself is defined by the imported macro. -->
        <expand macro='matrix_h5_cooler_macro' />
        <!-- Either compute the TAD-separation score from scratch (window parameters required)
             or reuse a score file and z-score matrix from a previous run. -->
        <conditional name='precomputedZscore_conditional'>
            <param name='precomputedZscore_selector' type="select" label="Compute from scratch or use precomputed data">
                <option value='scratch' selected='True'>From scratch</option>
                <option value='precomputed'>Precomputed z-score matrix</option>
            </param>
            <when value='scratch'>
                <param argument="--minDepth" type="integer" value="5000" label="Minimum window length (in bp) to be considered to the left and to the right of each Hi-C bin." help="This number should be at least 3 times as large as the bin size of the Hi-C matrix." />
                <param argument="--maxDepth" type="integer" value="10000" label="Maximum window length (in bp) to be considered to the left and to the right of each Hi-C bin." help="This number should be around 6-10 times as large as the bin size of the Hi-C matrix." />
                <param argument="--step" type="integer" value="10000" label="Step size when moving from minDepth to maxDepth" help="The step size grows exponentially as minDepth + (step * int(x)**1.5) for x in [0, 1, ...]
                        until it reaches maxDepth. For example, selecting step=10,000, minDepth=20,000
                        and maxDepth=150,000 will compute TAD-scores for window sizes:
                        20,000, 30,000, 40,000, 70,000 and 100,000" />
            </when>
            <when value='precomputed'>
                <param name="scoreFile" type="data" format='bedgraph' label="Precomputed TAD score file (bm)" />
                <param name="zscoreMatrix" type="data" format='cool,h5' label="Precomputed z-score matrix" />
            </when>
        </conditional>
        <!-- Multiple testing correction; a threshold is only requested for fdr and bonferroni. -->
        <conditional name="multiple_comparison_conditional">
            <param name="multiple_comparison_selector" type="select" label="Multiple Testing Corrections">
                <option value="fdr" selected="True">False discovery rate</option>
                <option value="bonferroni">Bonferroni correction</option>
                <option value="None">No correction</option>
            </param>
            <when value="fdr">
                <param argument="--threshold" type="float" value="0.01" label="q-value" />
            </when>
            <when value="bonferroni">
                <param argument="--threshold" type="float" value="0.01" label="p-value" />
            </when>
            <when value="None" />
        </conditional>
        <param argument="--delta" type="float" value="0.001" optional="True" label="Minimum threshold of the difference between the TAD-separation score of a putative boundary and the mean of the TAD-sep. score of surrounding bins." help="The delta value reduces spurious boundaries that are shallow, which usually
                        occur at the center of large TADs when the TAD-sep. score is flat. Higher
                        delta threshold values produce more conservative boundary estimations. By
                        default, multiple delta thresholds are saved for the following delta
                        values: 0.001, 0.01, 0.03, 0.05, 0.1. Other single or multiple values
                        can be given." />
        <param argument="--minBoundaryDistance" type="integer" value="" optional="True" label="Minimum distance between boundaries (in bp)." help="This parameter can be used to reduce spurious boundaries caused by noise. " />
        <!-- Optional restriction to named chromosomes; empty names are rejected by the validator. -->
        <repeat name="chromosomes" title="List of chromosomes to be included in the TAD calling" min="0">
            <param name="chromosome" type="text" label='chromosome (one per field)'>
                <validator type="empty_field" />
            </param>
        </repeat>
    </inputs>
    <outputs>
        <!-- Result files that are always produced; each is collected from the job working
             directory under the galaxy_tad_prefix output prefix. -->
        <data name="boundaries"
              format="bed"
              from_work_dir="galaxy_tad_prefix_boundaries.bed"
              label="${tool.name} on ${on_string}: Boundary positions" />
        <data name="score"
              format="bedgraph"
              from_work_dir="galaxy_tad_prefix_score.bedgraph"
              label="${tool.name} on ${on_string}: Matrix with multi-scale TAD scores" />
        <data name="domains"
              format="bed"
              from_work_dir="galaxy_tad_prefix_domains.bed"
              label="${tool.name} on ${on_string}: TAD domains" />
        <data name="boundaries_bin"
              format="gff"
              from_work_dir="galaxy_tad_prefix_boundaries.gff"
              label="${tool.name} on ${on_string}: Boundary information plus score" />

        <!-- The next two datasets are only created when the score is computed from scratch. -->
        <data name="tad_score"
              format="bedgraph"
              from_work_dir="galaxy_tad_prefix_tad_score.bm"
              label="${tool.name} on ${on_string}: TAD information in bm file">
            <filter>precomputedZscore_conditional.precomputedZscore_selector == 'scratch'</filter>
        </data>
        <!-- Declared as h5 and switched to cool when the input matrix has the cool extension. -->
        <data name="matrix_output"
              format="h5"
              from_work_dir="matrix"
              label="${tool.name} z-score matrix on ${on_string}">
            <filter>precomputedZscore_conditional.precomputedZscore_selector == 'scratch'</filter>
            <change_format>
                <when input="matrix_h5_cooler.ext" value="cool" format="cool" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: compute the TAD-separation score from scratch on an h5 matrix with FDR
             correction (threshold 0.8). Checks all four core outputs plus the two outputs
             that only exist in scratch mode (tad_score and matrix_output). -->
        <test>
            <param name="matrix_h5_cooler" value="small_test_matrix.h5" />
            <conditional name="precomputedZscore_conditional">
                <param name="precomputedZscore_selector" value="scratch" />
                <param name="minDepth" value="15000" />
                <param name="maxDepth" value="30000" />
                <param name="step" value="15000" />
            </conditional>
            <param name="minBoundaryDistance" value="5000" />
            <conditional name="multiple_comparison_conditional">
                <param name="multiple_comparison_selector" value="fdr" />
                <param name="threshold" value="0.8" />
            </conditional>
            <output name="boundaries" file="find_TADs/multiFDR_boundaries.bed" ftype="bed" />
            <output name="boundaries_bin" file="find_TADs/multiFDR_boundaries.gff" ftype="gff" />
            <output name="domains" file="find_TADs/multiFDR_domains.bed" ftype="bed" />
            <!-- The score file is checked by shape and content markers rather than by file comparison. -->
            <output name="score" ftype="bedgraph">
                <assert_contents>
                    <has_n_lines n="24097" />
                    <has_n_columns n="4" />
                    <has_text text="chr2LHet" />
                    <has_text text="chrYHet" />
                    <has_text text="chrX" />
                </assert_contents>
            </output>
            <output name="tad_score" file="find_TADs/multiFDR_tad_score.bm" ftype="bedgraph" />
            <output name="matrix_output" ftype="h5">
                <assert_contents>
                    <has_h5_keys keys='intervals,matrix' />
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: reuse a precomputed score file and z-score matrix with FDR correction
             (threshold 0.1). The expected boundary and domain files are the same ones used
             by the first test; the scratch-only outputs are not checked here. -->
        <test>
            <param name="matrix_h5_cooler" value="small_test_matrix.h5" />
            <conditional name="precomputedZscore_conditional">
                <param name="precomputedZscore_selector" value="precomputed" />
                <param name="scoreFile" value="find_TADs/multiFDR_tad_score.bm" />
                <param name="zscoreMatrix" value="find_TADs/multiFDR_zscore_matrix.h5" />
            </conditional>
            <param name="minBoundaryDistance" value="5000" />
            <conditional name="multiple_comparison_conditional">
                <param name="multiple_comparison_selector" value="fdr" />
                <param name="threshold" value="0.1" />
            </conditional>
            <output name="boundaries" file="find_TADs/multiFDR_boundaries.bed" ftype="bed" />
            <output name="boundaries_bin" file="find_TADs/multiFDR_boundaries.gff" ftype="gff" />
            <output name="domains" file="find_TADs/multiFDR_domains.bed" ftype="bed" />
            <output name="score" ftype="bedgraph">
                <assert_contents>
                    <has_n_lines n="24097" />
                    <has_n_columns n="4" />
                    <has_text text="chr2LHet" />
                    <has_text text="chrYHet" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

Calculate Topologically Associating Domains
===========================================

Topologically associating domains (TADs) are large, mainly self-interacting genomic regions. Chromatin interactions occur with higher frequency within a TAD than between TADs. More information_.

_________________

Usage
-----

This tool must be used on unmerged matrices (restriction enzyme resolution) produced by ``hicBuildMatrix`` and corrected by ``hicCorrectMatrix``.

_________________

Computation details
-------------------

**hicFindTADs** computes the TAD regions in two steps: in a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:

  “The absolute value of z represents the distance between the raw score and the population mean in
  units of the standard deviation. z is negative when the raw score is below the mean, positive when above.”
  [Source_].

.. image:: $PATH_TO_IMAGES/z-score.svg
   :width: 100

`Source of image <https://wikimedia.org/api/rest_v1/media/math/render/svg/5ceed701c4042bb34618535c9a902ca1a937a351>`_

In our case the distribution describes the counts per bin of a genomic distance. In a second step the local minima of the TAD-separation score are evaluated with respect to the surrounding bins to assign a p-value. Two multiple testing corrections can be applied to filter the results: `Bonferroni <https://en.wikipedia.org/wiki/Bonferroni_correction>`_ or the `false discovery rate <https://en.wikipedia.org/wiki/False_discovery_rate>`_.

_________________

Output
------

**hicFindTADs** produces multiple outputs:

- TAD boundaries positions as a BED file and TAD separation score.
- TAD boundaries positions with delta, p-value and TAD separation score as GFF.
- TAD domains as a BED file.
- TAD separation score as bigwig (bw), bedgraph and numpy array (npz) format. These files can be used to plot the so-called TAD insulation score or TAD separation score along the genome or at specific regions. This score is much more reliable across samples than the number of TADs or the TADs width that can vary depending on the sequencing depth because of the lack of information at certain bins, and depending on the parameters used with this tool.
- Matrix with multi-scale TAD scores as a bed-matrix (bm) file that can be plotted inside ``pyGenomeTracks`` to nicely display TAD insulation score alongside Hi-C heatmap and other datasets.
- Z-score matrix in h5 format that is useful to quickly test the --thresholdComparisons, --delta and --correctForMultipleTesting parameters by using the --TAD_sep_score_prefix option pointing to this zscore_matrix.h5 file (will be added in a future update).

_________________

Usage hints
-----------

It is mandatory to test multiple parameters of TAD calling with **hicFindTADs** before making conclusions about the number of TADs in a given sample or before comparing TAD calling between multiple conditions. In order to compare numerous TAD calling parameters at once, it is recommended to use ``pyGenomeTracks``, below you can find a plot where multiple TAD calling parameters are displayed for *Drosophila melanogaster* embryos:

.. image:: $PATH_TO_IMAGES/hicFindTADs_TAD_calling_comparison.png
   :width: 65 %

We can see that the fourth set of **hicFindTADs** parameters with a threshold of 0.001 gives the best results in terms of TAD calling compared to the corrected Hi-C counts distribution and compared to the enrichment of H3K36me3, which is known to be enriched at TAD boundaries in *Drosophila melanogaster*.

_________________

For more information about HiCExplorer please consider our documentation on readthedocs.io_

.. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
.. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score
.. _information: https://en.wikipedia.org/wiki/Topologically_associating_domain
]]>    </help>
    <expand macro="citations" />
</tool>