view hicFindTADs.xml @ 6:033c14b59b9a draft

planemo upload for repository https://github.com/maxplanck-ie/HiCExplorer/tree/master/galaxy/wrapper/ commit 8def073a083b0619e366436a067a614555190058
author iuc
date Tue, 02 Jan 2018 10:14:10 -0500
parents db2cc9e1ff76
children b05f292d220c
line wrap: on
line source

<tool id="hicexplorer_hicfindtads" name="@BINARY@" version="@WRAPPER_VERSION@.0">
    <description>find minimum cuts that correspond to boundaries</description>
    <!-- Local token plus shared definitions. The BINARY token holds the executable
         name and is substituted into the tool's name attribute. macros.xml is
         imported for the macros and tokens this file references but does not
         define itself (requirements, citations, matrix_h5_cooler_macro,
         WRAPPER_VERSION, THREADS); presumably all live there, confirm against
         macros.xml. -->
    <macros>
        <token name="@BINARY@">hicFindTADs</token>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Run the TAD caller. All result files share the fixed prefix
        ## galaxy_tad_prefix, which the outputs section relies on (from_work_dir).
        @BINARY@
                --matrix '$matrix_h5_cooler'

                ## delta is declared optional: only pass the flag when a value was
                ## entered, otherwise a bare --delta without an argument is emitted.
                #if str($delta) != '':
                --delta $delta
                #end if

                #if $minBoundaryDistance:
                --minBoundaryDistance $minBoundaryDistance
                #end if
                --minDepth $minDepth
                --maxDepth $maxDepth
                --step $step

                ## Multiple testing correction. A threshold parameter only exists
                ## in the fdr and bonferroni branches of the conditional.
                #if $multiple_comparison_conditional.multiple_comparison_selector == 'fdr':
                    --correctForMultipleTesting fdr
                    --threshold $multiple_comparison_conditional.threshold
                #elif $multiple_comparison_conditional.multiple_comparison_selector == 'bonferroni':
                    --correctForMultipleTesting bonferroni
                    --threshold $multiple_comparison_conditional.threshold
                #else:
                    ## Use the same option as the branches above to switch the correction off.
                    --correctForMultipleTesting None
                #end if

                --numberOfProcessors @THREADS@
                --outPrefix galaxy_tad_prefix
    ]]></command>
    <inputs>
        <!-- Hi-C matrix input. The macro is not defined in this file; presumably it
             comes from macros.xml and declares the matrix_h5_cooler parameter that
             the command template reads. -->
        <expand macro="matrix_h5_cooler_macro" />

        <!-- Window sizes (in bp) over which the TAD-separation score is computed. -->
        <param argument="--minDepth" type="integer" value="40000"
                label="Minimum window length (in bp) to be considered to the left and to the right of each Hi-C bin."
                help="This number should be at least 3 times as large as the bin size of the Hi-C matrix."/>
        <param argument="--maxDepth" type="integer" value="100000"
                label="Maximum window length (in bp) to be considered to the left and to the right of each Hi-C bin."
                help="This number should be around 6-10 times as large as the bin size of the Hi-C matrix."/>
        <param argument="--step" type="integer" value="10000"
                label="Step size when moving from minDepth to maxDepth"
                help="The step size grows exponentially as minDepth + (step * int(x)**1.5) for x in [0, 1, ...]
                until it reaches maxDepth. For example, selecting step=10,000, minDepth=20,000
                and maxDepth=150,000 will compute TAD-scores for window sizes:
                20,000, 30,000, 40,000, 70,000 and 100,000"/>

        <!-- Choice of multiple testing correction. A threshold is only asked for
             when a correction is selected. -->
        <conditional name="multiple_comparison_conditional">
            <param name="multiple_comparison_selector" type="select" label="Multiple Testing Corrections">
                <option value="fdr" selected="True">False discovery rate</option>
                <option value="bonferroni">Bonferroni correction</option>
                <option value="None">No correction</option>
            </param>
            <when value="fdr">
                <param name="threshold" type="float" value="0.01" label="q-value" />
            </when>
            <when value="bonferroni">
                <param name="threshold" type="float" value="0.01" label="p-value" />
            </when>
            <when value="None" />
        </conditional>

        <!-- Optional boundary filters: both may be left empty. -->
        <param argument="--delta" type="float" value="0.001" optional="True"
                label="Minimum threshold of the difference between the TAD-separation score of a putative boundary and the mean of the TAD-sep. score of surrounding bins."
                help="The delta value reduces spurious boundaries that are shallow, which usually
                        occur at the center of large TADs when the TAD-sep. score is flat. Higher
                        delta threshold values produce more conservative boundary estimations. By
                        default, multiple delta thresholds are saved for the following delta
                        values: 0.001, 0.01, 0.03, 0.05, 0.1. Other single or multiple values
                        can be given."/>

        <param argument="--minBoundaryDistance" type="integer" value="" optional="True"
                label="Minimum distance between boundaries (in bp)."
                help="This parameter can be used to reduce spurious boundaries caused by noise."/>

    </inputs>
    <outputs>
        <!-- Every dataset is picked up from the job working directory; the file
             names all start with the galaxy_tad_prefix prefix passed to the
             command via its outPrefix option. -->
        <data name="boundaries" format="bed"
            from_work_dir="galaxy_tad_prefix_boundaries.bed"
            label="${tool.name} on ${on_string}: Boundary positions" />
        <data name="score" format="bedgraph"
            from_work_dir="galaxy_tad_prefix_score.bedgraph"
            label="${tool.name} on ${on_string}: Matrix with multi-scale TAD scores" />
        <data name="domains" format="bed"
            from_work_dir="galaxy_tad_prefix_domains.bed"
            label="${tool.name} on ${on_string}: TAD domains" />
        <data name="boundaries_bin" format="gff"
            from_work_dir="galaxy_tad_prefix_boundaries.gff"
            label="${tool.name} on ${on_string}: Boundary information plus score" />
        <!-- The .bm file is registered with the bedgraph datatype. -->
        <data name="tad_score" format="bedgraph"
            from_work_dir="galaxy_tad_prefix_tad_score.bm"
            label="${tool.name} on ${on_string}: TAD information in bm file" />
        <data name="matrix_output" format="h5"
            from_work_dir="galaxy_tad_prefix_zscore_matrix.h5"
            label="${tool.name} on ${on_string}: Z-score matrix in h5" />
    </outputs>
    <tests>
        <!-- Single scenario: FDR correction with q-value 0.1 on the small test
             matrix. The delta parameter is not set here, so its declared default
             applies. -->
        <test>
            <param name="matrix_h5_cooler" value="small_test_matrix.h5"/>

            <param name="minDepth" value="60000"/>
            <param name="maxDepth" value="180000"/>
            <param name="step" value="20000"/>
            <param name="minBoundaryDistance" value="20000" />
            <conditional name="multiple_comparison_conditional">
                <param name="multiple_comparison_selector" value="fdr"/>
                <param name="threshold" value="0.1" />
            </conditional>

            <!-- Outputs are compared by size only (sim_size) with the given delta. -->
            <output name="boundaries" ftype="bed"
                file="find_TADs/multiFDR_boundaries.bed"
                compare="sim_size" delta="35000" />
            <output name="boundaries_bin" ftype="gff"
                file="find_TADs/multiFDR_boundaries.gff"
                compare="sim_size" delta="35000" />
            <output name="domains" ftype="bed"
                file="find_TADs/multiFDR_domains.bed"
                compare="sim_size" delta="35000" />
            <output name="score" ftype="bedgraph"
                file="find_TADs/multiFDR_score.bedgraph"
                compare="sim_size" delta="35000" />
            <output name="tad_score" ftype="bedgraph"
                file="find_TADs/multiFDR_tad_score.bm"
                compare="sim_size" delta="35000" />
            <output name="matrix_output" ftype="h5"
                file="find_TADs/multiFDR_zscore_matrix.h5"
                compare="sim_size" delta="50000" />
        </test>
    </tests>
    <help><![CDATA[
Calculate TADs
==============

Topologically associating domains (TADs) are regions of the DNA that interact frequently within themselves but rarely across their boundaries. More information_.

Calculation
------------
``hicFindTADs`` computes the TAD regions in two steps: In a first step it computes a TAD-separation score based on a z-score matrix for all bins. The z-score is defined as:

“The absolute value of z represents the distance between the raw score and the population mean in 
units of the standard deviation. z is negative when the raw score is below the mean, positive when above.” 
[Source_].

.. image:: $PATH_TO_IMAGES/z-score.svg
   :width: 100
   
`Source of image <https://wikimedia.org/api/rest_v1/media/math/render/svg/5ceed701c4042bb34618535c9a902ca1a937a351>`_

In our case the distribution describes the counts per bin of a genomic distance. In a second step the local minima of the TAD-separation score are evaluated with respect to the surrounding bins to assign a p-value. Two multiple testing corrections can be applied to filter the results: `Bonferroni <https://en.wikipedia.org/wiki/Bonferroni_correction>`_ or the `false discovery rate <https://en.wikipedia.org/wiki/False_discovery_rate>`_.


Input
-----

Parameters
__________
- contact matrix to compute the TADs on
- minimum window length
- maximum window length
- step size
- multiple testing correction
- minimum threshold
- minimum distance


hicFindTADs tries to identify sensible parameters, but those can be changed to identify a more stringent set of boundaries.

Output
------

- Boundary positions as a bed file
- Matrix with multi-scale TAD scores as a bedgraph 
- TAD domains as a bed file
- Boundary information plus score as gff
- TAD information in bm file
- Z-score matrix in h5

The calculated TAD regions can be plotted with ``hicPlotTADs``.


.. image:: $PATH_TO_IMAGES/master_TADs_plot.png
   :width: 80 %



For more information about HiCExplorer please consider our documentation on readthedocs.io_

.. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
.. _Source: https://en.wikipedia.org/wiki/Standard_score#Calculation_from_raw_score 
.. _information: https://en.wikipedia.org/wiki/Topologically_associating_domain
]]></help>
    <expand macro="citations" />
</tool>