Mercurial > repos > rnateam > footprint

<?xml version="1.0" encoding="UTF-8"?>
<tool id="footprint" name="footprint" version="1.0.0">
    <!-- Runtime dependency: the "footprint" package, pinned to release 1.0.0. -->
    <requirements>
        <requirement version="1.0.0" type="package">footprint</requirement>
    </requirements>
    <command detect_errors="aggressive"><![CDATA[
## Resolve the reference genome first: a FASTA dataset from the history,
## or the "path" field of the selected built-in index entry.
#if $refGenomeSource.genomeSource == "history":
    #set $reference_fasta = str($refGenomeSource.ownFile)
#else
    #set $reference_fasta = str($refGenomeSource.builtin.fields.path)
#end if
## Expose the alignment under a fixed file name in the working directory.
ln -s '$bam_file' ./bam_file.bam &&
## Positional arguments: bam, chromosome sizes, motif coordinates, genome,
## factor name, bias file, peak file, component count, background mode,
## fixed-background flag.
find_footprints.sh
    ./bam_file.bam
    '$chrom_sizes'
    '$motif_coords'
    '$reference_fasta'
    '$factor_name'
    '$bias_file'
    '$peak_file'
    $no_of_components
    $background
    $fixed_bg &&
## Rename the produced files to the fixed names the outputs are collected from.
mv *.PARAM PARAM &&
mv *.RESULTS RESULTS &&
mv *.plot2.png plot2.png &&
mv *.plot1.png plot1.png
    ]]></command>
    <inputs>
        <!-- Input datasets. Format names are written in lowercase throughout;
             "tabular" was previously misspelled on chrom_sizes. -->
        <param name="bam_file" type="data" format="bam" label="alignment bam file" help="" />
        <param name="chrom_sizes" type="data" format="tabular" label="chromosome length" help="" />
        <param name="motif_coords" type="data" format="bed" label="coordinates of motif" help="" />
        <!-- Reference genome: either an entry of the sam_fa_indices data table
             ("fai") or a FASTA dataset from the history ("history"). -->
        <conditional name="refGenomeSource">
            <param name="genomeSource" type="select"
                label="Will you select a reference genome from your history or use a built-in genome?"
                help="The version of genome against which the reads were aligned.">
                <option value="fai" selected="true">Use a built-in genome</option>
                <option value="history">Use a genome from my current history</option>
            </param>
            <when value="fai">
                <param name="builtin" type="select" label="Select a reference genome">
                    <options from_data_table="sam_fa_indices">
                        <filter type="sort_by" column="1" />
                        <validator type="no_options"
                            message="A built-in reference genome is not available for the build associated with the selected input file" />
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="ownFile" type="data" format="fasta"
                    label="Select the reference genome" help="Genome sequences in FASTA format" />
            </when>
        </conditional>
        <param name="factor_name" type="text" label="transcription factor" help="e.g. CTCF" />
        <param name="bias_file" type="data" format="tabular,txt" label="cleavage/transposition bias" help="" />
        <param name="peak_file" type="data" format="tabular" label="coordinates of ChIP-seq peaks" help="" />
        <!-- Model options; each value is passed unquoted to find_footprints.sh. -->
        <param name="no_of_components" type="select" label="number of components">
            <option value="2" selected="true">2</option>
            <option value="3">3</option>
        </param>
        <param name="background" type="select" label="background components">
            <option value="Seq" selected="true">Seq</option>
            <option value="Flat">Flat</option>
        </param>
        <param name="fixed_bg" type="select" label="fixed background component">
            <option value="TRUE" selected="true">TRUE</option>
            <option value="FALSE">FALSE</option>
        </param>
    </inputs>
    <!-- Each output is collected from the working directory under the fixed
         file name produced by the final mv steps of the command. -->
    <outputs>
        <data name="RESULTS"
              format="tabular"
              from_work_dir="RESULTS"
              label="${tool.name} on ${on_string}: results" />
        <data name="PARAM"
              format="txt"
              from_work_dir="PARAM"
              label="${tool.name} on ${on_string}: parameters" />
        <data name="plot1"
              format="png"
              from_work_dir="plot1.png"
              label="${tool.name} on ${on_string}: plot 1" />
        <data name="plot2"
              format="png"
              from_work_dir="plot2.png"
              label="${tool.name} on ${on_string}: plot 2" />
    </outputs>
    <tests>
        <!-- Single test: CTCF footprints on chr1 of hg19, with the reference
             genome taken from the history, 2 components and a fixed "Seq"
             background. Outputs are compared by size only. -->
        <test>
            <param name="bam_file" value="input_ATAC_HEK293_hg19_chr1.bam" />
            <param name="chrom_sizes" value="input_hg19.chr1.chrom.size" />
            <param name="motif_coords" value="input_CTCF_motifs_hg19_chr1.bed" />
            <conditional name="refGenomeSource">
                <param name="genomeSource" value="history" />
                <param name="ownFile" value="input_hg19_chr1.fa" />
            </conditional>
            <param name="factor_name" value="CTCF" />
            <param name="bias_file" value="input_SeqBias_ATAC.txt" />
            <param name="peak_file" value="input_CTCF_HEK293_chip_hg19_chr1.bed" />
            <param name="no_of_components" value="2" />
            <param name="background" value="Seq" />
            <param name="fixed_bg" value="TRUE" />
            <output name="RESULTS" file="output.RESULTS" ftype="tabular" compare="sim_size" />
            <output name="PARAM" file="output.PARAM" ftype="txt" compare="sim_size" />
            <!-- The plots are allowed to differ from the reference by up to 15000 bytes. -->
            <output name="plot1" file="output_plot1.png" ftype="png" compare="sim_size" delta="15000" />
            <output name="plot2" file="output_plot2.png" ftype="png" compare="sim_size" delta="15000" />
        </test>
    </tests>
    <help><![CDATA[.. class:: infomark

**Purpose**

This is a pipeline to find transcription factor footprints in ATAC-seq or DNase-seq data.

-----

.. class:: infomark

**Inputs**

alignment bam file
 * A bam file from the ATAC-seq or DNase-seq experiment.

chromosome length
 * A tab delimited file with 2 columns.
 * The first column is the chromosome name and the second column is the chromosome length for the appropriate organism and genome build.
 * Example: chr1    10000000

coordinates of motif
 * A 6-column bed file with the coordinates of motif matches (e.g. resulting from scanning the genome with a PWM) for the transcription factor of interest.
 * The 6 columns should contain chromosome, start coordinate, end coordinate, name, score and strand information in this order. The coordinates should be closed (1-based).
 * Example: chr1    24782   24800   .       11.60   -
 * There should not be any additional columns.

transcription factor
 * The name of the transcription factor of interest supplied by the user, e.g. CTCF.

cleavage/transposition bias
 * The cleavage/transposition bias of the different protocols, for all 6-mers.
 * Provided `options`_: ATAC, DNase double hit or DNase single hit protocols.

 .. _options: https://ohlerlab.mdc-berlin.de/software/Reproducible_footprinting_139/

coordinates of ChIP-seq peaks
 * A file with the coordinates of the ChIP-seq peaks for the transcription factor of interest.
 * The format is flexible as long as the first 3 columns (chromosome, start coordinate, end coordinate) are present.
 * Example: chr1    237622  237882

number of components
 * Total number of footprint and background components that should be learned from the data.
 * Options are 2 (1 fp and 1 bg) and 3 (2 fp and 1 bg) components.

background components
 * The mode of initialization for the background component. Options are "Flat" or "Seq".
 * Choosing "Flat" initializes this component as a uniform distribution.
 * Choosing "Seq" initializes it as the signal profile that would be expected solely due to the protocol bias (given by the cleavage/transposition bias file).

fixed background component
 * Whether the background component should be kept fixed.
 * Options are TRUE or FALSE.
 * Setting "TRUE" keeps this component fixed, whereas setting "FALSE" lets it be reestimated during training.
 * In general, if the background is estimated from bias (option "Seq"), it is recommended to keep it fixed.

-----

.. class:: infomark

**Outputs**

results
 * The results of the footprinting analysis.
 * The first 6 columns harbor the motif information (identical to the 'coordinates of motif').
 * The 7th column has the footprint score (log-odds of footprint versus background) for each motif instance.
 * The following columns show the probabilities for the individual footprint and background components.

parameters
 * Gives the trained parameters for the footprint and background components.
 * It includes as many lines as components (e.g. the first line has the parameters for the first component).

plot 1
 * A plot with two panels, showing the initial components above and the final trained components below.
 * The plotted values for the final components are given in the 'parameters' output file explained above.

plot 2
 * A plot with only the final trained components.
 * In a model where 2 components are used, this plot is identical to the bottom panel in plot 1.
 * When 3 components are used, this plot shows the weighted average of the 2 footprint components as the final footprint profile.

]]></help>
    <citations>
        <!-- Placeholder entry for a not-yet-published article. BibTeX requires
             author names to be separated by " and "; a comma-separated list is
             parsed as a single malformed name. -->
        <citation type="bibtex">@ARTICLE{footprint,
        author = {Aslihan Karabacak and Galip Gurkan Yardimci and Ricardo Wurmus and Dilmurat Yusuf and Uwe Ohler},
        title = {To submit},
        journal = {},
        year = {},
        volume = {},
        pages = {}
        }</citation>
    </citations>
</tool>
author	rnateam
date	Mon, 20 Nov 2017 05:04:27 -0500
parents	4bff424dfa47
children