Mercurial > repos > mvdbeek > damidseq_core

<tool id="damidseq_core" name="damidseq" version="0.1.5">
    <description>align, extend and normalize a DamID-seq experiment</description>
    <!-- Software dependency of this wrapper: the damidseq_pipeline package, pinned to version 1.4. -->
    <requirements>
        <requirement type="package" version="1.4">damidseq_pipeline</requirement>
    </requirements>
    <!-- Version probe: prints the tool help (stderr merged into stdout) and keeps only the
         lines that mention damidseq_pipeline. Presumably one of those lines carries the
         version string; confirm against the actual help output. -->
    <version_command><![CDATA[damidseq_pipeline --help 2>&1| grep damidseq_pipeline]]></version_command>
    <!-- Cheetah-templated shell command: stage the inputs under fixed names, run
         damidseq_pipeline in the job working directory, then move the files it
         produced to the locations the output section reads them from. -->
    <command detect_errors="aggressive"><![CDATA[
        ## Fixed staging names for the two FASTQ inputs; the '.gz' suffix is kept
        ## whenever the Galaxy datatype extension of the input ends in '.gz'.
        #set dam_file = 'A001.fastq.gz' if str($dam.ext).endswith('.gz') else 'A001.fastq'
        #set dam_fusion_file = 'A002.fastq.gz' if str($dam_fusion.ext).endswith('.gz') else 'A002.fastq'
        ## Suffix carried by the BAM file names when read extension is switched on.
        ## The boolean is compared through str() so the test does not depend on the
        ## parameter wrapper implementing equality against plain strings.
        #set EXT = "-ext%s" % $len if str($extend_reads) == '1' else ""
        ## The pipeline output is piped through sed further down. Enable pipefail
        ## where the shell supports it, so that a failing damidseq_pipeline stops
        ## the && chain with its own exit status instead of being masked by sed.
        set -o | grep -q pipefail && set -o pipefail;
        ln -f -s '$dam' $dam_file &&
        ln -f -s '$dam_fusion' $dam_fusion_file &&
        ## index.txt is the rendered 'index' configfile (sample index to sample name).
        ln -f -s '$index' index.txt &&
        ## HOME is pointed at the working directory for the duration of the run.
        HOME="\$PWD" damidseq_pipeline
        --bins=$bins
        --bowtie=1
        --bowtie2_genome_dir='$reference_index.fields.path'
        --extend_reads=$extend_reads
        --extension_method='$extension_method'
        $full_data_files
        --gatc_frag_file='$gatc_frag_file'
        --len=$len
        --max_norm_value='$max_norm_value'
        $method_subtract
        --min_norm_value='$min_norm_value'
        --norm_method=$norm_method
        --norm_steps=$norm_steps
        --output_format=$output_format
        --q=$q
        --qscore1max=$qscore1max
        --qscore1min=$qscore1min
        --qscore2max=$qscore2max
        ## stdout and stderr are merged, and every character outside
        ## A-Za-z0-9._- is replaced by a space before Galaxy captures the log.
        --threads=\${GALAXY_SLOTS:-4} 2>&1| LC_ALL=C sed -e 's/[^A-Za-z0-9._-]/ /g' &&
        ## Collect the results: the two BAM files go straight to the Galaxy output
        ## paths, the ratio files are renamed for from_work_dir pickup.
        mv 'Dam-${dam.element_identifier}${EXT}.bam' '$control_output' &&
        mv '${dam_fusion.element_identifier}${EXT}.bam' '$fusion_output' &&
        mv '${dam_fusion.element_identifier}-vs-Dam.gatc.$output_format' gatc.output
        #if str($full_data_files):
            && mv '${dam_fusion.element_identifier}-vs-Dam.$output_format' full.output
        #end if
    ]]></command>
    <configfiles>
        <!-- Two-line, tab-separated table rendered per job and linked as index.txt by the
             command: A1 is paired with the control name ("Dam-" plus the element identifier
             of the dam input), A2 with the element identifier of the dam_fusion input.
             The text below is data; its tabs and line break must be kept exactly.
             NOTE(review): presumably A1/A2 correspond to the staged A001/A002 FASTQ
             names; confirm against the damidseq_pipeline documentation. -->
        <configfile name="index">A1	Dam-$dam.element_identifier
A2	$dam_fusion.element_identifier</configfile>
    </configfiles>
    <!-- Tool form. Parameters declared with "argument" take their name from the
         damidseq_pipeline option of the same name (leading dashes removed); the
         command template passes each value to that option. -->
    <inputs>
        <!-- Sequencing reads: the Dam-fusion sample and the Dam-only control. -->
        <param name="dam_fusion" type="data" format="fastq,fastq.gz" label="Dam fusion fastq"/>
        <param argument="--dam" type="data" format="fastq,fastq.gz" label="Control Dam fastq"/>
        <!-- Reference genome, chosen from the bowtie2_indexes data table (sorted by column 2). -->
        <param name="reference_index" type="select" label="Select reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
          <options from_data_table="bowtie2_indexes">
            <filter type="sort_by" column="2"/>
            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
          </options>
        </param>
        <param argument="--gatc_frag_file" type="data" format="gff" label="GFF file with all GATC locations"/>
        <!-- Format of the ratio outputs; the option values must stay in sync with the
             change_format rules of the outputs section. -->
        <param name="output_format" type="select" label="Select the output format for the ratio files">
            <option value="bedgraph">bedGraph</option>
            <option value="gff">GFF</option>
        </param>
        <!-- Read extension settings. -->
        <param argument="--extend_reads" type="boolean" truevalue="1" falsevalue="0" checked="True" label="Perform read extension?"/>
        <param argument="--extension_method" type="select" label="Select the read extension method" help="GATC (default) extends reads to --len or to the next GATC site, whichever is shorter; this increases peak resolution. Full extends all reads to --len.">
            <option value="gatc">To nearest GATC site</option>
            <option value="full">Full</option>
        </param>
        <!-- Flag-style booleans: the truevalue is the literal command line switch. -->
        <param argument="--full_data_files" type="boolean" truevalue="--full_data_files" falsevalue="" label="Output full binned ratio files (not only GATC array)"/>
        <param argument="--len" type="integer" min="50" value="300" label="Length to extend reads to"/>
        <param argument="--bins" type="integer" min="10" value="75" label="Width of bins to use for mapping reads"/>
        <!-- Normalisation settings. -->
        <param argument="--min_norm_value" type="float" value="-5.0" label="Minimum log2 value to limit normalisation search at"/>
        <param argument="--max_norm_value" type="float" value="5.0" label="Maximum log2 value to limit normalisation search at"/>
        <param argument="--method_subtract" type="boolean" truevalue="--method_subtract" falsevalue="" label="Subtract Dam control values from Dam-fusion values instead of using the log2 ratio?"/>
        <param argument="--norm_method" type="select" label="Select normalization method">
            <option value="kde">kernel density estimation of log2 GATC fragment ratio (recommended)</option>
            <option value="rpm">readcounts per million reads (not recommended for most use cases)</option>
        </param>
        <param argument="--norm_steps" type="integer" min="1" value="300" label="Number of points in normalisation routine"/>
        <param argument="--q" type="integer" value="30" min="0" label="Cutoff average Q score for aligned reads"/>
        <!-- Bounds used for normalisation, each restricted to the range 0.0 to 1.0. -->
        <param argument="--qscore1min" type="float" min="0.0" value="0.4" max="1.0" label="min decile for normalising from Dam array"/>
        <param argument="--qscore1max" type="float" min="0.0" value="1.0" max="1.0" label="max decile for normalising from Dam array"/>
        <param argument="--qscore2max" type="float" min="0.0" value="0.9" max="1.0" label="max decile for normalising from fusion-protein array"/>
    </inputs>
    <!-- Datasets produced by a job. Every output carries the same "actions" block, which
         sets the dbkey metadata from column 1 of the bowtie2_indexes data table, using the
         row whose column 0 matches the selected reference_index (rows whose column 0
         starts with "#" are dropped first). -->
    <outputs>
        <!-- GATC-level ratio file, collected from gatc.output in the working directory.
             Declared as bedgraph and switched to gff when output_format is gff. -->
        <data name="output_ratio" format="bedgraph" from_work_dir="gatc.output" label="Dam-fusion vs Dam-only GATC ratio on ${on_string}" default_identifier_source="dam_fusion">
            <change_format>
                <when input="output_format" value="gff" format="gff" />
            </change_format>
            <actions>
                <action type="metadata" name="dbkey">
                    <option type="from_data_table" name="bowtie2_indexes" column="1" offset="0">
                        <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                        <filter type="param_value" ref="reference_index" column="0"/>
                    </option>
                </action>
            </actions>
        </data>
        <!-- Full binned ratio file, collected from full.output. Only created when the
             full_data_files option is enabled (see the filter). -->
        <data name="output_ratio_full" format="bedgraph" from_work_dir="full.output" label="Dam-fusion vs Dam-only full ratio on ${on_string}" default_identifier_source="dam_fusion">
            <filter>full_data_files</filter>
            <change_format>
                <when input="output_format" value="gff" format="gff" />
            </change_format>
            <actions>
                <action type="metadata" name="dbkey">
                    <option type="from_data_table" name="bowtie2_indexes" column="1" offset="0">
                        <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                        <filter type="param_value" ref="reference_index" column="0"/>
                    </option>
                </action>
            </actions>
        </data>
        <!-- BAM alignment of the Dam-only control; the command moves the file directly
             onto this output path.
             NOTE(review): the identifier source is dam_fusion although this is the control
             alignment; confirm that this is intended. -->
        <data name="control_output" format="bam" label="Dam-only alignment on ${on_string}" default_identifier_source="dam_fusion">
            <actions>
                <action type="metadata" name="dbkey">
                    <option type="from_data_table" name="bowtie2_indexes" column="1" offset="0">
                        <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                        <filter type="param_value" ref="reference_index" column="0"/>
                    </option>
                </action>
            </actions>
        </data>
        <!-- BAM alignment of the Dam-fusion sample; the command moves the file directly
             onto this output path. -->
        <data name="fusion_output" format="bam" label="Dam-fusion alignment on ${on_string}" default_identifier_source="dam_fusion">
            <actions>
                <action type="metadata" name="dbkey">
                    <option type="from_data_table" name="bowtie2_indexes" column="1" offset="0">
                        <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                        <filter type="param_value" ref="reference_index" column="0"/>
                    </option>
                </action>
            </actions>
        </data>
    </outputs>
    <!-- Functional tests. All three use the dm6 reference, the dm6.GATC.gff fragment file
         and rpm normalisation; parameters not listed keep their form defaults. -->
    <tests>
        <test>
            <!-- test gzipped fastq input with default read extension and bedgraph output -->
            <param name="dam" value="A001.fastq.gz"/>
            <param name="dam_fusion" value="A002.fastq.gz"/>
            <param name="gatc_frag_file" value="dm6.GATC.gff"/>
            <param name="reference_index" value="dm6"/>
            <param name="norm_method" value="rpm"/>
            <output name="output_ratio" file="output_ratio.bed"/>
            <output name="control_output" file="control.bam"/>
            <output name="fusion_output" file="fusion.bam"/>
        </test>
        <test>
            <!-- test full data output -->
            <param name="dam" value="A001.fastq"/>
            <param name="dam_fusion" value="A002.fastq"/>
            <param name="gatc_frag_file" value="dm6.GATC.gff"/>
            <param name="reference_index" value="dm6"/>
            <param name="norm_method" value="rpm"/>
            <param name="full_data_files" value="true"/>
            <output name="output_ratio" file="output_ratio.bed"/>
            <output name="output_ratio_full" file="output_ratio_full.bed"/>
            <output name="control_output" file="control.bam"/>
            <output name="fusion_output" file="fusion.bam"/>
        </test>
        <test>
            <!-- test no extension and gff output -->
            <param name="dam" value="A001.fastq"/>
            <param name="dam_fusion" value="A002.fastq"/>
            <param name="gatc_frag_file" value="dm6.GATC.gff"/>
            <param name="reference_index" value="dm6"/>
            <param name="norm_method" value="rpm"/>
            <param name="full_data_files" value="true"/>
            <param name="extend_reads" value="false"/>
            <param name="output_format" value="gff"/>
            <output name="output_ratio" file="output_ratio2.gff"/>
            <output name="output_ratio_full" file="output_ratio_full2.gff"/>
            <output name="control_output" file="control2.bam"/>
            <output name="fusion_output" file="fusion2.bam"/>
        </test>
    </tests>
    <help><![CDATA[

Processing DamID-seq data involves extending single-end reads, aligning
the reads to the genome and determining the coverage, similar to
processing regular ChIP-seq datasets. However, as DamID data is
represented as a log2 ratio of (Dam-fusion/Dam), normalisation of the
sample and Dam-only control is necessary and adding pseudocounts to
mitigate the effect of background counts is highly recommended.

damidseq_pipeline is a single script that automatically handles
sequence alignment, read extension, binned counts, normalisation,
pseudocount addition and final ratio file generation. The script uses
FASTQ or BAM files as input, and outputs the final log2 ratio files in
bedGraph (or optionally GFF) format.

The output ratio files can easily be converted to TDF for viewing in IGV using
igvtools. The files can be processed for peak calling using find_peaks or, if
using RNA pol II DamID, transcribed genes can be determined using
polii.gene.call.

        ]]></help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv386</citation>
    </citations>
</tool>
author	mvdbeek
date	Thu, 03 May 2018 05:12:02 -0400
parents	02f09108bcff
children