Mercurial > repos > iuc > metabat2_jgi_summarize_bam_contig_depths

<tool id="metabat2_jgi_summarize_bam_contig_depths" name="Calculate contig depths" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>for MetaBAT2</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
## Command template for jgi_summarize_bam_contig_depths.
## Short aliases for the nested parameter groups referenced below.
#set $adv_opts = $advanced
#set $ref_opts = $advanced.use_reference_cond
#set $shred_opts = $shredding
jgi_summarize_bam_contig_depths
--outputDepth '$outputDepth'
--percentIdentity $adv_opts.percentIdentity
## The paired-contigs output is requested only when the user opted in.
#if str($adv_opts.output_paired_contigs) == 'yes':
    --pairedContigs '$outputPairedContigs'
#end if
## Boolean params expand to their flag when checked and to nothing otherwise.
$adv_opts.noIntraDepthVariance
$adv_opts.showDepth
--minMapQual $adv_opts.minMapQual
--weightMapQual $adv_opts.weightMapQual
$adv_opts.includeEdgeBases
--maxEdgeBases $adv_opts.maxEdgeBases
## Reference-dependent options and outputs.
#if str($ref_opts.use_reference) == 'yes':
    ## A cached reference is a data table entry (use its path field);
    ## a history reference is a dataset and is used directly.
    #if str($ref_opts.reference_cond.reference_source) == 'cached':
        #set $reference_fasta = $ref_opts.reference_cond.referenceFasta.fields.path
    #else:
        #set $reference_fasta = $ref_opts.reference_cond.referenceFasta
    #end if
    --referenceFasta '$reference_fasta'
    --outputGC '$outputGC'
    --gcWindow $ref_opts.gcWindow
    --outputReadStats '$outputReadStats'
    --outputKmers '$outputKmers'
#end if
--shredLength $shred_opts.shredLength
--shredDepth $shred_opts.shredDepth
--minContigLength $shred_opts.minContigLength
--minContigDepth $shred_opts.minContigDepth
## Every selected BAM file is appended as a positional argument.
#for $bam_file in $bam_inputs:
    '$bam_file'
#end for
    ]]></command>
    <inputs>
        <!-- One or more BAM datasets; each selected file is passed to the program as a positional argument. -->
        <param name="bam_inputs" type="data" format="bam" multiple="true" label="Sorted bam files"/>
        <!-- Read filtering and depth calculation settings, plus the optional reference genome. -->
        <section name="advanced" title="Advanced options">
            <!-- A percentage, so the accepted range is limited to 0-100. -->
            <param argument="--percentIdentity" type="integer" value="97" min="0" max="100" label="Minimum end-to-end percent identity of qualifying reads"/>
            <!-- Choosing "yes" adds the paired contigs dataset to the job outputs. -->
            <param name="output_paired_contigs" type="select" display="radio" label="Output the sparse matrix of contigs which paired reads span?">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <param argument="--noIntraDepthVariance" type="boolean" truevalue="--noIntraDepthVariance" falsevalue="" checked="false" label="Remove variance from mean depth along the contig?"/>
            <param argument="--showDepth" type="boolean" truevalue="--showDepth" falsevalue="" checked="false" label="Output a depth file per bam for each contig base?"/>
            <!-- Mapping quality cannot be negative, so a lower bound of 0 is enforced. -->
            <param argument="--minMapQual" type="integer" value="0" min="0" label="Minimum mapping quality necessary to count a read as mapped"/>
            <param argument="--weightMapQual" type="float" value="0.0" label="Weight per-base depth based on the MQ of the read" help="Zero value disables"/>
            <param argument="--includeEdgeBases" type="boolean" truevalue="--includeEdgeBases" falsevalue="" checked="false" label="Include 1-readlength edges when calculating depth and variance?"/>
            <param argument="--maxEdgeBases" type="integer" value="75" label="Maximum length when calculating depth and variance" help="Ignored when including 1-readlength edges when calculating depth and variance"/>
            <!-- Choosing "yes" enables the GC histogram, read statistics and kmer count outputs. -->
            <conditional name="use_reference_cond">
                <param name="use_reference" type="select" label="Select a reference genome?">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="no"/>
                <when value="yes">
                    <conditional name="reference_cond">
                        <param name="reference_source" type="select" label="Choose the source for the reference genome">
                            <option value="cached" selected="true">locally cached</option>
                            <option value="history">from history</option>
                        </param>
                        <when value="cached">
                            <!-- Options come from the fasta_indexes data table, filtered by the dbkey of the selected BAM inputs. -->
                            <param argument="--referenceFasta" type="select" label="Using reference genome">
                                <options from_data_table="fasta_indexes">
                                    <filter type="data_meta" column="1" key="dbkey" ref="bam_inputs"/>
                                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected BAM file(s)"/>
                                </options>
                            </param>
                        </when>
                        <when value="history">
                            <param argument="--referenceFasta" type="data" format="fasta,fasta.gz" label="Using reference genome" help="Must be the reference used to map the input bam files"/>
                        </when>
                    </conditional>
                    <param argument="--gcWindow" type="integer" value="100" label="Sliding window size for GC calculations"/>
                </when>
            </conditional>
        </section>
        <section name="shredding" title="Options to control shredding contigs that are under-represented by the reads">
            <param argument="--shredLength" type="integer" value="16000" label="Maximum length of the shreds"/>
            <param argument="--shredDepth" type="integer" value="5" label="Depth to generate overlapping shreds"/>
            <param argument="--minContigLength" type="integer" value="1" label="Minimum length of contig to include for mapping and shredding"/>
            <param argument="--minContigDepth" type="float" value="0.0" label="Minimum depth along the contig at which to break the contig"/>
        </section>
    </inputs>
    <outputs>
        <!-- Always produced: the depth matrix. -->
        <data name="outputDepth"
              format="tabular"
              label="${tool.name} on ${on_string} (depth matrix)"/>
        <!-- Produced only when the paired contigs option is set to "yes". -->
        <!-- NOTE(review): the input label calls this a sparse matrix, yet the format is fasta; confirm the datatype. -->
        <data name="outputPairedContigs"
              format="fasta"
              label="${tool.name} on ${on_string} (paired contigs)">
            <filter>advanced["output_paired_contigs"] == "yes"</filter>
        </data>
        <!-- The three datasets below are produced only when a reference genome is selected. -->
        <data name="outputGC"
              format="tabular"
              label="${tool.name} on ${on_string} (gc coverage histogram)">
            <filter>advanced["use_reference_cond"]["use_reference"] == "yes"</filter>
        </data>
        <data name="outputReadStats"
              format="tabular"
              label="${tool.name} on ${on_string} (read statistics)">
            <filter>advanced["use_reference_cond"]["use_reference"] == "yes"</filter>
        </data>
        <data name="outputKmers"
              format="tabular"
              label="${tool.name} on ${on_string} (perfect kmer counts)">
            <filter>advanced["use_reference_cond"]["use_reference"] == "yes"</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: one BAM file, all defaults; only the depth matrix is expected. -->
        <test expect_num_outputs="1">
            <param name="bam_inputs" value="input1.bam" ftype="bam"/>
            <output name="outputDepth" file="jgi_output1.tabular" ftype="tabular" compare="contains"/>
        </test>
        <!-- Test 2: the same BAM file supplied twice, all defaults. -->
        <test expect_num_outputs="1">
            <param name="bam_inputs" value="input1.bam,input1.bam" ftype="bam"/>
            <output name="outputDepth" file="jgi_output2.tabular" ftype="tabular" compare="contains"/>
        </test>
        <!-- Test 3: one BAM file, paired contigs requested, reference genome taken from the history. -->
        <test expect_num_outputs="5">
            <param name="bam_inputs" value="input2.bam" ftype="bam" dbkey="89"/>
            <section name="advanced">
                <param name="output_paired_contigs" value="yes"/>
                <conditional name="use_reference_cond">
                    <param name="use_reference" value="yes"/>
                    <conditional name="reference_cond">
                        <param name="reference_source" value="history"/>
                        <param name="referenceFasta" value="NC_002945v4.fasta" ftype="fasta"/>
                    </conditional>
                </conditional>
            </section>
            <output name="outputDepth" file="jgi_output_depth1.tabular" ftype="tabular" compare="contains"/>
            <output name="outputPairedContigs" file="jgi_output_paired_contigs1.fasta" ftype="fasta"/>
            <output name="outputGC" file="jgi_output_gc1.tabular" ftype="tabular"/>
            <output name="outputReadStats" file="jgi_output_read_stats1.tabular" ftype="tabular"/>
            <output name="outputKmers" file="jgi_output_kmers1.tabular" ftype="tabular"/>
        </test>
        <!-- Test 4: one BAM file, paired contigs requested, locally cached reference genome. -->
        <test expect_num_outputs="5">
            <param name="bam_inputs" value="input2.bam" ftype="bam" dbkey="89"/>
            <section name="advanced">
                <param name="output_paired_contigs" value="yes"/>
                <conditional name="use_reference_cond">
                    <param name="use_reference" value="yes"/>
                    <conditional name="reference_cond">
                        <param name="reference_source" value="cached"/>
                    </conditional>
                </conditional>
            </section>
            <output name="outputDepth" file="jgi_output_depth1.tabular" ftype="tabular" compare="contains"/>
            <output name="outputPairedContigs" file="jgi_output_paired_contigs1.fasta" ftype="fasta"/>
            <output name="outputGC" file="jgi_output_gc1.tabular" ftype="tabular"/>
            <output name="outputReadStats" file="jgi_output_read_stats1.tabular" ftype="tabular"/>
            <output name="outputKmers" file="jgi_output_kmers1.tabular" ftype="tabular"/>
        </test>
    </tests>
    <help>
**What it does**

Calculates coverage depth for each sequence in one or more selected BAM files, producing a tabular file (for each input)
having mean and variance of base coverage depth that can be used as one of the inputs for the MetaBAT2 metagenome binning
tool.

The algorithm used for calculating the coverage depth is adjusted by a few factors to improve the fidelity of the metrics
when correlating abundance coverage in the binning stage.  By default the following adjustments are applied.

**Edge bases are ignored**

Bases near the edges of a sequence are not counted as coverage; the excluded region at each edge is the lesser of one
average read length or --maxEdgeBases (default 75).  This is because most mappers cannot reliably place a read that would
extend off the edge of a sequence, and coverage depth tends to drop towards 0 at the edge of a contig or scaffold.  Use
--includeEdgeBases to include the coverage in this region.

**Reads with high mapping errors are skipped**

Reads that map imperfectly are excluded when the %ID of the mapping drops below a threshold (--percentIdentity=97).
MetaBAT2 is designed to resolve strain variation and mapping reads with low %ID indicate that the read actually came from
a different strain/species.

%ID is calculated from the CIGAR string and/or NM/MD fields and equals 100 * MatchedBases / (MatchedBases + Substitutions +
Insertions + Deletions).  This ensures that clips, insertions, deletions and mismatches are excluded from the coverage count.
Only the read bases that exactly match the reference are counted as coverage. This generally has a small effect, except in
the case of long reads from PacBio and Nanopore.

**More information**

https://bitbucket.org/berkeleylab/metabat/src/master/

**Options**

 * **Select a reference genome** - optionally select the reference genome that was used to map the input bam file(s) and 3 additional outputs will be produced; gc coverage histogram, read statistics and perfect kmer counts.

    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 28 Jan 2022 12:21:33 +0000
parents
children	1592150e38d2