view plot.xml @ 2:9916308301da draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/checkm commit 3ae2bd72f789518e95ef65b97e0e5ac90165e113
author iuc
date Mon, 02 Sep 2024 13:51:37 +0000
parents 356839cd89d2
children
line wrap: on
line source

<tool id="checkm_plot" name="CheckM plot" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>
        for assessing the quality of genome bins
    </description>
    <macros>
        <!-- Shared definitions (tokens and macros not defined below) come from macros.xml -->
        <import>macros.xml</import>
        <!-- Input: list collection of GFF gene feature files, one element per bin -->
        <xml name="gff_inputs">
            <param name="gff" type="data_collection" collection_type="list" format="gff" label="Gene feature files for each bin"/>
        </xml>
        <!--
            Cheetah snippet used by the command: for every element of plot.gff, build an identifier from the
            element identifier (any character that is not whitespace, a word character, '-' or '.' becomes '_')
            and symlink the dataset to inputs/bins/IDENTIFIER/genes.gff.
            NOTE(review): the snippet calls re.sub, so the re module must already be imported in the template;
            presumably a token from macros.xml does that. Confirm.
        -->
        <token name="@PLOT_GFF_INPUTS@"><![CDATA[
#for $i in $plot.gff
    #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
mkdir -p 'inputs/bins/${identifier}' &&
ln -s '$i' 'inputs/bins/${identifier}/genes.gff' &&
#end for
]]></token>
        <!-- Input: tabular tetranucleotide profile(s); the parameter accepts multiple datasets -->
        <xml name="tetra_profile">
            <param name="tetra_profile" type="data" format="tabular" multiple="true" label="Tetranucleotide profiles for each bin" help="This can be generated using the tetra tool"/>
        </xml>   
        <!-- Input: reference distribution value (integer, 0 to 100); no default, so a value has to be entered -->
        <xml name="dist_value">
            <param argument="--dist_value" type="integer" min="0" max="100" value="" label="Reference distribution(s) to plot" />
        </xml>
        <!-- Inputs: window size and bar width of the GC histogram -->
        <xml name="gc_params">
            <param argument="--gc_window_size" type="integer" min="0" value="5000" label="Window size used to calculate GC histogram" />
            <param argument="--gc_bin_width" type="float" min="0" value="0.01" label="Width of GC bars in histogram" />
        </xml>
        <!-- Inputs: window size and bar width of the coding density (CD) histogram -->
        <xml name="cd_params">
            <param argument="--cd_window_size" type="integer" min="0" value="10000" label="Window size used to calculate CD histogram" />
            <param argument="--cd_bin_width" type="float" min="0" value="0.01" label="Width of CD bars in histogram" />
        </xml>
        <!-- Inputs: window size and bar width of the tetranucleotide distance (TD) histogram -->
        <xml name="td_params">
            <param argument="--td_window_size" type="integer" min="0" value="5000" label="Window size used to calculate TD histogram" />
            <param argument="--td_bin_width" type="float" min="0" value="0.01" label="Width of TD bars in histogram" />
        </xml>
        <!-- Input: white space around the figure, in inches -->
        <xml name="fig_padding">
            <param argument="--fig_padding" type="float" min="0" value="0.2" label="White space to place around figure" help="In inches"/>
        </xml>
        <!--
            Complete "when" block for the gc_bias_plot command (BAM file plus coverage filtering options).
            It is not expanded anywhere in this file; the matching select option is commented out in the inputs.
        -->
        <xml name="gc_bias_plot">
            <when value="gc_bias_plot">
                <param name="bam_file" type="data" format="bam" label="BAM file to interrogate for coverage information" help="The file should be sorted"/>
                <param argument="--window_size" type="integer" min="0" value="5000" label="Window size used to calculate plot statistics" />
                <param argument="--all_reads" type="boolean" truevalue="--all_reads" falsevalue="" checked="false" label="Use all reads to estimate coverage instead of just those in proper pairs?" />
                <param argument="--min_align" type="float" min="0" max="1" value="0.98" label="Minimum alignment length as percentage of read length"/>
                <param argument="--max_edit_dist" type="float" min="0" max="1" value="0.02" label="Maximum edit distance as percentage of read length"/>
            </when>
        </xml>
    </macros>
    <expand macro="biotools"/>
    <!-- samtools is added to the shared requirements because the gc_bias_plot command branch runs "samtools index" on the BAM input -->
    <expand macro="requirements">
        <requirement type="package" version="1.20">samtools</requirement>
    </expand>
    <expand macro="version"/>
    <!--
        Cheetah command template with one branch per value of plot.command.
        Every branch reads the bins from the 'bins' directory with the 'fasta' extension (presumably staged by
        the BIN_INPUTS token from macros.xml; TODO confirm) and writes its plots to 'output', which is the
        directory scanned by the output collections.
        Parameters nested in the "plot" conditional are always referenced through their fully qualified
        plot.NAME path; the image options (image_type, dpi, font_size, width, height) are top-level parameters.
    -->
    <command detect_errors="exit_code"><![CDATA[
@BIN_INPUTS@

## GC histogram and delta-GC plot; forwards the GC window size and bin width collected by the gc_plot inputs
#if $plot.command == 'gc_plot'
checkm gc_plot
    'bins'
    'output'
    $plot.dist_value
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --gc_window_size $plot.gc_window_size
    --gc_bin_width $plot.gc_bin_width

## Coding density histogram and delta-CD plot; the GFF files are linked below 'inputs/bins' first
#else if $plot.command == 'coding_plot'
@PLOT_GFF_INPUTS@
checkm coding_plot
    'inputs'
    'bins'
    'output'
    $plot.dist_value
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --cd_window_size $plot.cd_window_size
    --cd_bin_width $plot.cd_bin_width

## Tetranucleotide distance histogram and delta-TD plot
#else if $plot.command == 'tetra_plot'
@PLOT_GFF_INPUTS@
checkm tetra_plot
    'inputs'
    'bins'
    'output'
    '$plot.tetra_profile'
    $plot.dist_value
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --td_window_size $plot.td_window_size
    --td_bin_width $plot.td_bin_width

## GC, CD and TD distribution plots combined in one image
#else if $plot.command == 'dist_plot'
@PLOT_GFF_INPUTS@
checkm dist_plot
    'inputs'
    'bins'
    'output'
    '$plot.tetra_profile'
    $plot.dist_value
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --gc_window_size $plot.gc_window_size
    --gc_bin_width $plot.gc_bin_width
    --cd_window_size $plot.cd_window_size
    --cd_bin_width $plot.cd_bin_width
    --td_window_size $plot.td_window_size
    --td_bin_width $plot.td_bin_width

## Nx plot
#else if $plot.command == 'nx_plot'
checkm nx_plot
    'bins'
    'output'
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --step_size $plot.step_size

## Sequence length histogram
#else if $plot.command == 'len_hist'
checkm len_hist
    'bins'
    'output'
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height

## Marker gene position plot; the stats tables and per-bin gene files are copied below the 'inputs' directory
#else if $plot.command == 'marker_plot'
mkdir -p 'inputs/storage/' &&
cp '$plot.marker_gene_stats' 'inputs/storage/marker_gene_stats.tsv' &&
cp '$plot.bin_stats_ext' 'inputs/storage/bin_stats_ext.tsv' &&
#for $b in $plot.genes_fna
    #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($b.element_identifier))
mkdir -p 'inputs/bins/${identifier}' &&
cp '$b.file_name' 'inputs/bins/${identifier}/genes.faa' &&
#end for
checkm marker_plot
    'inputs'
    'bins'
    'output'
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --fig_padding $plot.fig_padding

## Coverage versus GC plot; not reachable while the gc_bias_plot option stays commented out in the inputs
#else if $plot.command == 'gc_bias_plot'
ln -s '$plot.bam_file' 'mapping.bam' &&
samtools index 'mapping.bam' 'mapping.bam.bai' &&

checkm gc_bias_plot
    'bins'
    'output'
    'mapping.bam'
    --extension 'fasta'
    --image_type '$image_type'
    --dpi $dpi
    --font_size $font_size
    --width $width
    --height $height
    --window_size $plot.window_size
    $plot.all_reads
    --min_align $plot.min_align
    --max_edit_dist $plot.max_edit_dist
    --threads \${GALAXY_SLOTS:-1}
#end if
    ]]></command>
    <inputs>
        <!-- Bin selection shared with the other tools of this suite (defined in macros.xml) -->
        <expand macro="bin_inputs"/>
        <!-- Plot type selector; each "when" block lists the parameters specific to that plot -->
        <conditional name="plot">
            <param name="command" type="select" label="Plot to generate">
                <option value="gc_plot">Create GC histogram and delta-GC plot</option>
                <option value="coding_plot">Create coding density (CD) histogram and delta-CD plot</option>
                <option value="tetra_plot">Create tetranucleotide distance (TD) histogram and delta-TD plot</option>
                <option value="dist_plot">Create image with GC, coding density (CD), and tetranucleotide distance (TD) distribution plots together</option>
                <option value="nx_plot">Create Nx-plots</option>
                <option value="len_hist">Sequence length histogram</option>
                <option value="marker_plot">Plot position of marker genes on sequences</option>
                <!-- gc_bias_plot is disabled: with the option below commented out, its command branch and output collection cannot be reached -->
                <!--<option value="gc_bias_plot">Plot bin coverage as a function of GC</option>-->
            </param>
            <when value="gc_plot">
                <expand macro="dist_value"/>
                <expand macro="gc_params"/>
            </when>
            <when value="coding_plot">
                <expand macro="gff_inputs"/>
                <expand macro="dist_value"/>
                <expand macro="cd_params"/>
            </when>
            <when value="tetra_plot">
                <expand macro="gff_inputs"/>
                <expand macro="tetra_profile"/>
                <expand macro="dist_value"/>
                <expand macro="td_params"/>
            </when>
            <when value="dist_plot">
                <expand macro="gff_inputs"/>
                <expand macro="tetra_profile"/>
                <expand macro="dist_value"/>
                <expand macro="gc_params"/>
                <expand macro="cd_params"/>
                <expand macro="td_params"/>
            </when>
            <when value="nx_plot">
                <param argument="--step_size" type="float" min="0" value="0.05" label="x step size for calculating Nx" />
            </when>
            <when value="len_hist">
                <!-- NOTE(review): fig_padding is collected here but the len_hist command line never uses it; confirm whether checkm len_hist supports this option -->
                <expand macro="fig_padding" />
            </when>
            <when value="marker_plot">
                <param name="genes_fna" type="data_collection" collection_type="list" format="fasta" label="Nucleotide gene sequences for each bin" help="Optional output of the CheckM tree or lineage_wf tools"/>
                <param name="marker_gene_stats" type="data" format="tabular" label="Marker gene stats" help="Output of the CheckM qa tool or optional output of the lineage_wf or taxonomy_wf tools"/>
                <param name="bin_stats_ext" type="data" format="tabular" label="Marker gene bin extensive stats" help="Output of the CheckM qa tool or optional output of the lineage_wf or taxonomy_wf tools"/>
                <expand macro="fig_padding" />
            </when>
        </conditional>
        <!-- Image options applied to every plot type -->
        <param argument="--image_type" type="select" label="Image type">
            <option value="eps">EPS</option>
            <option value="pdf">PDF</option>
            <option value="png" selected="true">PNG</option>
            <option value="ps">PS</option>
            <option value="svg">SVG</option>
        </param>
        <param argument="--dpi" type="integer" min="0" value="600" label="DPI of output image" />
        <param argument="--font_size" type="integer" min="0" value="8" label="Font size" />
        <param argument="--width" type="float" min="0" value="6.5" label="Width of output image" />
        <param argument="--height" type="float" min="0" value="3.5" label="Height of output image" />
    </inputs>
    <outputs>
        <!--
            One list collection per plot type, kept only when that plot type is selected.
            Files are discovered in output/ by their plot-specific suffix: the part before the suffix becomes
            the element identifier and the file extension becomes the datatype.
        -->
        <collection name="gc_plot" type="list" label="${tool.name} on ${on_string}: GC distribution plot">
            <filter>plot['command'] == 'gc_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gc_plots\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="coding_plot" type="list" label="${tool.name} on ${on_string}: Coding density (CD) distribution plot">
            <filter>plot['command'] == 'coding_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.coding_density_plots\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="tetra_plot" type="list" label="${tool.name} on ${on_string}: Tetranucleotide distance (TD) distribution plot">
            <filter>plot['command'] == 'tetra_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tetra_dist_plots\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="dist_plot" type="list" label="${tool.name} on ${on_string}: GC, Coding density (CD) and Tetranucleotide distance (TD) distribution plot">
            <filter>plot['command'] == 'dist_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.ref_dist_plots\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="nx_plot" type="list" label="${tool.name} on ${on_string}: Nx-plot">
            <filter>plot['command'] == 'nx_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.nx_plot\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="len_hist" type="list" label="${tool.name} on ${on_string}: Sequence length histogram">
            <filter>plot['command'] == 'len_hist'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.len_hist\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <collection name="marker_plot" type="list" label="${tool.name} on ${on_string}: Marker gene position plot">
            <filter>plot['command'] == 'marker_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.marker_pos_plot\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
        <!--
            The suffix used to be a copy of the marker_pos_plot one, which cannot match gc_bias_plot output.
            NOTE(review): confirm the exact file suffix written by checkm gc_bias_plot before re-enabling this plot type.
        -->
        <collection name="gc_bias_plot" type="list" label="${tool.name} on ${on_string}: Bin coverage as a function of GC">
            <filter>plot['command'] == 'gc_bias_plot'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gc_bias_plot\.(?P&lt;ext&gt;.+)" directory="output/"/>
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: gc_plot on a one-bin collection, EPS output; only the output size is asserted -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="gc_plot"/>
                <param name="dist_value" value="100" />
                <param name="gc_window_size" value="5000"/>
                <param name="gc_bin_width" value="0.01"/>
            </conditional>
            <param name="image_type" value="eps"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="gc_plot" count="1">
                <element name="637000110" ftype="eps">
                    <assert_contents>
                        <has_size value="46343" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 2: coding_plot with a matching GFF collection, PNG output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="coding_plot"/>
                <param name="gff">
                    <collection type="list">
                        <element name="637000110" ftype="gff" value="637000110.gff"/>
                    </collection>
                </param>
                <param name="dist_value" value="100" />
                <param name="cd_window_size" value="10000"/>
                <param name="cd_bin_width" value="0.01"/>
            </conditional>
            <param name="image_type" value="png"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="coding_plot" count="1">
                <element name="637000110" ftype="png">
                    <assert_contents>
                        <has_size value="224229" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 3: tetra_plot with GFF collection and tetranucleotide profile, PDF output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="tetra_plot"/>
                <param name="gff">
                    <collection type="list">
                        <element name="637000110" ftype="gff" value="637000110.gff"/>
                    </collection>
                </param>
                <param name="tetra_profile" ftype="tabular" value="tetra"/>
                <param name="dist_value" value="100" />
                <param name="td_window_size" value="5000"/>
                <param name="td_bin_width" value="0.01"/>
            </conditional>
            <param name="image_type" value="pdf"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="tetra_plot" count="1">
                <element name="637000110" ftype="pdf">
                    <assert_contents>
                        <has_size value="17443" delta="10"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 4: dist_plot (GC, CD and TD combined) with GFF collection and tetranucleotide profile, PNG output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="dist_plot"/>
                <param name="gff">
                    <collection type="list">
                        <element name="637000110" ftype="gff" value="637000110.gff"/>
                    </collection>
                </param>
                <param name="tetra_profile" ftype="tabular" value="tetra"/>
                <param name="dist_value" value="100" />
                <param name="gc_window_size" value="5000"/>
                <param name="gc_bin_width" value="0.01"/>
                <param name="cd_window_size" value="10000"/>
                <param name="cd_bin_width" value="0.01"/>
                <param name="td_window_size" value="5000"/>
                <param name="td_bin_width" value="0.01"/>
            </conditional>
            <param name="image_type" value="png"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="dist_plot" count="1">
                <element name="637000110" ftype="png">
                    <assert_contents>
                        <has_size value="375137" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 5: nx_plot, PS output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="nx_plot"/>
                <param name="step_size" value="0.05"/>
            </conditional>
            <param name="image_type" value="ps"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="nx_plot" count="1">
                <element name="637000110" ftype="ps">
                    <assert_contents>
                        <has_size value="18700" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 6: len_hist, SVG output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="len_hist"/>
                <param name="fig_padding" value="0.2"/>
            </conditional>
            <param name="image_type" value="svg"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="len_hist" count="1">
                <element name="637000110" ftype="svg">
                    <assert_contents>
                        <has_size value="11147" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test 7: marker_plot with a gene sequence collection and the two stats tables, PNG output -->
        <test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="marker_plot"/>
                <param name="genes_fna">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.faa"/>
                    </collection>
                </param>
                <param name="marker_gene_stats" ftype="tabular" value="marker_gene_stats.tsv"/>
                <param name="bin_stats_ext" ftype="tabular" value="bin_stats_ext.tsv"/>
                <param name="fig_padding" value="0.2"/>
            </conditional>
            <param name="image_type" value="png"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="marker_plot" count="1">
                <element name="637000110" ftype="png">
                    <assert_contents>
                        <has_size value="137320" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- The gc_bias_plot test below is disabled, like the gc_bias_plot option in the inputs -->
        <!--<test expect_num_outputs="1">
            <conditional name="bins">
                <param name="select" value="collection"/>
                <param name="bins_coll">
                    <collection type="list">
                        <element name="637000110" ftype="fasta" value="637000110.fna"/>
                    </collection>
                </param>
            </conditional>
            <conditional name="plot">
                <param name="command" value="gc_bias_plot"/>
                <param name="bam_file" ftype="bam" value="637000110.bam"/>
                <param name="window_size" value="5000"/>
                <param name="all_reads" value="false" />
                <param name="min_align" value="0.98"/>
                <param name="max_edit_dist" value="0.02"/>
            </conditional>
            <param name="image_type" value="png"/>
            <param name="dpi" value="600" />
            <param name="font_size" value="8"/>
            <param name="width" value="6.5"/>
            <param name="height" value="3.5"/>
            <output_collection name="gc_bias_plot" count="1">
                <element name="637000110" ftype="png">
                    <assert_contents>
                        <has_size value="10000" delta="100"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>-->
    </tests>
    <help><![CDATA[
@HELP_HEADER@

This command produces a number of plots for assessing the quality of genome bins. Here we describe each of these plots and provide an example.

- gc_plot: Provides a 3 pane plot suitable for assessing the GC distribution of sequences within a genome bin. The first pane is a histogram of the number of non-overlapping 5 kbp windows with a given percent GC. A typical genome will produce a unimodal distribution. The second pane plots each sequence in the genome bin as a function of its deviation from the average GC of the entire genome (x-axis) and sequence length (y-axis). The dashed red lines indicate the expected deviation from the mean GC as a function of length. This expected deviation is pre-calculated from a set of trusted reference genomes and the percentile plotted is provided as an argument to this command. A good default value to use for this distribution parameter is 95.
- coding_plot: Provides a plot analogous to the gc_plot suitable for assessing the coding density of sequences within a genome bin.
- tetra_plot: Provides a plot analogous to the gc_plot suitable for assessing the tetranucleotide signatures of sequences within a genome bin. The Manhattan distance is used to determine the difference between each sequence's tetranucleotide signature and the tetranucleotide signature of the entire genome bin. This plot requires a file indicating the tetranucleotide signature of all sequences within the genome bins. This file can be created with the tetra command.
- dist_plot: Produces a single figure combining the plots produced by gc_plot, coding_plot, and tetra_plot. This plot requires a file indicating the tetranucleotide signature of all sequences within the genome bins. This file can be created with the tetra command.
- nx_plot: Produces a plot indicating the Nx value of a genome bin for all values of x. This provides a more comprehensive view of the quality of an assembly than simply considering N50.
- len_hist: Produce a histogram of the number of sequences within a genome bin at different sequence length intervals. This provides additional information regarding the quality of an assembled genome.
- marker_plot: Plots the position of marker genes on sequences within a genome bin. This provides information regarding the extent to which marker genes are collocated. The number of marker genes within a fixed size window (2.8 kbps in this example) is indicated with different colours. Sequences without any marker genes are not shown.
- gc_bias_plot: Plots bin coverage as a function of GC. This plot type is currently disabled in this tool.
    ]]></help>
    <expand macro="citations"/>
</tool>