Mercurial > repos > iuc > bcftools_stats

<tool name="bcftools @EXECUTABLE@" id="bcftools_@EXECUTABLE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats</description>
    <!-- @EXECUTABLE@ is defined locally; it is substituted into the tool name, the tool id,
         the command line and the help text. The remaining @...@ tokens and the expanded
         macros are not defined in this file and presumably come from macros.xml. -->
    <macros>
        <token name="@EXECUTABLE@">stats</token>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools" />
    <!-- Packages added on top of whatever the shared "requirements" macro already declares. -->
    <expand macro="requirements">
        <expand macro="samtools_requirement"/>
        <expand macro="matplotlib_requirement" />
        <!-- NOTE(review): tectonic is presumably needed by plot-vcfstats to build the PDF; confirm. -->
        <requirement type="package" version="0.12.0">tectonic</requirement>
    </expand>
    <expand macro="version_command" />
    <command detect_errors="aggressive"><![CDATA[
@PREPARE_ENV@
## Collect the primary input and, when supplied, the optional comparison input.
#set $input_files = [$input_file]
#if $inputB_file:
    #silent $input_files.append($inputB_file)
#end if
@PREPARE_INPUT_FILES@
## Restrict section (preparation)
#set $section = $sec_restrict
@PREPARE_TARGETS_FILE@
@PREPARE_REGIONS_FILE@
## Stats section (preparation)
#set $section = $sec_default.reference_source
@PREPARE_FASTA_REF@
#set $section = $sec_default
@PREPARE_EXONS_FILE@

bcftools @EXECUTABLE@

## Stats section
#set $section = $sec_default.reference_source
@FASTA_REF@
#set $section = $sec_default
@EXONS_FILE@
${section.first_allele_only}
#if $section.depth.set_depth == 'yes':
    --depth ${section.depth.depth_min},${section.depth.depth_max},${section.depth.depth_bin_size}
#end if
#if $section.user_tstv:
    --user-tstv '${section.user_tstv}'
#end if
#if $section.afbins.afbins_select == 'af_bins_list':
    --af-bins '$section.afbins.af_bins_list'
#elif $section.afbins.afbins_select == 'af_bins_file':
    --af-bins '$section.afbins.af_bins_file'
#end if
#if $section.af_tag:
    --af-tag '${section.af_tag}'
#end if
## The split-by-ID flag is only emitted when exactly one input file is given.
#if len($input_vcfs) == 1:
    ${section.split_by_ID}
#end if
${section.verbose}

## Restrict section
#set $section = $sec_restrict
@APPLY_FILTERS@
@COLLAPSE@
@REGIONS@
@SAMPLES@
@TARGETS@
@INCLUDE@
@EXCLUDE@

## Primary Input/Outputs
@INPUT_FILES@
> '$output_file'
## Plotting is optional and only runs when a plot title was entered.
## plot-vcfstats is grouped with its failure handler so that the log dump runs only
## when plotting itself fails, and not when the preceding bcftools call fails.
#if $plot_title:
    && (
        plot-vcfstats
        -p 'plot_tmp/'
        -T '$plot_title'
        -s
        '$output_file'
        || (printf "The content of plot_tmp/plot-vcfstats.log is:\n" >&2 && cat plot_tmp/plot-vcfstats.log >&2 && exit 1)
    )
#end if
    ]]></command>
    <inputs>
        <expand macro="macro_input" />
        <!-- Optional second dataset; when set, the command template appends it to the list of input files. -->
        <param name="inputB_file" type="data" format="vcf,vcf_bgzip,bcf" optional="true" label="Optional VCF/BCF Data to compare against" help="When this second dataset is also specified, separate stats for intersection and the complements are generated" />
        <!-- Restriction options; every control in this section is supplied by a shared macro. -->
        <section name="sec_restrict" expanded="false" title="Restrict to">
            <expand macro="macro_samples" />
            <expand macro="macro_apply_filters" />
            <expand macro="macro_collapse" />
            <expand macro="macro_restrict" />
            <expand macro="macro_restrict" type="target" label_type="Target" />
            <expand macro="macro_include" />
            <expand macro="macro_exclude" />
        </section>
        <!-- Options specific to the stats sub-command. -->
        <section name="sec_default" expanded="true" title="Stats options">
            <param name="first_allele_only" type="boolean" truevalue="--1st-allele-only" falsevalue="" label="First allele only"
                   help="Include only first allele at multiallelic sites" />
            <!-- When "yes" is chosen, the three values are joined as min,max,bin size for the depth option. -->
            <conditional name="depth">
                <param name="set_depth" type="select" label="Depth distribution">
                    <option value="no">Use depth defaults</option>
                    <option value="yes">Set depth values</option>
                </param>
                <when value="no"/>
                <when value="yes">
                    <param name="depth_min" type="integer" value="0" min="0" label="Depth min"/>
                    <param name="depth_max" type="integer" value="500" min="1" label="Depth max" />
                    <param name="depth_bin_size" type="integer" value="1" min="1" label="Depth bin size" />
                </when>
            </conditional>
            <!-- Reference genome: none, a built-in one filtered by the dbkey of the input, or a history FASTA. -->
            <conditional name="reference_source">
                <param name="reference_source_selector" type="select" label="Choose a reference genome">
                    <option value="">Run without a reference genome</option>
                    <option value="cached">Use a built-in genome</option>
                    <option value="history">Use a genome from the history</option>
                </param>
                <when value="" />
                <when value="cached">
                    <param name="fasta_ref" type="select" label="Reference genome">
                        <options from_data_table="fasta_indexes">
                            <filter type="data_meta" column="dbkey" key="dbkey" ref="input_file" />
                            <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file" />
                        </options>
                    </param>
                </when>
                <when value="history">
                    <param name="fasta_ref" type="data" format="fasta" label="Reference genome" />
                </when>
            </conditional>
            <expand macro="macro_exons_file" />
            <param name="split_by_ID" type="boolean" truevalue="--split-by-ID" falsevalue="" label="Split by ID (ignored on multiple inputs)"
                   help="Collect stats for sites with ID separately (known vs novel)" />
            <!-- TAG is one or more characters other than whitespace, ':' or ','. It may be followed
                 by three colon-separated integers. The '+' quantifier is required: without it only
                 one-character tags pass validation. -->
            <param name="user_tstv" type="text" value="" optional="true" label="User Tstv"
                   help="Collect Ts/Tv stats for any tag using the given binning: TAG[:min:max:binsize]">
                <validator type="regex" message="TAG optionally followed by :min:max:binsize">^([^ \t\n\r\f\v:,]+(:\d+:\d+:\d+)?)?$</validator>
            </param>
            <!-- Allele frequency bins: tool default, a typed comma-separated list, or a tabular dataset. -->
            <conditional name="afbins">
                <param name="afbins_select" type="select" label="Set af-bins">
                    <option value="default">Use default</option>
                    <option value="af_bins_list">Enter bins</option>
                    <option value="af_bins_file">Read bins from file</option>
                </param>
                <when value="default"/>
                <when value="af_bins_list">
                    <param name="af_bins_list" type="text" value="0.1,0.5,1" label="List of allele frequency bins" help="e.g. 0.1,0.5,1">
                        <validator type="regex" message="Comma-separated list of floats of increasing value">^[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?(,[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)*$</validator>
                    </param>
                </when>
                <when value="af_bins_file">
                    <param name="af_bins_file" type="data" format="tabular" label="File listing the allele frequency bins one per line"/>
                </when>
            </conditional>
            <param name="af_tag" type="text" value="" optional="true" label="Allele frequency tag to use, by default estimated from AN,AC or GT">
                <validator type="regex" message="TAG">^\w*$</validator>
            </param>
            <param name="verbose" type="boolean" truevalue="--verbose" falsevalue="" label="Verbose"
                   help="Produce verbose per-site and per-sample output" />
        </section>
        <!-- A non-empty title makes the command template run plot-vcfstats and keeps the PDF output.
             The regex needs at least two characters, the first and last being word characters. -->
        <param name="plot_title" type="text" value="" optional="true" label="Create a plots pdf with this title">
            <validator type="regex" message="The title must be at least two characters long and must start and end with a letter, digit or underscore">^\w.*\w$</validator>
        </param>
    </inputs>
    <outputs>
        <!-- Text report: the command template redirects the standard output of bcftools into this dataset. -->
        <data name="output_file"
              format="txt"
              label="${tool.name} on ${on_string}: txt"/>
        <!-- PDF collected from plot_tmp/summary.pdf; the filter keeps it only when plot_title is set. -->
        <data name="output_pdf"
              format="pdf"
              label="${tool.name} on ${on_string}: PDF"
              from_work_dir="plot_tmp/summary.pdf">
            <filter>plot_title</filter>
        </data>
    </outputs>
    <tests>
        <!-- Two input files: the report must contain "number of samples" SN rows for both id 0 and id 1. -->
        <test expect_num_outputs="1">
            <param name="input_file" ftype="vcf" value="stats.b.vcf" />
            <param name="inputB_file" ftype="vcf" value="stats.a.vcf" />
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="SN\t0\tnumber of samples:\t3"/>
                    <has_text_matching expression="SN\t1\tnumber of samples:\t3"/>
                </assert_contents>
            </output>
        </test>
        <!-- Single input with a cached reference genome selected through the test_using_reference macro. -->
        <test expect_num_outputs="1">
            <param name="input_file" ftype="vcf" value="mpileup.vcf" />
            <section name="sec_default">
                <expand macro="test_using_reference" select_from="cached" ref="mpileup" />
            </section>
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="SN\t0\tnumber of samples:\t3"/>
                    <has_text_matching expression="SN\t0\tnumber of records:\t4103"/>
                    <has_text_matching expression="ST\t0\tA>C\t16"/>
                </assert_contents>
            </output>
        </test>
        <!-- Setting plot_title yields a second output: the PDF, compared by size within a delta of 25000. -->
        <test expect_num_outputs="2">
            <param name="input_file" ftype="vcf" value="mpileup.vcf" />
            <param name="plot_title" value="Plot for mpileup.vcf" />
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="SN\t0\tnumber of samples:\t3"/>
                    <has_text_matching expression="SN\t0\tnumber of records:\t4103"/>
                    <has_text_matching expression="ST\t0\tA>C\t16"/>
                </assert_contents>
            </output>
            <output name="output_pdf" file="summary.pdf" compare="sim_size" delta="25000" />
        </test>
        <!-- Test region overlap option-->
        <test expect_num_outputs="1">
            <param name="input_file" ftype="vcf" value="stats.b.vcf" />
            <param name="inputB_file" ftype="vcf" value="stats.a.vcf" />
            <section name="sec_restrict">
                <param name="regions_overlap" value="1"/>
            </section>
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="SN\t0\tnumber of samples:\t3"/>
                    <has_text_matching expression="SN\t1\tnumber of samples:\t3"/>
                </assert_contents>
            </output>
            <assert_command>
                <has_text text="--regions-overlap" />
            </assert_command>
        </test>
        <!-- Test VCF.gz input file -> REQUIRES https://github.com/galaxyproject/galaxy/pull/14605
        <test expect_num_outputs="1">
            <param name="input_file"  value="mpileup.vcf.gz" />
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="bcftools stats  input0.vcf.gz"/>
                    <has_text_matching expression="SN\t0\tnumber of samples:\t3"/>
                    <has_text_matching expression="SN\t0\tnumber of records:\t4103"/>
                </assert_contents>
            </output>
        </test>
        -->
        <!-- Test modification in samples option -->
        <test expect_num_outputs="1">
            <param name="input_file" ftype="vcf" value="mpileup.vcf" />
            <section name="sec_restrict">
                <param name="samples" value="-" />
            </section>
            <output name="output_file">
                <assert_contents>
                    <has_text_matching expression="bcftools stats  --samples - "/>
                    <has_text_matching expression="PSC\t0\tHG00101\t0\t0\t0\t0\t0\t0\t4.8"/>
                    <has_text_matching expression="PSI\t0\tHG00102\t0"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
=====================================
 bcftools @EXECUTABLE@
=====================================

Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.

When two files are given, the program generates separate stats for intersection and the complements.
By default only sites are compared; -s/-S must be given to also include sample columns.

When one VCF file is specified, then stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc. are printed. When two VCF files are given, then stats such as concordance (Genotype concordance by non-reference allele frequency, Genotype concordance by sample, Non-Reference Discordance) and correlation are also printed. Per-site discordance (PSD) is also printed in --verbose mode.

@COLLAPSE_HELP@
@REGIONS_HELP@
@TARGETS_HELP@
@EXPRESSIONS_HELP@

@BCFTOOLS_MANPAGE@#@EXECUTABLE@

@BCFTOOLS_WIKI@
]]>
</help>
    <!-- Citation entries are supplied by the "citations" macro rather than being listed here. -->
    <expand macro="citations" />
</tool>
author	iuc
date	Sun, 18 Aug 2024 10:16:43 +0000
parents	bf8325a07ce7
children