view gemini_stats.xml @ 7:5c9efee9cb1e draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
author iuc
date Tue, 10 Mar 2020 06:17:06 -0400
parents 86d4303cc3ca
children b92cfa44f5be
line wrap: on
line source

<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
    <description>Compute useful variant statistics</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">stats</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <!-- Command template (Cheetah) that assembles the "gemini stats" call.
         For the "gts-stats" type it prepares two SQL-like expressions in turn
         (the genotype filter, then the summarize query) and hands each one to
         the MULTILN_SQL_EXPR_TO_CMDLN macro token, which is defined in
         gemini_macros.xml and not visible in this file; presumably it reads
         the two variables set just before it and emits the option followed by
         the expression (TODO confirm against the macro file).
         For every other type the value of stats_option is inserted verbatim.
         The input database is passed last and stdout is redirected to the
         output dataset. -->
    <command>
<![CDATA[
        gemini @BINARY@
            ## Genotype counts mode: optional genotype filter first, then the summarize query.
            #if str($stats.type) == "gts-stats":
                #set $multiline_sql_expr = $stats.variants.gt_filter
                #set $cmdln_param = "--gt-filter"
                @MULTILN_SQL_EXPR_TO_CMDLN@

                ## A non-blank constraint becomes the WHERE clause of the query; otherwise all variants are selected.
                #if str($stats.variants.constraint).strip():
                    #set $multiline_sql_expr = "select * from variants WHERE " + str($stats.variants.constraint)
                #else:
                    #set $multiline_sql_expr = "select * from variants"
                #end if
                #set $cmdln_param = "--summarize"
                @MULTILN_SQL_EXPR_TO_CMDLN@
            #else:
                ## All other modes: stats_option already holds the complete command line flag.
                ${stats.stats_option}
            #end if
            '$infile'
            > '$outfile'
]]>
    </command>
    <inputs>
        <expand macro="infile" />

        <!-- Selects the statistic to compute. Each branch defines a
             stats_option parameter; for every type except gts-stats the
             command template inserts its value verbatim as the command line
             flag. -->
        <conditional name="stats">
            <param name="type" type="select"
            label="Select the type of statistics you are interested in" help="">
                <option value="gts-stats">Genotype counts tabulated by sample (--summarize)</option>
                <option value="snp-counts">Counts of SNPs by nucleotide change (--snp-counts)</option>
                <option value="tstv-stats">Transition / transversion statistics for the SNPs in the dataset</option>
                <option value="aaf">Alternate allele frequency spectrum of all variants (--sfs)</option>
                <option value="sample-distance">Pair-wise genetic distances between all samples (--mds)</option>
            </param>
            <when value="snp-counts">
                <param name="stats_option" type="hidden" value="--snp-counts" />
            </when>
            <when value="aaf">
                <param name="stats_option" type="hidden" value="--sfs" />
            </when>
            <when value="sample-distance">
                <param name="stats_option" type="hidden" value="--mds" />
            </when>
            <when value="tstv-stats">
                <param name="stats_option" type="select"
                label="Calculate Ts/Tv statistics based on"
                help="Restricting the calculation to coding/noncoding regions will only produce meaningful results with preannotated variants. If you haven't annotated your variants with SnpEff or VEP before loading them into GEMINI, select All SNPs.">
                    <option value="--tstv">All SNPs (--tstv)</option>
                    <option value="--tstv-coding">SNPs in coding regions (--tstv-coding)</option>
                    <option value="--tstv-noncoding">SNPs in non-coding regions (--tstv-noncoding)</option>
                </param>
            </when>
            <when value="gts-stats">
                <!-- Empty placeholder: the command template does not read
                     stats_option in this mode, but defining it keeps all
                     branches of the conditional uniform. -->
                <param name="stats_option" type="hidden" value="" />
                <conditional name="variants">
                    <param name="keep" type="select"
                    label="Compute the genotype counts table based on"
                    help="If you select All variants the genotype counts will be produced using --summarize with the wildcard query &quot;select * from variants&quot;.">
                        <option value="all">All variants</option>
                        <option value="custom">Custom filtered variants</option>
                    </param>
                    <when value="all">
                        <!-- Empty placeholders so the command template can
                             read gt_filter and constraint in both branches. -->
                        <param name="gt_filter" type="hidden" value="" />
                        <param name="constraint" type="hidden" value="" />
                    </when>
                    <when value="custom">
                        <param argument="--gt-filter" name="gt_filter" type="text" area="True" size="5x50"
                        label="Restrictions to apply to genotype values"
                        help="Enter a GEMINI genotype filter expression here, for example: gt_types.sample1 == HET (replace sample1 with the name of a sample in your database).">
                            <expand macro="sanitize_query" />
                        </param>
                        <param name="constraint" type="text" area="True" size="5x50"
                        label="Additional constraints on the variants"
                        help="Enter valid constraints for the WHERE clause of a GEMINI query here. You could use, for example: chrom = 'chr1' or impact_severity = 'HIGH', to include only high-impact variants on chromosome 1 in the counts table.">
                            <expand macro="sanitize_query" />
                        </param>
                    </when>
                </conditional>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Tabular dataset holding the standard output of "gemini stats";
             the command template redirects stdout into it. -->
        <data name="outfile" format="tabular" />
    </outputs>
    <tests>
        <test>
            <!-- test ts/tv statistics restricted to coding regions -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <conditional name="stats">
                <param name="type" value="tstv-stats" />
                <param name="stats_option" value="--tstv-coding" />
            </conditional>
            <output name="outfile">
                <assert_contents>
                    <!-- since the input file is not annotated
                    no variants will be considered to be in coding regions -->
                    <has_line line="ts&#009;tv&#009;ts/tv" />
                    <has_line line="0&#009;0&#009;0" />
                </assert_contents>
            </output>
        </test>
        <test>
            <!-- test gts-by-sample report computed over all variants;
                 only the presence of the table header is checked -->
            <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" />
            <conditional name="stats">
                <param name="type" value="gts-stats" />
                <conditional name="variants">
                    <param name="keep" value="all" />
                </conditional>
            </conditional>
            <output name="outfile">
                <assert_contents>
                    <has_line_matching expression="sample&#009;total&#009;num_het&#009;num_hom_alt&#009;num_hom_ref" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

The stats tool computes one of the following useful variant statistics for a GEMINI database:

**Genotype counts tabulated by sample**:

This mode uses the ``gemini stats --summarize`` option to produce a table with
one row per sample, which tabulates the numbers of sites, for which a given
sample shows a:

- non-reference genotype (*total* column; the sum of the *num_het* and *num_hom_alt* columns next to it)
- heterozygous genotype (*num_het* column)
- homozygous variant genotype (*num_hom_alt* column)
- homozygous reference genotype (*num_hom_ref* column)

You can choose to calculate the table based on all variants in your database,
or to filter the variants before the calculation using GEMINI genotype filter
expressions and/or WHERE clauses of GEMINI queries.

**Counts of SNPs by nucleotide change**:

This runs ``gemini stats`` with the ``--snp-counts`` option. The result is a
simple table listing the number of occurrences of each observed REF->ALT change
in your database, e.g.::

 type    count
 A->G    2
 C->T    1
 G->A    1

**Transition / transversion statistics**

This mode uses ``gemini stats`` with the ``--tstv``, ``--tstv-coding``, or
``--tstv-noncoding`` option to compute the transition/transversion ratios for
all SNPs, for SNPs in coding, or SNPs in non-coding regions, respectively.

The result is presented in a 1x3 table listing the number of
transitions (*ts* column), transversions (*tv* column) and the ratio of the two
(*ts/tv* column), e.g.::

 ts    tv    ts/tv
 126   39    3.2307

**Alternate allele frequency spectrum**

Runs ``gemini stats --sfs`` to produce binned alternate allele frequency counts
in a table like::

 aaf     count
 0.125   2
 0.375   1

**Pairwise genetic distances**

Runs ``gemini stats --mds`` and tabulates all pairwise genetic distances for the
samples in your database. An example could look like this::

 sample1  sample2  distance
 M10500   M10500   0.0
 M10475   M10478   1.25
 M10500   M10475   2.0
 M10500   M10478   0.5714

    ]]></help>
    <expand macro="citations"/>
</tool>