<!-- Mercurial > repos > iuc > gemini_annotate -->

<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy2">
    <description>the variants in an existing GEMINI database with additional information</description>
    <expand macro="bio_tools"/>
    <macros>
        <import>gemini_macros.xml</import>
        <!-- GEMINI subcommand wrapped by this tool; substituted into the tool id, the tool name and the command line -->
        <token name="@BINARY@">annotate</token>
        <!-- Reusable text parameter asking for the name of the database column that will hold the annotations -->
        <xml name="add_as">
            <param
                name="column_name"
                argument="-c"
                type="text"
                value=""
                label="Database column name to use for recording annotations"
                help="A column with the name provided here will be added to the variants table of the GEMINI database to store the annotations">
                <!-- A name is mandatory ... -->
                <validator type="empty_field"/>
                <!-- ... and, once underscores are removed, must consist of alphanumeric characters only -->
                <validator
                    type="expression"
                    message="Only alphanumeric characters and the underscore can be used in column names">value.replace('_', '').isalnum()</validator>
            </param>
        </xml>
    </macros>
    <!-- Shared building blocks; not defined in this file, so presumably supplied by the imported gemini_macros.xml -->
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <command>
<![CDATA[

    ## Stage the annotation source under a name GEMINI accepts:
    ## for GEMINI to work correctly, the tabixed file must have the form [name].[bed|vcf].gz
    #if $annotate_source.is_of_type('vcf_bgzip'):
        ## File is bgzipped and tabixed already -> just symlink data and index
        #set $tabixed_file = 'tabixed.vcf.gz'
        ln -s '$annotate_source' '$tabixed_file' &&
        ln -s '$annotate_source.metadata.tabix_index' '${tabixed_file}.tbi' &&
    #else:
        ## Reduce the datatype to its base format first. The input parameter
        ## also accepts subtypes of bed (e.g. bed6 or bed12), whose extensions
        ## are neither a valid tabix preset nor a file suffix of the required
        ## [name].[bed|vcf].gz form.
        #if $annotate_source.is_of_type('bed'):
            #set $source_format = 'bed'
        #else:
            #set $source_format = 'vcf'
        #end if
        ## bgzip and index vcf or bed
        #set $tabixed_file = "tabixed.%s.gz" % $source_format
        bgzip -c '$annotate_source' > '$tabixed_file' &&
        tabix -p '$source_format' '$tabixed_file' &&
    #end if
    ## gemini annotate modifies its input database in-place so, in Galaxy,
    ## we need to create a copy first!
    cp '$infile' '$outfile' &&

    gemini @BINARY@
        -f '$tabixed_file'
        -a $a.a_selector
        #if $a.a_selector == 'extract':
            ## Gather the settings of all extraction recipes; each option is
            ## passed once as a comma-separated list with one item per recipe.
            #set $types = []
            #set $elements = []
            #set $column_names = []
            #set $operations = []
            #for $action in $a.actions:
                ## silent keeps the return value of append out of the command line
                #silent $types.append(str($action.column_type))
                #silent $elements.append(str($action.element_to_extract).strip())
                #silent $column_names.append(str($action.column_name).strip())
                #silent $operations.append(str($action.operation))
            #end for

            -t #echo ",".join($types)
            -e '#echo ",".join($elements)#'
            -o #echo ",".join($operations)
            -c '#echo ",".join($column_names)#'
        #else:
            -c '${a.column_name}'
        #end if
        ## The matching mode can only be chosen for VCF sources; for BED
        ## sources (including bed subtypes) the user setting is ignored.
        #if not $annotate_source.is_of_type('bed'):
            $region_only
        #end if
        '$outfile'
]]>
    </command>
    <inputs>
        <!-- Database to annotate; the parameter definition comes from the "infile" macro -->
        <expand macro="infile" />
        <!-- Annotation source handed to GEMINI via -f -->
        <param
            name="annotate_source"
            argument="-f"
            type="data"
            format="vcf,vcf_bgzip,bed"
            label="Dataset to use as the annotation source"
            help="The tool can use the information from a BED or VCF dataset to annotate the database variants."/>
        <!-- Inverted switch: the checked state contributes nothing to the command line, the unchecked state contributes the flag -->
        <param
            name="region_only"
            argument="--region-only"
            type="boolean"
            checked="true"
            truevalue=""
            falsevalue="--region-only"
            label="Strict variant-identity matching of database and annotation records (VCF format only)"
            help="The default is to consider VCF-formatted annotations only if a variant in the GEMINI database and a record in the annotation source describe the exact same nucleotide change at the same position in the genome. You can disable this option to make use of any annotation that overlaps with the position of a database variant. This setting is ignored for annotation sources in BED format, for which matching is always based on overlapping positions only." />
        <!-- Annotation mode; decides which of the following parameter sets is shown -->
        <conditional name="a">
            <param
                name="a_selector"
                argument="-a"
                type="select"
                label="Type of information to add to the database variants"
                help="">
                <option value="boolean">Binary indicator (1=found, 0=not found) of whether the variant had any match in the annotation source (boolean)</option>
                <option value="count">Count of the number of matches found in the annotation source for the database variant (count)</option>
                <option value="extract" selected="True">Specific values extracted from matching records in the annotation source (extract)</option>
            </param>
            <when value="extract">
                <!-- One recipe per new column; at least one recipe is required -->
                <repeat name="actions" title="Annotation extraction recipe" default="1" min="1">
                    <param
                        name="element_to_extract"
                        argument="-e"
                        type="text"
                        value=""
                        label="Elements to extract from the annotation source"
                        help="For an annotation source in BED format, specify the number of the column from which the annotations should be read. For a VCF source, name an INFO field element.">
                        <!-- Rejects values that are empty after stripping whitespace -->
                        <validator type="expression" message="This field cannot be empty">value.strip()</validator>
                    </param>
                    <expand macro="add_as" />
                    <param
                        name="column_type"
                        argument="-t"
                        type="select"
                        display="radio"
                        label="What type of data are you trying to extract?"
                        help="Your selection will determine the data type used to store the new annotations in the database.">
                        <option value="float">Numbers with decimal precision</option>
                        <option value="integer">Integer numbers</option>
                        <option value="text">Text (text)</option>
                    </param>
                    <param
                        name="operation"
                        argument="-o"
                        type="select"
                        label="If multiple annotations are found for the same variant, store ..."
                        help="Note: If indicated (in parentheses) an option is only applicable to annotations of a specific type.">
                        <option value="first">the first annotation found</option>
                        <option value="last">the last annotation found</option>
                        <option value="list">a comma-separated list of the (text) values</option>
                        <option value="uniq_list">a comma-separated list of non-redundant (text) values</option>
                        <option value="min">the smallest of the (numeric) values</option>
                        <option value="max">the largest of the (numeric) values</option>
                        <option value="mode">the most frequent of the (numeric) values</option>
                        <option value="mean">the mean of the (numeric) values</option>
                        <option value="median">the median of the (numeric) values</option>
                        <option value="sum">the sum of the (numeric) values</option>
                    </param>
                </repeat>
            </when>
            <!-- The boolean and count modes only need the name of the column to create -->
            <when value="boolean">
                <expand macro="add_as" />
            </when>
            <when value="count">
                <expand macro="add_as" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Annotated database: the command copies the input database to this file and runs GEMINI on the copy -->
        <data format="gemini.sqlite" name="outfile"/>
    </outputs>
    <tests>
        <test>
            <!-- test with annotation source in bed format -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <param name="annotate_source" value="anno.bed" ftype="bed" />
            <param name="region_only" value="false" />
            <conditional name="a">
                <param name="a_selector" value="count" />
                <param name="column_name" value="anno5" />
            </conditional>
            <assert_command>
                <!-- confirm region_only gets ignored for bed input -->
                <not_has_text text="--region-only" />
            </assert_command>
            <output name="outfile" file="gemini_annotate_result.db" ftype="gemini.sqlite" compare="sim_size" delta="1000"/>
        </test>
        <test>
            <!-- test with annotation source in vcf format -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <param name="annotate_source" value="gemini_amend.vcf" ftype="vcf" />
            <param name="region_only" value="false" />
            <conditional name="a">
                <param name="a_selector" value="count" />
                <param name="column_name" value="anno5" />
            </conditional>
            <output name="outfile" file="gemini_annotate_result.db" ftype="gemini.sqlite" compare="sim_size" delta="1000"/>
        </test>
        <test>
            <!-- test with annotation source in vcf_bgzip format -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <param name="annotate_source" value="gemini_amend.vcf.gz" ftype="vcf_bgzip" />
            <param name="region_only" value="false" />
            <conditional name="a">
                <param name="a_selector" value="count" />
                <param name="column_name" value="anno5" />
            </conditional>
            <output name="outfile" file="gemini_annotate_result.db" ftype="gemini.sqlite" compare="sim_size" delta="1000"/>
        </test>
        <test>
            <!-- test command line generation for a single extraction recipe -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <!-- datatype pinned like in the tests above instead of relying on sniffing -->
            <param name="annotate_source" value="anno.bed" ftype="bed" />
            <conditional name="a">
                <param name="a_selector" value="extract" />
                <repeat name="actions">
                    <param name="element_to_extract" value="1" />
                    <param name="column_name" value="anno5" />
                    <param name="column_type" value="text" />
                    <param name="operation" value="first" />
                </repeat>
            </conditional>
            <assert_command>
                <has_text_matching expression="-a +extract" />
                <has_text_matching expression="-e +&apos;1&apos;" />
                <has_text_matching expression="-c +&apos;anno5&apos;" />
                <has_text_matching expression="-t +text" />
                <has_text_matching expression="-o +first" />
            </assert_command>
        </test>
        <test>
            <!-- test command line generation for two extraction recipes: values must be joined with commas in recipe order -->
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <!-- datatype pinned like in the tests above instead of relying on sniffing -->
            <param name="annotate_source" value="anno.bed" ftype="bed" />
            <conditional name="a">
                <param name="a_selector" value="extract" />
                <repeat name="actions">
                    <param name="element_to_extract" value="1" />
                    <param name="column_name" value="anno5" />
                    <param name="column_type" value="text" />
                    <param name="operation" value="first" />
                </repeat>
                <repeat name="actions">
                    <param name="element_to_extract" value="2" />
                    <param name="column_name" value="anno6" />
                    <param name="column_type" value="integer" />
                    <param name="operation" value="first" />
                </repeat>
            </conditional>
            <assert_command>
                <has_text_matching expression="-a +extract" />
                <has_text_matching expression="-e +&apos;1,2&apos;" />
                <has_text_matching expression="-c +&apos;anno5,anno6&apos;" />
                <has_text_matching expression="-t +text,integer" />
                <has_text_matching expression="-o +first,first" />
            </assert_command>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Given an existing GEMINI database and an annotation source in BED or VCF
format, the annotate tool will, for each variant in the variants table of the
database, screen for overlapping regions defined in the annotation source and
update one or more new columns of the variant record in the database based on
the result and the annotation found.
    ]]></help>
    <expand macro="citations"/>
</tool>
<!--
author	iuc
date	Wed, 13 Jul 2022 15:23:12 +0000
parents	567837ca5f33
children
-->