view gemini_query.xml @ 7:da74170c55c7 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
author iuc
date Tue, 10 Mar 2020 06:14:55 -0400
parents cd00221d67cb
children 77a1e60fd1de
line wrap: on
line source

<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy1">
    <description>Querying the GEMINI database</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">query</token>

        <!-- Sorting controls: a free-text list of columns to order by plus a direction
             selector. The option values of sort_order start with a space on purpose,
             because the command template appends the value directly after the column
             list when it builds the ORDER BY clause. -->
        <xml name="sorting">
            <param name="order_by"
                   type="text"
                   label="Sort the output by the following column(s)"
                   help="" />
            <param name="sort_order"
                   type="select"
                   label="Sort order">
                <option value=" ASC">Ascending</option>
                <option value=" DESC">Descending</option>
            </param>
        </xml>
        <!-- Optional phenotype column used by the carrier summary report. -->
        <xml name="pheno_strat">
            <param name="phenotype"
                   type="text"
                   label="Phenotype to stratify samples across"
                   help="Leave blank to stratify across the default phenotype column" />
        </xml>
        <!-- Delimiter used in the list columns of the sample and family reports. The
             applied_to token (default: samples) is substituted into the label. -->
        <xml name="sample_delimiter" token_applied_to="samples">
            <param name="sample_delim"
                   argument="--sample-delim"
                   type="text"
                   value=","
                   label="Delimiter to use in the list of affected @APPLIED_TO@"
                   help="" />
        </xml>
        <!-- Checkbox that adds the DGIdb lookup flag to the command line when ticked. -->
        <xml name="dgidb_query">
            <param name="dgidb"
                   argument="--dgidb"
                   type="boolean"
                   truevalue="--dgidb"
                   falsevalue=""
                   checked="False"
                   label="Request drug-gene interaction info from DGIdb"
                   help="" />
        </xml>
        <!-- Sample names needed for MAF output; both values are handed to the
             gemini_mafify.py post-processing step by the command template. -->
        <xml name="maf_extra_info">
            <param name="tumor_sample_name"
                   type="text"
                   label="Name of the tumor sample in the (multi-sample) GEMINI database"
                   help="Specify only if the tumor sample is not the only sample in the database." />
            <param name="normal_sample_name"
                   type="text"
                   label="Name of the normal sample in the GEMINI database (for matched tumor/normal sample pair analyses only)" />
        </xml>
    </macros>
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <!-- Cheetah template that assembles the gemini query command line. In basic mode the
         SQL statement is composed from the individual form fields; in advanced mode the
         SQL text entered by the user is used instead. For MAF output the raw result is
         written to a temporary file and post-processed by gemini_mafify.py. -->
    <command>
<![CDATA[
        gemini @BINARY@
            ${query.oformat.report.header}
            ${query.oformat.report.dgidb}

            ## Genotype filters: one expansion of the shared SQL-expression macro
            ## per filter (macro body is defined outside this file).
            #for $i in $query.filter_by_genotype:
                #set $multiline_sql_expr = str($i.gt_filter)
                #set $cmdln_param = "--gt-filter"
                @MULTILN_SQL_EXPR_TO_CMDLN@
            #end for

            ## Sample filters, each with its own family-wise / min-kindreds / in settings.
            #for $i in $query.filter_by_sample:
                $i.family_wise
                #if int($i.min_kindreds) > 0:
                    --min-kindreds ${i.min_kindreds}
                #end if
                ${i.in}
                #set $multiline_sql_expr = str($i.sample_filter)
                #set $cmdln_param = "--sample-filter"
                @MULTILN_SQL_EXPR_TO_CMDLN@
            #end for

            ## Translate the selected report type into gemini output options.
            ## MAF has no gemini format of its own; it is produced by post-processing below.
            #if str($query.oformat.report.format) == 'with_samples':
                #set $sample_delim = str($query.oformat.report.sample_delim) or ','
                --show-samples --sample-delim '$sample_delim'
            #elif str($query.oformat.report.format) == 'with_samples_flattened':
                --show-samples --format sampledetail
            #elif str($query.oformat.report.format) == 'with_families':
                #set $sample_delim = str($query.oformat.report.sample_delim) or ','
                --show-families --sample-delim '$sample_delim'
            #elif str($query.oformat.report.format) == 'carrier_summary':
                --carrier-summary-by-phenotype
                #if str($query.oformat.report.phenotype).strip():
                    '${query.oformat.report.phenotype}'
                #else:
                    affected
                #end if
            #elif str($query.oformat.report.format) != 'maf':
                --format ${query.oformat.report.format}
            #end if

            #if str($query.interface) == 'basic':
                ## build the SQL query string from its components
                #if str($query.oformat.report.format) in ('vcf', 'tped'):
                    #set $cols = "*"
                #elif str($query.oformat.report.format) == 'maf':
                    #if str($query.oformat.report.tumor_sample_name):
                        #set $gt_string = 'gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}'.format(str($query.oformat.report.tumor_sample_name))
                        #if str($query.oformat.report.normal_sample_name):
                            #set $gt_string = $gt_string + ', gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}'.format(str($query.oformat.report.normal_sample_name))
                        #end if
                    #else:
                        #set $gt_string = '(gt_alt_depths).(*), (gt_ref_depths).(*), (gts).(*)'
                    #end if
                    #if str($query.oformat.report.mutation_status.status_select) == 'custom':
                        ## Need to quote the user-specified mutation status for the SQL query
                        #set $mutation_status = '"%s"' % str($query.oformat.report.mutation_status.status_custom)
                    #elif str($query.oformat.report.mutation_status.status_select) == 'expression':
                        ## For custom expressions, it is up to the user to ensure valid syntax
                        #set $mutation_status = str($query.oformat.report.mutation_status.status_expression)
                    #else:
                        ## The user selected a fixed value from the list, but
                        ## it still needs quoting.
                        #set $mutation_status = '"%s"' % str($query.oformat.report.mutation_status.status_select)
                    #end if
                    #set $cols = 'ifnull(g1.gene, "unknown") AS Hugo_Symbol, ifnull(ifnull(g2.entrez_id, g1.entrez_id), "") AS Entrez_Gene_Id, "" AS Center, "37" AS NCBI_Build, replace(v.chrom, "chr", "") AS Chromosome, v.start + 1 AS Start_Position, v.end AS End_Position, "" as Strand, v.impact_so AS Variant_Classification, ifnull(nullif(v.type, "indel"), v.sub_type) AS Variant_Type, v.ref AS Reference_Allele, "${tumor_seq_allele1}" AS Tumor_Seq_Allele1, "${tumor_seq_allele2}" AS Tumor_Seq_Allele2, ifnull(v.rs_ids, ifnull(nullif(ifnull(nullif(v.in_omim = 0 AND v.cosmic_ids IS NULL AND v.max_aaf_all = -1, 1), "novel"), 0), "")) AS dbSNP_RS, "" AS dbSNP_Val_Status, printf("%s", "' + str($query.oformat.report.tumor_sample_id) + '") AS Tumor_Sample_Barcode, printf("%s", "' + str($query.oformat.report.norm_sample_id) + '") AS Matched_Norm_Sample_Barcode, "${match_norm_seq_allele1}" AS Match_Norm_Seq_Allele1, "${match_norm_seq_allele2}" AS Match_Norm_Seq_Allele2, "" AS Tumor_Validation_Allele1, "" AS Tumor_Validation_Allele2, "" AS Match_Norm_Validation_Allele1, "" AS Match_Norm_Validation_Allele2, "" AS Verification_Status, "" AS Validation_Status, ' + $mutation_status + ' AS Mutation_Status, "" AS Sequencing_Phase, "" AS Sequence_Source, "" AS Validation_Method, "" AS Score, "" AS BAM_File, "" AS Sequencer, ifnull(nullif(v.aa_change, ""), "p.=") AS HGVSp_Short, "${t_alt_count}" AS t_alt_count, "${t_ref_count}" AS t_ref_count, "${n_alt_count}" AS n_alt_count, "${n_ref_count}" AS n_ref_count, v.alt, ' + $gt_string
                #else:
                    #set $report = $query.oformat.report.report
                    @SET_COLS@
                #end if
                #set $q = "SELECT %s FROM variants" % $cols
                #if str($query.oformat.report.format) == 'maf':
                    #set $q = $q + ' v LEFT JOIN (SELECT DISTINCT gene, is_hgnc, hgnc_id, entrez_id, chrom FROM gene_detailed) g1 ON v.gene = g1.gene AND v.chrom = g1.chrom LEFT JOIN (SELECT DISTINCT gene, is_hgnc, hgnc_id, entrez_id, transcript, chrom, ensembl_gene_id FROM gene_detailed) g2 ON g1.gene = g2.gene AND (v.transcript = g2.transcript OR v.transcript=g2.ensembl_gene_id)'
                #end if

                #set $where_clause_elements = []
                #if str($query.filter).strip():
                    #silent $where_clause_elements.append(str($query.filter).strip())
                #end if

                #set $regions = $query.regions
                @PARSE_REGION_ELEMENTS@
                #if $region_elements:
                    #silent $where_clause_elements.append(" OR ".join($region_elements))
                #end if
                #if len($where_clause_elements) > 1:
                    ## SQL evaluates AND before OR, so every component has to be
                    ## parenthesized before the components are combined. Without the
                    ## parentheses a user filter joined with several OR-ed regions
                    ## would only constrain the first region.
                    #set $q = $q + " WHERE (" + ") AND (".join($where_clause_elements) + ")"
                #elif $where_clause_elements:
                    ## A single component is emitted unchanged.
                    #set $q = $q + " WHERE " + $where_clause_elements[0]
                #end if
                #if str($query.oformat.report.format) == 'maf':
                    #set $q = $q + " GROUP BY v.variant_id"
                #end if
                #if str($query.oformat.report.order_by).strip():
                    ## sort_order values carry their own leading space
                    #set $q = $q + " ORDER BY " + str($query.oformat.report.order_by).strip() + str($query.oformat.report.sort_order)
                #end if
            #else
                ## The user entered the SQL query string directly.
                #set $q = str($query.q)
            #end if

            #set $multiline_sql_expr = $q
            #set $cmdln_param = "-q"
            @MULTILN_SQL_EXPR_TO_CMDLN@

            '$infile'
            #if str($query.oformat.report.format) == 'maf':
                > temp.txt && python '$__tool_directory__/gemini_mafify.py' temp.txt '${query.oformat.report.tumor_sample_name}' '${query.oformat.report.normal_sample_name}'
            #end if
            > '$outfile'
]]>
    </command>
    <inputs>
        <expand macro="infile" />
        <conditional name="query">
            <!-- Selects between the form-driven (basic) and the raw SQL (advanced) interface. -->
            <param name="interface"
                   type="select"
                   label="Build GEMINI query using"
                   help="">
                <option value="basic">Basic variant query constructor</option>
                <option value="advanced">Advanced query constructor</option>
            </param>
            <!-- Basic interface: the query is assembled from separate filter widgets plus a
                 per-format set of report options. The command template reads header, dgidb
                 and order_by for every report type, so formats that do not offer one of
                 these options define it as a hidden parameter instead. -->
            <when value="basic">
                <expand macro="gt_filter" />
                <expand macro="sample_filter" />
                <expand macro="region_filter" />
                <expand macro="filter" argument="" />
                <section name="oformat" title="Output format options" expanded="true">
                    <conditional name="report">
                        <param name="format" type="select" label="Type of report to generate">
                            <option value="default">tabular (GEMINI default)</option>
                            <option value="with_samples">tabular with affected samples</option>
                            <option value="with_samples_flattened">tabular with affected samples flattened</option>
                            <option value="with_families">tabular with affected families</option>
                            <option value="carrier_summary">tabular with carrier summary</option>
                            <option value="vcf">VCF (simplified)</option>
                            <option value="json">JSON</option>
                            <option value="maf">MAF (cBioportal-compatible)</option>
                            <option value="tped">TPED</option>
                        </param>
                        <!-- Tabular report types -->
                        <when value="default">
                            <expand macro="add_header_column" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <expand macro="dgidb_query" />
                            <expand macro="sorting" />
                        </when>
                        <when value="with_samples">
                            <expand macro="add_header_column" />
                            <expand macro="sample_delimiter" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <expand macro="dgidb_query" />
                            <expand macro="sorting" />
                        </when>
                        <when value="with_samples_flattened">
                            <expand macro="add_header_column" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <param name="dgidb" type="hidden" value="" />
                            <expand macro="sorting" />
                        </when>
                        <when value="with_families">
                            <expand macro="add_header_column" />
                            <expand macro="sample_delimiter" applied_to="families" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <expand macro="dgidb_query" />
                            <expand macro="sorting" />
                        </when>
                        <when value="carrier_summary">
                            <expand macro="add_header_column" />
                            <expand macro="pheno_strat" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <expand macro="dgidb_query" />
                            <expand macro="sorting" />
                        </when>
                        <!-- Non-tabular report types -->
                        <when value="vcf">
                            <expand macro="add_header_column" />
                            <param name="order_by" type="hidden" value="" />
                            <param name="dgidb" type="hidden" value="" />
                        </when>
                        <when value="json">
                            <param name="header" type="hidden" value="" />
                            <expand macro="column_filter" minimalset="chrom, start, end, ref, alt, gene, impact" help="" />
                            <param name="dgidb" type="hidden" value="" />
                            <expand macro="sorting" />
                        </when>
                        <!-- MAF: the header is always requested; sample identifiers and the
                             mutation status are inserted into the generated SQL column list. -->
                        <when value="maf">
                            <param name="header" type="hidden" value="--header" />
                            <param name="dgidb" type="hidden" value="" />
                            <expand macro="maf_extra_info" />
                            <param name="tumor_sample_id" type="text" label="Tumor sample ID">
                                <validator type="expression" message="A tumor sample identifier is required">value.strip()</validator>
                            </param>
                            <param name="norm_sample_id"
                                   type="text"
                                   label="Normal sample ID (for matched tumor/normal sample pair analyses only)" />
                            <conditional name="mutation_status">
                                <param name="status_select"
                                       type="select"
                                       label="Mutation status to be recorded for variants reported with this query"
                                       help="'Somatic', 'Germline', 'LOH', 'Wildtype' and 'None' are fixed values that are treated explicitly by cBioportal. In particular, 'Somatic' and 'Germline' are supported by the cBioportal user interface in the Mutations tab, while 'LOH', 'Wildtype' and 'None' will not be loaded. Any other values will cause the variants to be loaded into cBioPortal and will be displayed as text in the Mutations tab.">
                                    <option value="expression">Calculate per variant</option>
                                    <option value="custom">Other fixed value</option>
                                    <option value="Somatic" selected="true">Somatic</option>
                                    <option value="Germline">Germline</option>
                                    <option value="LOH">LOH</option>
                                    <option value="Wildtype">Wildtype</option>
                                    <option value="None">None</option>
                                </param>
                                <!-- Fixed values need no further input. -->
                                <when value="Somatic" />
                                <when value="Germline" />
                                <when value="LOH" />
                                <when value="Wildtype" />
                                <when value="None" />
                                <when value="custom">
                                    <param name="status_custom" type="text" label="Mutation status (custom value)">
                                        <validator type="expression" message="Need a value for Mutation status">value.strip()</validator>
                                    </param>
                                </when>
                                <when value="expression">
                                    <param name="status_expression"
                                           type="text"
                                           label="SQL expression used to compute per-variant status"
                                           help="Enter a valid SQL result column expression to compute the mutation status from columns of the variants table in the GEMINI database. As one example, the expression ifnull(nullif(ifnull(nullif(ifnull(nullif(somatic_status, 3), 'LOH'), 2), 'Somatic'), 1), 'Germline') assumes that you have a column somatic_status added to the variants table of your database, and will record 'Germline', 'Somatic', or 'LOH' for variants with a value of 1, 2, or 3 in that column, respectively.">
                                        <expand macro="sanitize_query" />
                                        <validator type="expression" message="Mutation status expression cannot be empty">value.strip()</validator>
                                    </param>
                                </when>
                            </conditional>
                            <expand macro="sorting" />
                        </when>
                        <when value="tped">
                            <param name="header" type="hidden" value="" />
                            <param name="dgidb" type="hidden" value="" />
                            <expand macro="sorting" />
                        </when>
                    </conditional>
                </section>
            </when>
            <!-- Advanced interface: the user supplies the complete SQL query. Genotype and
                 sample filters are still offered as separate inputs. Column selection and
                 sorting are not offered here; the help text notes that sort order can be
                 controlled through the SQL query. The command template reads header and
                 dgidb for every report type, so formats that do not offer one of these
                 options define it as a hidden parameter. -->
            <when value="advanced">
                <param argument="-q" name="q" type="text" area="True" size="5x50"
                label="The query to be issued to the database"
                help="Formulate your query using SQL syntax.">
                    <expand macro="sanitize_query" />
                    <validator type="expression" message="Query cannot be empty">value.strip()</validator>
                </param>
                <expand macro="gt_filter" />
                <expand macro="sample_filter" />
                <section name="oformat" title="Output format options" expanded="true">
                    <conditional name="report">
                        <param name="format" type="select"
                        label="Type of report to generate">
                            <option value="default">tabular (GEMINI default)</option>
                            <option value="with_samples">tabular with affected samples</option>
                            <option value="with_samples_flattened">tabular with affected samples flattened</option>
                            <option value="with_families">tabular with affected families</option>
                            <option value="carrier_summary">tabular with carrier summary</option>
                            <option value="vcf">VCF (simplified)</option>
                            <option value="json">JSON</option>
                            <option value="maf">MAF (cBioportal-compatible)</option>
                            <option value="tped">TPED</option>
                        </param>
                        <when value="default">
                            <expand macro="add_header_column" />
                            <expand macro="dgidb_query" />
                        </when>
                        <when value="with_samples">
                            <expand macro="add_header_column" />
                            <expand macro="sample_delimiter" />
                            <expand macro="dgidb_query" />
                        </when>
                        <when value="with_samples_flattened">
                            <expand macro="add_header_column" />
                            <param name="dgidb" type="hidden" value="" />
                        </when>
                        <when value="with_families">
                            <expand macro="add_header_column" />
                            <!-- applied_to makes the label refer to families, matching the
                                 same report type in the basic interface. -->
                            <expand macro="sample_delimiter" applied_to="families" />
                            <expand macro="dgidb_query" />
                        </when>
                        <when value="carrier_summary">
                            <expand macro="pheno_strat" />
                            <expand macro="add_header_column" />
                            <expand macro="dgidb_query" />
                        </when>
                        <when value="vcf">
                            <expand macro="add_header_column" />
                            <param name="dgidb" type="hidden" value="" />
                        </when>
                        <when value="json">
                            <param name="header" type="hidden" value="" />
                            <param name="dgidb" type="hidden" value="" />
                        </when>
                        <when value="maf">
                            <param name="header" type="hidden" value="--header" />
                            <param name="dgidb" type="hidden" value="" />
                            <expand macro="maf_extra_info" />
                        </when>
                        <when value="tped">
                            <param name="header" type="hidden" value="" />
                            <param name="dgidb" type="hidden" value="" />
                        </when>
                    </conditional>
                </section>
            </when>
        </conditional>
    </inputs>
    <!-- Single output dataset: tabular by default, switched to json or vcf when the
         report format of the same name is selected. -->
    <outputs>
        <data name="outfile" format="tabular">
            <change_format>
                <when input="query.oformat.report.format" value="json" format="json" />
                <when input="query.oformat.report.format" value="vcf" format="vcf" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Advanced interface with a raw SQL query; asserts that the output contains a
             line matching the two tab-separated column names. -->
        <test>
            <param name="infile" ftype="gemini.sqlite" value="gemini_load_result1.db" />
            <conditional name="query">
                <param name="interface" value="advanced" />
                <param name="q" value="select chrom,start from variants limit 10" />
            </conditional>
            <output name="outfile">
                <assert_contents>
                    <has_line_matching expression="chrom&#009;start" />
                </assert_contents>
            </output>
        </test>
        <!-- Basic interface producing MAF output. The test parameters mirror the nesting
             of the tool inputs (conditional query, section oformat, conditional report),
             so each value resolves to exactly one input. -->
        <test>
            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
            <conditional name="query">
                <param name="interface" value="basic" />
                <section name="oformat">
                    <conditional name="report">
                        <param name="format" value="maf" />
                        <param name="tumor_sample_id" value="test" />
                    </conditional>
                </section>
            </conditional>
            <output name="outfile" file="gemini_query_as_maf_result.tabular" />
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

The real power in the GEMINI framework lies in the fact that all of your
genetic variants have been stored in a convenient database in the context of a
wealth of genome annotations that facilitate variant interpretation.
The expressive power of SQL allows one to pose intricate questions of one’s
variation data. This tool offers you a flexible, yet relatively easy way
to query your variants!

-----

*Building your variant query with the Basic variant query constructor*

This mode tries to break down the complexity of formulating GEMINI queries
into more easily digestible parts. In this mode, the tool also prevents you
from combining options that are incompatible or not meaningful.

*Genotype filters*

These are discussed `here
<https://gemini.readthedocs.io/en/latest/content/querying.html#gt-filter-filtering-on-genotypes>`__
in the GEMINI documentation.

The tool supports regular genotype filters like::

  gt.sample1 == HET and gt_depths.sample1 >= 15

, which would keep only variants for which sample 1 is a heterozygous carrier
and if the genomic position in sample1 is covered by at least 15 sequencing
reads, as well as GEMINI wildcard filters of the general form
*(COLUMN).(SAMPLE_FILTER).(RULE).(RULE_ENFORCEMENT)* like::

  (gt_types).(phenotype==2).(!=HOM_REF).(all)

, which keeps only variants for which none of the phenotypically affected
samples is homozygous for the reference allele.

*Sample filters*

Sample filters have the same format as the second component of the genotype
wildcard filters above, so::

  phenotype == 2

would filter for phenotypically affected samples. In this case, however, the
filter determines from which samples variants should be reported, i.e., here,
only variants found in phenotypically affected samples are analyzed. You can
use the ``--in`` filter to adjust the exact meaning of the sample filter.

*Region filters*

They let you restrict your analysis to parts of the genome, which can be useful
if you have prior knowledge of the approximate location of a variant of
interest.

If you specify more than one region filter, they get combined with a logical
*OR*, meaning variants and genes falling in *any* of the regions are reported.

*Additional constraints on variants*

These get translated directly into the WHERE clause of an SQL query and, thus,
have to be expressed in valid SQL syntax. As an example you could use::

  is_exonic = 1 and impact_severity != 'LOW'

to indicate that you are only interested in exonic variants that are not of
*LOW* impact severity, *i.e.*, not silent mutations.

Note that in SQL syntax tests for equality use a single ``=``, while genotype
filters (discussed above) are following Python syntax and use ``==`` for the
same purpose. Also note that non-numerical values need to be enclosed in
single-quotes, *e.g.* ``'LOW'``, but numerical values must *NOT* be.

-----

*Building your query with the Advanced query constructor*

For the sake of simplicity, the basic mode of the tool limits your queries to
the variants table of the underlying database. While this still allows many
useful queries to be formulated, it prevents you from joining information from
other tables (in particular, the gene_detailed table) or to query a different
table directly.

In advanced mode, you take responsibility for formulating the complete SQL
query in correct syntax, which allows you to do anything you could do with the
command line tool. Beyond querying other tables, this includes changing output
column names, deriving simple statistics on columns using the SQL Min, Max,
Count, Avg and Sum functions, and more.

The price you pay for this extra flexibility is that you will have to make sure
that any other tool options you set are compatible with the result of your
particular query. For example, most output formats except the tabular default
output of GEMINI are incompatible with non-standard queries. Choosing
non-compatible options can result in them getting ignored silently, but also
in tool errors, or in problems with downstream tools.

The chapter `Querying the GEMINI database
<https://gemini.readthedocs.io/en/latest/content/querying.html>`__ of the
GEMINI documentation can get you started with formulating your own queries.

Note that genotype filters and sample filters cannot be expressed as genuine
SQL queries, so even the Advanced query constructor offers them as separate
options. Region
filters and sort order of rows and columns on the other hand can be controlled
through SQL queries, like in this example::

  SELECT gene, chrom, start, end, ref, alt FROM variants WHERE chrom = 'chr1'
  AND start >= 10000000 and end <= 20000000 and is_lof = 1 ORDER BY chrom,
  start

, which would report all loss-of-function variants between 10,000,000 and
20,000,000 on chr1 and report the selected columns sorted on chromosome, then
position.

]]>
    </help>
    <expand macro="citations"/>
</tool>