Mercurial > repos > iuc > gemini_load

--- a/gemini_load.xml	Fri Dec 14 13:01:22 2018 -0500
+++ b/gemini_load.xml	Fri Jan 11 17:50:01 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1">
+<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
     <description>Loading a VCF file into GEMINI</description>
     <macros>
         <import>gemini_macros.xml</import>
@@ -11,7 +11,7 @@
 <![CDATA[
         @PROVIDE_ANNO_DATA@

-        ln -s "${ infile }" input.vcf &&
+        ln -s '$infile' input.vcf &&
         bgzip -c input.vcf > input.vcf.gz &&
         tabix -p vcf input.vcf.gz &&

@@ -19,62 +19,77 @@
             @BINARY@
             -v input.vcf.gz
             #if str( $annotation_type ) != "None":
-                -t "$annotation_type"
+                -t $annotation_type
             #end if

+            $has_genotypes
+
             #if $ped:
                 -p $ped
             #end if

-            $skip_gerp_bp
-            $skip_cadd
-            $skip_gene_tables
-            $no_load_genotypes
-            $no_genotypes
-            $passonly
-            $infostring
+            #if 'gerp_bp' not in str($opt_content):
+                --skip-gerp-bp
+            #end if
+            #if 'cadd' not in str($opt_content):
+                --skip-cadd
+            #end if
+            #if 'gene_tables' not in str($opt_content):
+                --skip-gene-tables
+            #end if
+            #if 'genotypes' not in str($opt_content):
+                --no-load-genotypes
+            #end if
+            #if 'gt_pl' not in str($opt_content):
+                --skip-pls
+            #end if
+            #if 'passonly' in str($opt_content):
+                --passonly
+            #end if
+            #if 'info_string' in str($opt_content):
+                --save-info-string
+            #end if
+
             --cores \${GALAXY_SLOTS:-4}

-            "${ outfile }"
+            '$outfile'
 ]]>
     </command>
     <inputs>
-        <param name="infile" type="data" format="vcf" label="VCF file to be loaded in the GEMINI database" help="Only build 37 (aka hg19) of the human genome is supported.">
+        <param name="infile" type="data" format="vcf"
+        label="VCF dataset to be loaded in the GEMINI database"
+        help="Only build 37 (aka hg19) of the human genome is supported.">
             <options>
                 <filter type="add_value" value="hg19" />
                 <filter type="add_value" value="Homo_sapiens_nuHg19_mtrCRS" />
                 <filter type="add_value" value="hg_g1k_v37" />
             </options>
         </param>
-
-        <param name="annotation_type" type="select" label="The annotations to be used with the input vcf" help="(-t)">
-            <option value="None">None (not recommended)</option>
-            <option value="snpEff" selected="True">snpEff annotated VCF file</option>
-            <option value="VEP">VEP annotated VCF file</option>
+        <param argument="-t" name="annotation_type" type="select"
+        label="The variants in this input are"
+        help="GEMINI can parse and use annotations generated with either snpEff (both 'EFF'- and 'ANN'-style annotations are supported) or VEP. You can also load unannotated variants, but most of GEMINI's functionality will not be available or not be very useful without annotations.">
+            <option value="snpEff" selected="True">annotated with snpEff</option>
+            <option value="VEP">annotated with VEP</option>
+            <option value="None">not annotated (not recommended)</option>
         </param>
-        <param name="ped" type="data" format="tabular" optional="True" label="Sample information file in PED+ format" help="(-p)" />
+        <param argument="--no-genotypes" name="has_genotypes" type="boolean" falsevalue="--no-genotypes" truevalue="" checked="True"
+        label="This input comes with genotype calls for its samples"
+        help="This is usually the case, but some published datasets, like some 1000G VCFs, are missing genotype information."/>
         <expand macro="annotation_dir" />
-
-        <param name="skip_gerp_bp" type="boolean" truevalue="--skip-gerp-bp" falsevalue="" checked="False"
-            label="Do not load GERP scores at base pair resolution" help="(--skip-gerp-bp)"/>
-
-        <param name="skip_cadd" type="boolean" truevalue="--skip-cadd" falsevalue="" checked="False"
-            label="Do not load CADD scores" help="(--skip-cadd)"/>
-
-        <param name="skip_gene_tables" type="boolean" truevalue="--skip-gene-tables" falsevalue="" checked="False"
-            label="Do not load gene tables" help="(--skip-gene-tables)"/>
-
-        <param name="no_load_genotypes" type="boolean" truevalue="--no-load-genotypes" falsevalue="" checked="False"
-            label="Genotypes exist in the file, but should not be stored" help="(--no-load-genotypes)"/>
-
-        <param name="no_genotypes" type="boolean" truevalue="--no-genotypes" falsevalue="" checked="False"
-            label="There are no genotypes in the file" help="e.g. some 1000G VCFs (--no-genotypes)"/>
-
-        <param name="passonly" type="boolean" truevalue="--passonly" falsevalue="" checked="False"
-            label="Keep only variants that pass all filters" help="e.g. some 1000G VCFs (--passonly)"/>
-
-        <param name="infostring" type="boolean" truevalue="--save-info-string" falsevalue="" checked="False"
-            label="Load INFO string from VCF file"  help="(--save-info-string)"/>
+        <param argument="-p" name="ped" type="data" format="tabular" optional="True"
+        label="Sample and family information in PED format"
+        help="The pedigree dataset is optional, but several GEMINI tools require the relationship between samples (i.e., the family structure) and/or the sample phenotype to be defined. The PED format is a simple tabular format (see the tool help below for details). If you choose to not provide sample information now, but later find that you need it for your analysis, you can also add it to an existing GEMINI database by using the GEMINI amend tool." />
+        <param name="opt_content" type="select" display="checkboxes" multiple="true" optional="true"
+        label="Load the following optional content into the database"
+        help="The preselected defaults should be ok for most use cases. If you are not interested in certain annotations, you can speed up database creation and decrease the resulting database size slightly by not loading them into the database. Note: GERP and CADD scores are optional parts of the annotation source and can only be loaded if available.">
+            <option value="gerp_bp" selected="true">GERP scores</option>
+            <option value="cadd" selected="true">CADD scores</option>
+            <option value="gene_tables" selected="true">Gene tables</option>
+            <option value="genotypes" selected="true">Sample genotypes</option>
+            <option value="gt_pl" selected="true">Genotype likelihoods (sample PLs)</option>
+            <option value="passonly" selected="false">only variants that passed all filters</option>
+            <option value="info_string" selected="false">variant INFO field</option>
+        </param>
     </inputs>
     <outputs>
         <data name="outfile" format="gemini.sqlite" />
@@ -83,41 +98,62 @@
         <test>
             <param name="annotation_databases" value="1999-01-01" />
             <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
-            <param name="skip_gene_tables" value="False" />
-            <param name="skip_gerp_bp" value="True" />
-            <param name="skip_cadd" value="True" />
-            <param name="no_genotypes" value="False" />
+            <param name="opt_content" value="gene_tables,genotypes,gt_pl" />
             <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
+            <assert_command>
+                <has_text text="--skip-gerp-bp" />
+                <has_text text="--skip-cadd" />
+                <not_has_text text="--skip-gene-tables" />
+                <not_has_text text="--skip-pls" />
+                <not_has_text text="--no-load-genotypes" />
+                <not_has_text text="--passonly" />
+                <not_has_text text="--save-info-string" />
+                <not_has_text text="--no-genotypes" />
+            </assert_command>
         </test>
         <test>
             <param name="annotation_databases" value="1999-01-01" />
             <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
-            <param name="skip_gene_tables" value="False" />
-            <param name="skip_gerp_bp" value="False" />
-            <param name="skip_cadd" value="False" />
-            <param name="no_genotypes" value="False" />
+            <param name="opt_content" value="gerp_bp,cadd,gene_tables,genotypes,gt_pl" />
+            <param name="has_genotypes" value="True" />
             <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
             <assert_stderr>
                 <has_text text="CADD scores are not being loaded because the annotation file could not be found." />
                 <has_text text="GERP per bp is not being loaded because the annotation file could not be found." />
             </assert_stderr>
+            <assert_command>
+                <not_has_text text="--skip-gerp-bp" />
+                <not_has_text text="--skip-cadd" />
+                <not_has_text text="--skip-gene-tables" />
+                <not_has_text text="--skip-pls" />
+                <not_has_text text="--no-load-genotypes" />
+                <not_has_text text="--passonly" />
+                <not_has_text text="--save-info-string" />
+                <not_has_text text="--no-genotypes" />
+            </assert_command>
         </test>
         <test>
             <param name="annotation_databases" value="1999-01-01" />
             <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
-            <param name="skip_gene_tables" value="True" />
-            <param name="skip_gerp_bp" value="True" />
-            <param name="skip_cadd" value="True" />
-            <param name="no_genotypes" value="True" />
+            <param name="opt_content" value="genotypes,gt_pl" />
+            <param name="has_genotypes" value="False" />
             <output name="outfile" file="gemini_load_result2.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
+            <assert_command>
+                <has_text text="--skip-gerp-bp" />
+                <has_text text="--skip-cadd" />
+                <has_text text="--skip-gene-tables" />
+                <not_has_text text="--skip-pls" />
+                <not_has_text text="--no-load-genotypes" />
+                <not_has_text text="--passonly" />
+                <not_has_text text="--save-info-string" />
+                <has_text text="--no-genotypes" />
+            </assert_command>
         </test>
         <test>
             <param name="annotation_databases" value="1999-01-01" />
             <param name="infile" dbkey="hg19" value="gemini_amend.vcf" ftype="vcf" />
-            <param name="skip_gene_tables" value="False" />
-            <param name="skip_gerp_bp" value="True" />
-            <param name="skip_cadd" value="True" />
-            <param name="no_genotypes" value="False" />
+            <param name="opt_content" value="gene_tables,genotypes,gt_pl" />
+            <param name="has_genotypes" value="True" />
             <param name="ped" value="gemini_amend.ped" ftype="tabular" />
             <output name="outfile" file="gemini_auto_rec_input.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
         </test>
@@ -125,8 +161,95 @@
     <help><![CDATA[
 **What it does**

-Before we can use GEMINI to explore genetic variation, we must first load our VCF file into the GEMINI database framework.
-We expect you to have first annotated the functional consequence of each variant in your VCF using either VEP or snpEff.
+Before we can use GEMINI to explore genetic variation, we must first load the
+variant information stored in VCF format into the GEMINI database framework.
+
+To fully leverage the power of GEMINI, you should first **annotate your VCF
+dataset** with the functional consequences of the variants using either *VEP*
+or *snpEff*.
+
+.. class:: warningmark
+
+    To avoid problems during annotation, but also during later variant queries with
+    GEMINI tools, it is good practice to preprocess your VCF dataset even before
+    annotation to split records with multiple alternate alleles, and to left-align
+    and trim indels. The authors of GEMINI recommend the tool *vt* for this purpose;
+    an equivalently good option is *bcftools norm*, and Galaxy wrappers exist for
+    both tools.
+
+In addition, you are encouraged to provide **family and sample phenotype
+information in PED format**, if you are planning to use GEMINI for any kind of
+variant identification based on inheritance patterns.
+
+A PED file is simply a tabular text file (columns can be separated by either
+spaces or TABs, but not a mixture of the two within the same file) with the
+header::
+
+  #family_id    name     paternal_id    maternal_id    sex    phenotype
+
+and optional additional columns. The actual column names in the header are not
+fixed, but there have to be at least six columns that are interpreted as
+detailed next.
+
+Subsequent lines describe one sample from the VCF input dataset each, where
+
+- *family_id* is an alphanumeric identifier of a family
+
+  If the family, to which the sample belongs, is unknown, a placeholder of
+  ``0``, ``-9`` or ``None`` can be used to indicate this fact.
+
+- *name* is the identifier of the sample described by the line
+
+- *paternal_id* is the identifier of the sample's father
+
+  If the sample's father is not available in the VCF, a placeholder of
+  ``0``, ``-9`` or ``None`` can be used to indicate this fact.
+
+- *maternal_id* is the identifier of the sample's mother
+
+  If the sample's mother is not available in the VCF, a placeholder of
+  ``0``, ``-9`` or ``None`` can be used to indicate this fact.
+
+- *sex* is a numeric code for the sample's sex
+  (1=male, 2=female, any other number=unknown sex)
+
+- *phenotype* is a numeric code for the sample's phenotypic affection status
+  (1=unaffected, 2=affected)
+
+  If the sample's phenotype is unknown, a placeholder of ``0`` or ``-9`` can be
+  used to indicate this fact.
+
+- Optional additional columns can have any column name you like, and accept any
+  per-sample value. The data from such extra columns will be added to the
+  samples table of the GEMINI database so you can use them in queries. Extra
+  columns can be used, *e.g.*, to describe additional phenotypes.
+
+- If no extra columns are present in a PED file, then the header line is
+  optional.
+
+Here are two examples of valid PED file contents::
+
+  #family_id    name    paternal_id    maternal_id    sex    phenotype    hair_color
+  1             M10475  -9             -9             1      1            brown
+  1             M10478  M10475         M10500         2      2            brown
+  1             M10500  -9             -9             2      2            black
+  1             M128215 M10475         M10500         1      1            blue
+
+This describes a family with two kids, in which mother and daughter, but not
+father and son, are phenotypically affected. The file also stores the hair color
+of all family members.
+
+::
+
+  #family_id    name         paternal_id    maternal_id    sex    phenotype
+  0             M10475       0              0              -1     1
+  0             M10478       0              0              -1     2
+  0             M10500       0              0              -1     2
+  0             M128215      0              0              -1     1
+
+This describes the same samples as above, but without recording family
+structure, sex or additional traits. Only the sample phenotypes are provided.
+In this case (no extra columns), the header line could be omitted.

     ]]></help>
     <expand macro="citations"/>
--- a/gemini_macros.xml	Fri Dec 14 13:01:22 2018 -0500
+++ b/gemini_macros.xml	Fri Jan 11 17:50:01 2019 -0500
@@ -1,15 +1,12 @@
 <macros>
     <!-- gemini version to be used -->
-    <token name="@VERSION@">0.18.1</token>
+    <token name="@VERSION@">0.20.1</token>
     <!-- minimal annotation files version required by this version of gemini -->
-    <token name="@DB_VERSION@">181</token>
+    <token name="@DB_VERSION@">200</token>

     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@VERSION@">gemini</requirement>
-            <requirement type="package" version="0.2.6">tabix</requirement>
-            <!-- for conda useage -->
-            <!--requirement type="package" version="1.3.1">htslib</requirement-->
             <yield />
         </requirements>
     </xml>
@@ -24,9 +21,17 @@
             <exit_code range=":-1" />
             <regex match="Error:" />
             <regex match="Exception:" />
+            <yield />
         </stdio>
     </xml>

+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1003153</citation>
+            <yield />
+        </citations>
+    </xml>
+
     <xml name="annotation_dir">
         <param name="annotation_databases" type="select" label="Choose a gemini annotation source">
             <options from_data_table="gemini_versioned_databases">
@@ -36,31 +41,36 @@
         </param>
     </xml>

-    <xml name="add_header_column">
-        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False"
-            label="Add a header of column names to the output" help="(--header)"/>
-    </xml>
-
-    <xml name="radius">
-        <param name="radius" type="integer" value="3" label="Set filter for Breadth-first search (BFS) in the Protein-Protein Interaction network" help="(-r)" >
-            <validator type="in_range" min="0"/>
+    <xml name="infile">
+        <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." >
+            <options options_filter_attribute="metadata.gemini_version" >
+                <filter type="add_value" value="@VERSION@" />
+            </options>
         </param>
     </xml>
-    <xml name="variant_mode">
-        <param name="variant_mode" type="boolean" truevalue="--var" falsevalue="" checked="False"
-            label="Returns variant info (e.g. impact, biotype) for interacting genes" help="(--var)"/>
+
+    <xml name="add_header_column">
+        <param argument="--header" name="header" type="boolean" truevalue="--header" falsevalue="" checked="True"
+        label="Add a header of column names to the output" />
     </xml>

-    <xml name="column_filter">
+    <xml name="column_filter" token_help="" token_minimalset="variant_id, gene">
         <conditional name="report">
-            <param name="report_selector" type="select" label="Columns to include in the report"
-                help="By default, this tool reports all columns in the variants table. One may choose to report only a subset of the columns.">
-                <option value="all" selected="True">all</option>
-                <option value="column_filter">User given columns</option>
+            <param name="report_selector" type="select"
+            label="Set of columns to include in the variant report table"
+            help="@HELP@">
+                <option value="minimal">Minimal (report only a preconfigured minimal set of columns)</option>
+                <option value="full">Full (report all columns defined in the GEMINI database variants table)</option>
+                <option value="custom">Custom (report user-specified columns)</option>
             </param>
-            <when value="all"/>
-            <when value="column_filter">
-                <param name="columns" type="select" display="checkboxes" multiple="True" label="Choose columns to include in the report" help="(--columns)">
+            <when value="full" />
+            <when value="minimal">
+                <param name="columns" type="hidden" value="@MINIMALSET@" />
+                <param name="extra_cols" type="hidden" value="" />
+            </when>
+            <when value="custom">
+                <param name="columns" type="select" display="checkboxes" multiple="true" optional="true"
+                label="Choose columns to include in the report" help="(--columns)">
                     <option value="gene">gene</option>
                     <option value="chrom">chrom</option>
                     <option value="start">start</option>
@@ -69,27 +79,23 @@
                     <option value="alt">alt</option>
                     <option value="impact">impact</option>
                     <option value="impact_severity">impact_severity</option>
-                    <option value="max_aaf_all">alternative allele frequency</option>
+                    <option value="max_aaf_all">alternative allele frequency (max_aaf_all)</option>
                 </param>
-                <param name="extra_cols" type="text" label="Additional columns." help="Separate by whitespace"/>
+                <param name="extra_cols" type="text"
+                label="Additional columns (comma-separated)"
+                help="Columns must be specified by the exact names they have in the GEMINI database, e.g., is_exonic or num_hom_alt, but, for genotype columns, GEMINI wildcard syntax is supported. The order of columns in the list is maintained in the output.">
+                    <expand macro="sanitize_query" />
+                </param>
             </when>
         </conditional>
     </xml>

-    <xml name="filter">
-        <conditional name="filter">
-            <param name="filter_selector" type="select" label="Apply additional constraints"
-                help="By default, this tool will report all variants regardless of their putative functional impact. In order to apply additional constraints on the variants returned, you can this optional filter.">
-                <option value="no">No additional constraints</option>
-                <option value="yes">Apply additional constraints</option>
-            </param>
-            <when value="no"/>
-            <when value="yes">
-                <param name="filter" type="text" label="Contraints in SQL syntax" help="Conditions applied here will become WHERE clauses in the query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'. (--filter)">
-                    <expand macro="sanitize_query" />
-                </param>
-            </when>
-        </conditional>
+    <xml name="filter" token_argument="--filter">
+        <param argument="@ARGUMENT@" name="filter" type="text"
+        label="Additional constraints expressed in SQL syntax"
+        help="Constraints defined here will become the WHERE clause of the SQL query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'.">
+            <expand macro="sanitize_query" />
+        </param>
     </xml>

     <xml name="sanitize_query">
@@ -103,10 +109,90 @@
        </sanitizer>
     </xml>

+    <xml name="lenient" token_argument="--lenient" token_truevalue="--lenient" token_help="The exact consequence of this setting depends on the type of inheritance pattern you are looking for (see the tool help below).">
+        <param argument="@ARGUMENT@" name="lenient" type="boolean" truevalue="@TRUEVALUE@" falsevalue="" checked="False"
+        label="Include hits with less convincing inheritance patterns"
+        help= "@HELP@" />
+    </xml>
+
+    <xml name="unaffected">
+        <param argument="--allow-unaffected" name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False"
+        label="Report candidates shared by unaffected samples"
+        help="Activating this option will enable the reporting of variants as candidate causative even if they are shared by unaffected samples in the family tree. The default will only report variants that are unique to affected samples."/>
+    </xml>
+
+    <xml name="min_kindreds" token_label="Minimum number of families with a candidate variant for a gene to be reported" token_help="This is the number of families required to have a variant fitting the inheritance model in the same gene in order for the gene and its variants to be reported. For example, we may only be interested in candidates where at least 4 families have a variant (with a fitting inheritance pattern) in that gene.">
+        <param argument="--min-kindreds" name="min_kindreds" type="integer" value="1" min="1"
+        label="@LABEL@"
+        help="@HELP@" />
+    </xml>
+
+    <xml name="insert_constraint" token_max_repeat="1">
+        <repeat name="constraint" title="Additional constraints on variants" default="0" max="@MAX_REPEAT@">
+            <expand macro="filter" />
+            <yield />
+        </repeat>
+    </xml>
+
+    <xml name="overwritable_where_default" token_default_where="">
+        <param name="overwrite_default_filter" type="boolean" checked="false"
+        label="Overwrite the default constraint of this tool"
+        help="By default, this tool restricts its analysis to @DEFAULT_WHERE@ and this constraint is applied on top of any constraint expressed above. With this option here selected, your custom constraint, if given, will overwrite the default instead." />
+    </xml>
+
+    <xml name="gt_filter" token_default_repeat="0" token_min_repeat="0" token_max_repeat="1">
+        <repeat name="filter_by_genotype" title="Genotype filter expression" default="@DEFAULT_REPEAT@" min="@MIN_REPEAT@" max="@MAX_REPEAT@">
+            <param argument="--gt-filter" name="gt_filter" type="text" value="" area="True" size="5x50"
+            label="Restrictions to apply to genotype values" help="">
+                <expand macro="sanitize_query" />
+                <validator type="expression" message="Genotype filter expression cannot be empty">value.strip()</validator>
+            </param>
+            <yield />
+        </repeat>
+    </xml>
+
+    <xml name="sample_filter">
+        <repeat name="filter_by_sample" title="Sample filter expression" default="0" max="1">
+            <param argument="--sample-filter" name="sample_filter" type="text" area="True" size="5x50"
+            label="SQL filter to use to filter the sample table" help="">
+                <expand macro="sanitize_query" />
+                <validator type="expression" message="Sample filter expression cannot be empty">value.strip()</validator>
+            </param>
+            <param argument="--in" name="in" type="select"
+            label="A variant must be in either all, none or any samples passing the sample-query filter"
+            help="">
+                <option value="">Return a variant if it is found in any sample passing the sample filter. (default) </option>
+                <option value="--in all">Return a variant if it is found in ALL samples passing the sample filter. (all)</option>
+                <option value="--in none">Return a variant if it is found in NO sample passing the sample filter. (none)</option>
+                <option value="--in only">Return a variant if it is found in any sample passing the sample filter, and in NO sample NOT passing it. (only)</option>
+                <option value="--in only all">Return a variant if it is found in ALL samples passing the sample filter, and in NO sample NOT passing it. (only all)</option>
+            </param>
+            <expand macro="min_kindreds"
+            label="Minimum number of families in which a variant must pass the sample filter" help=""/>
+            <param argument="--family-wise" name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False"
+            label="Apply the sample-filter on a family-wise basis" help="If a variant passes the sample filter in at least the minimum number of families specified above it is retained." />
+        </repeat>
+    </xml>
+
+    <xml name="region_filter">
+        <repeat name="regions" title="Region Filter" default="0" min="0"
+        help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
+            <param name="chrom" type="text" label="Chromosome">
+                <validator type="expression" message="A chromosome identifier is required when specifying a region filter">value.strip()</validator>
+            </param>
+            <param name="start" type="text" label="Region Start">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+            <param name="stop" type="text" label="Region End">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+        </repeat>
+    </xml>
+
     <token name="@PROVIDE_ANNO_DATA@"><![CDATA[
         mkdir gemini &&
-        ln -s "${annotation_databases.fields.path}/gemini/data" gemini/data &&
-        export GEMINI_CONFIG="${annotation_databases.fields.path}" &&
+        ln -s '${annotation_databases.fields.path}/gemini/data' gemini/data &&
+        export GEMINI_CONFIG='${annotation_databases.fields.path}' &&
     ]]></token>

     <token name="@MULTILN_SQL_EXPR_TO_CMDLN@">
@@ -119,67 +205,50 @@
         #end if
     </token>

-    <token name="@CMDLN_SQL_FILTER_FILTER_OPTION@">
-        #if str($filter.filter_selector) == 'yes' and $filter.filter:
-            --filter '${ str( $filter.filter ) }'
+    <token name="@SET_COLS@">
+        #if str($report.report_selector) == 'full':
+            #set cols = "*"
+        #else:
+            #if $report.columns and str($report.columns) != '':
+                #set $cols = str($report.columns)
+            #else
+                #set $cols = ''
+            #end if
+            #if str($report.extra_cols).strip():
+                #if $cols:
+                    #set $cols = $cols + ', ' + str($report.extra_cols)
+                #else:
+                    #set $cols = str($report.extra_cols)
+                #end if
+            #end if
+            #if not $cols:
+                #set $cols = "variant_id, gene"
+            #end if
         #end if
     </token>

     <token name="@COLUMN_SELECT@">
-        #if $report.report_selector != 'all':
-            --columns "${report.columns}
-            #if str($report.extra_cols).strip()
-                #echo ','+','.join(str($report.extra_cols).split())
-            #end if
-            "
+        @SET_COLS@
+        #if $cols != "*"
+            --columns '$cols'
         #end if
     </token>

-    <xml name="family">
-        <param name="families" type="text" value="" label="Comma seperated list of families to restrict the analysis to." help="e.g. Family1,Family3 (--families)"/>
-    </xml>
-
-    <xml name="lenient">
-        <param name="lenient" type="boolean" truevalue="--lenient" falsevalue="" checked="False" label="Loosen the restrictions on family structure"/>
-    </xml>
-
-    <xml name="unaffected">
-        <param name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False" label="Report candidates that also impact samples labeled as unaffected." help="(--allow-unaffected)"/>
-    </xml>
-
-    <xml name="min_kindreds">
-        <param name="min_kindreds" type="integer" value="1" label="The min. number of kindreds that must have a candidate variant in a gene" help="default: 1 (--min-kindreds)" />
-    </xml>
-
-    <xml name="min_sequence_depth">
-        <param name="d" type="integer" value="0" min="0" label="The minimum aligned sequence depth (genotype DP) required for each sample"
-                help="default: 0 (-d)" />
-    </xml>
-
-    <xml name="min_gq">
-        <param name="min_gq" type="integer" value="0" label="the minimum genotype quality required for each sample in a family" help="default: 0 (--min-gq)">
-            <validator type="in_range" min="0"/>
-        </param>
-    </xml>
-
-    <xml name="gt_pl_max">
-        <param name="gt_pl_max" type="integer" value="-1" min="-1" label="The maximum phred-scaled genotype likelihod (PL) allowed for each sample in a family" help="default: -1 (not set) (--gt-pl-max)" />
-    </xml>
-
-    <xml name="citations">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1003153</citation>
-            <yield />
-        </citations>
-    </xml>
-
-    <xml name="infile">
-        <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." >
-            <options options_filter_attribute="metadata.gemini_version" >
-                <filter type="add_value" value="@VERSION@" />
-            </options>
-            <validator type="expression" message="This version of Gemini will only work with Gemini files that are for version @VERSION@.">value is not None and value.metadata.gemini_version == "@VERSION@"</validator>
-        </param>
-    </xml>
-
+    <token name="@PARSE_REGION_ELEMENTS@"><![CDATA[
+        #set $region_elements = []
+        #for $r in $regions:
+            ## The actual chromosome name needs to be single-quoted
+            ## in SQL, so we need to quote the single quotes like the
+            ## sanitize_query macro would if the whole was a parameter.
+            #set $r_elements = ["chrom = '\"'\"'%s'\"'\"'" % str($r.chrom).strip()]
+            #if str($r.start).strip():
+                #silent $r_elements.append("start >= %d" % int($r.start))
+            #end if
+            #if str($r.stop).strip():
+                #silent $r_elements.append("end <= %d" % int($r.stop))
+            #end if
+            #silent $region_elements.append("(%s)" % " AND ".join($r_elements))
+        #end for
+    ]]>
+    </token>
 </macros>
--- a/repository_dependencies.xml	Fri Dec 14 13:01:22 2018 -0500
+++ b/repository_dependencies.xml	Fri Jan 11 17:50:01 2019 -0500
@@ -1,4 +1,4 @@
 <?xml version="1.0" ?>
 <repositories description="This requires the GEMINI data manager definition to install all required annotation databases.">
-    <repository changeset_revision="fe5a9a7d95b0" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/>
+    <repository changeset_revision="f57426daa04d" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/>
 </repositories>
\ No newline at end of file
Binary file test-data/gemini_amend_input.db has changed
Binary file test-data/gemini_annotate_result.db has changed
Binary file test-data/gemini_auto_dom_input.db has changed
Binary file test-data/gemini_auto_rec_input.db has changed
Binary file test-data/gemini_comphets_input.db has changed
Binary file test-data/gemini_de_novo_input.db has changed
Binary file test-data/gemini_is_somatic_result.db has changed
Binary file test-data/gemini_load_result1.db has changed
Binary file test-data/gemini_load_result2.db has changed
--- a/test-data/gemini_versioned_databases.loc	Fri Dec 14 13:01:22 2018 -0500
+++ b/test-data/gemini_versioned_databases.loc	Fri Jan 11 17:50:01 2019 -0500
@@ -1,3 +1,3 @@
 ## GEMINI versioned databases
 #DownloadDate	dbkey	DBversion	Description	Path
-1999-01-01	hg19	181	GEMINI annotations (test snapshot)	${__HERE__}/test-cache
+1999-01-01	hg19	200	GEMINI annotations (test snapshot)	${__HERE__}/test-cache
--- a/test-data/test-cache/gemini-config.yaml	Fri Dec 14 13:01:22 2018 -0500
+++ b/test-data/test-cache/gemini-config.yaml	Fri Jan 11 17:50:01 2019 -0500
@@ -2,12 +2,14 @@
 versions:
   ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz: 4
   ESP6500SI.all.snps_indels.tidy.v2.vcf.gz: 2
-  ExAC.r0.3.sites.vep.tidy.vcf.gz: 3
+  ExAC.r0.3.sites.vep.tidy.vcf.gz: 4
   GRCh37-gms-mappability.vcf.gz: 2
-  clinvar_20160203.tidy.vcf.gz: 5
+  clinvar_20170130.tidy.vcf.gz: 5
   cosmic-v68-GRCh37.tidy.vcf.gz: 3
-  dbsnp.b141.20140813.hg19.tidy.vcf.gz: 4
+  dbsnp.b147.20160601.tidy.vcf.gz: 1
   detailed_gene_table_v75: 2
   geno2mp.variants.tidy.vcf.gz: 1
+  gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz: 2
   hg19.rmsk.bed.gz: 2
   summary_gene_table_v75: 2
+  whole_genome_SNVs.tsv.compressed.gz: 2
Binary file test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz.tbi has changed