comparison gemini_query.xml @ 5:cd00221d67cb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author iuc
date Fri, 11 Jan 2019 17:47:02 -0500
parents 7ca6716748c2
children da74170c55c7
comparison
equal deleted inserted replaced
4:7ca6716748c2 5:cd00221d67cb
1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> 1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
2 <description>Querying the GEMINI database</description> 2 <description>Querying the GEMINI database</description>
3 <macros> 3 <macros>
4 <import>gemini_macros.xml</import> 4 <import>gemini_macros.xml</import>
5 <token name="@BINARY@">query</token> 5 <token name="@BINARY@">query</token>
6
7 <xml name="sorting">
8 <param name="order_by" type="text"
9 label="Sort the output by the following column(s)"
10 help="" />
11 <param name="sort_order" type="select" label="Sort order">
12 <option value=" ASC">Ascending</option>
13 <option value=" DESC">Descending</option>
14 </param>
15 </xml>
16 <xml name="pheno_strat">
17 <param name="phenotype" type="text"
18 label="Phenotype to stratify samples across"
19 help="Leave blank to stratify across the default phenotype column" />
20 </xml>
21 <xml name="sample_delimiter" token_applied_to="samples">
22 <param argument="--sample-delim" name="sample_delim" type="text" value=","
23 label="Delimiter to use in the list of affected @APPLIED_TO@"
24 help="" />
25 </xml>
26 <xml name="dgidb_query">
27 <param argument="--dgidb" name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False"
28 label="Request drug-gene interaction info from DGIdb" help="" />
29 </xml>
6 </macros> 30 </macros>
7 <expand macro="requirements" /> 31 <expand macro="requirements" />
8 <expand macro="stdio" /> 32 <expand macro="stdio" />
9 <expand macro="version_command" /> 33 <expand macro="version_command" />
10 <command> 34 <command>
11 <![CDATA[ 35 <![CDATA[
12 gemini @BINARY@ 36 gemini @BINARY@
13 37 ${query.oformat.report.header}
14 --in "${in}" 38 ${query.oformat.report.dgidb}
15 39
16 #set $multiline_sql_expr = $gt_filter 40 #for $i in $query.filter_by_genotype:
17 #set $cmdln_param = "--gt-filter" 41 #set $multiline_sql_expr = str($i.gt_filter)
18 @MULTILN_SQL_EXPR_TO_CMDLN@ 42 #set $cmdln_param = "--gt-filter"
19 43 @MULTILN_SQL_EXPR_TO_CMDLN@
20 #set $multiline_sql_expr = $sample_filter 44 #end for
21 #set $cmdln_param = "--sample-filter" 45
22 @MULTILN_SQL_EXPR_TO_CMDLN@ 46 #for $i in $query.filter_by_sample:
23 47 $i.family_wise
24 $show_samples 48 #if int($i.min_kindreds) > 0:
25 $show_families 49 --min-kindreds ${i.min_kindreds}
26 $family_wise 50 #end if
27 $header 51 ${i.in}
28 $dgidb 52 #set $multiline_sql_expr = str($i.sample_filter)
29 #if $region.strip(): 53 #set $cmdln_param = "--sample-filter"
30 --region "${region}" 54 @MULTILN_SQL_EXPR_TO_CMDLN@
55 #end for
56
57 #if str($query.oformat.report.format) == 'with_samples':
58 #set $sample_delim = str($query.oformat.report.sample_delim) or ','
59 --show-samples --sample-delim '$sample_delim'
60 #elif str($query.oformat.report.format) == 'with_samples_flattened':
61 --show-samples --format sampledetail
62 #elif str($query.oformat.report.format) == 'with_families':
63 #set $sample_delim = str($query.oformat.report.sample_delim) or ','
64 --show-families --sample-delim '$sample_delim'
65 #elif str($query.oformat.report.format) == 'carrier_summary':
66 --carrier-summary-by-phenotype
67 #if str($query.oformat.report.phenotype).strip():
68 '${query.oformat.report.phenotype}'
69 #else:
70 affected
71 #end if
72 #else:
73 --format ${query.oformat.report.format}
31 #end if 74 #end if
32 #if int($min_kindreds) > 0: 75
33 --min-kindreds $min_kindreds 76 #if str($query.interface) == 'basic':
77 ## build the SQL query string from its components
78 #if str($query.oformat.report.format) in ('vcf', 'tped'):
79 #set $cols = "*"
80 #else:
81 #set $report = $query.oformat.report.report
82 @SET_COLS@
83 #end if
84 #set $q = "SELECT %s FROM variants" % $cols
85 #set $where_clause_elements = []
86 #if str($query.filter).strip():
87 #silent $where_clause_elements.append(str($query.filter).strip())
88 #end if
89
90 #set $regions = $query.regions
91 @PARSE_REGION_ELEMENTS@
92 #if $region_elements:
93 #silent $where_clause_elements.append(" OR ".join($region_elements))
94 #end if
95 #if $where_clause_elements:
96 #set $q = $q + " WHERE " + " AND ".join($where_clause_elements)
97 #end if
98 #if str($query.oformat.report.order_by).strip():
99 #set $q = $q + " ORDER BY " + str($query.oformat.report.order_by).strip() + str($query.oformat.report.sort_order)
100 #end if
101 #else
102 ## The user entered the SQL query string directly.
103 #set $q = str($query.q)
34 #end if 104 #end if
35 ##--format FORMAT Format of output (JSON, TPED or default) # we will take default for the time being
36 ## --sample-delim STRING The delimiter to be used with the --show-samples option.
37 105
38 #set $multiline_sql_expr = $q 106 #set $multiline_sql_expr = $q
39 #set $cmdln_param = "-q" 107 #set $cmdln_param = "-q"
40 @MULTILN_SQL_EXPR_TO_CMDLN@ 108 @MULTILN_SQL_EXPR_TO_CMDLN@
41 109
42 "${ infile }" 110 '$infile'
43 > "${ outfile }" 111 > '$outfile'
44 ]]> 112 ]]>
45 </command> 113 </command>
46 <!--
47 ##TODO:
48 - -carrier-summary-by-phenotype CARRIER_SUMMARY
49 Output columns of counts of carriers and non-carriers
50 stratified by the given sample phenotype column-->
51 <inputs> 114 <inputs>
52 <expand macro="infile" /> 115 <expand macro="infile" />
53 116 <conditional name="query">
54 <param name="q" type="text" area="True" size="5x50" label="The query to be issued to the database" help="(-q)"> 117 <param name="interface" type="select"
55 <expand macro="sanitize_query" /> 118 label="Build GEMINI query using"
56 </param> 119 help="">
57 <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)"> 120 <option value="basic">Basic variant query constructor</option>
58 <expand macro="sanitize_query" /> 121 <option value="advanced">Advanced query constructor</option>
59 </param> 122 </param>
60 <param name="sample_filter" type="text" area="True" size="5x50" label="SQL filter to use to filter the sample table" help="(--sample-filter)"> 123 <when value="basic">
61 <expand macro="sanitize_query" /> 124 <expand macro="gt_filter" />
62 </param> 125 <expand macro="sample_filter" />
63 126 <expand macro="region_filter" />
64 <param name="show_samples" type="boolean" truevalue="--show-samples" falsevalue="" checked="False" 127 <expand macro="filter" argument="" />
65 label="Add a column of all sample names with a variant to each variant" help="(--show-samples)"/> 128 <section name="oformat" title="Output format options" expanded="true">
66 129 <conditional name="report">
67 <param name="show_families" type="boolean" truevalue="--show-families" falsevalue="" checked="False" 130 <param name="format" type="select"
68 label="Add a column listing all of the families with a variant to each variant" help="(--show-families)"/> 131 label="Type of report to generate">
69 132 <option value="default">tabular (GEMINI default)</option>
70 <param name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False" 133 <option value="with_samples">tabular with affected samples</option>
71 label="Perform the sample-filter on a family-wise basis" help="(--family-wise)"/> 134 <option value="with_samples_flattened">tabular with affected samples flattened</option>
72 135 <option value="with_families">tabular with affected families</option>
73 <expand macro="add_header_column" /> 136 <option value="carrier_summary">tabular with carrier summary</option>
74 <expand macro="min_kindreds" /> 137 <option value="vcf">VCF (simplified)</option>
75 138 <option value="json">JSON</option>
76 <param name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False" 139 <option value="tped">TPED</option>
77 label="Request drug-gene interaction info from DGIdb" help="(--dgidb)"/> 140 </param>
78 141 <when value="default">
79 <param name="in" type="select" label="A variant must be in either all, none or any samples passing the sample-query filter" help="(--in)"> 142 <expand macro="add_header_column" />
80 <option value="all">Return a variant if all samples matching the query have the variant. (all)</option> 143 <expand macro="column_filter"
81 <option value="none">Return a variant if the variant does not appear in any of the matching samples. (none)</option> 144 minimalset="chrom, start, end, ref, alt, gene, impact"
82 <option value="any">Return all of the variant which are in all of the matching samples and not in any of the non-matching samples. (any)</option> 145 help=""/>
83 <option value="only">Return a variant if the variant is only in the matching samples and not in any of the non-matching samples. (only)</option> 146 <expand macro="dgidb_query" />
84 </param> 147 <expand macro="sorting" />
85 148 </when>
86 <param name="region" type="text" value="" label="Restrict query to this region" help="e.g. chr1:10-20 (--region)"/> 149 <when value="with_samples">
87 150 <expand macro="add_header_column" />
88 151 <expand macro="sample_delimiter" />
152 <expand macro="column_filter"
153 minimalset="chrom, start, end, ref, alt, gene, impact"
154 help=""/>
155 <expand macro="dgidb_query" />
156 <expand macro="sorting" />
157 </when>
158 <when value="with_samples_flattened">
159 <expand macro="add_header_column" />
160 <expand macro="column_filter"
161 minimalset="chrom, start, end, ref, alt, gene, impact"
162 help=""/>
163 <param name="dgidb" type="hidden" value="" />
164 <expand macro="sorting" />
165 </when>
166 <when value="with_families">
167 <expand macro="add_header_column" />
168 <expand macro="sample_delimiter" applied_to="families"/>
169 <expand macro="column_filter"
170 minimalset="chrom, start, end, ref, alt, gene, impact"
171 help=""/>
172 <expand macro="dgidb_query" />
173 <expand macro="sorting" />
174 </when>
175 <when value="carrier_summary">
176 <expand macro="add_header_column" />
177 <expand macro="pheno_strat" />
178 <expand macro="column_filter"
179 minimalset="chrom, start, end, ref, alt, gene, impact"
180 help=""/>
181 <expand macro="dgidb_query" />
182 <expand macro="sorting" />
183 </when>
184 <when value="vcf">
185 <expand macro="add_header_column" />
186 <param name="order_by" type="hidden" value="" />
187 <param name="dgidb" type="hidden" value="" />
188 </when>
189 <when value="json">
190 <param name="header" type="hidden" value="" />
191 <expand macro="column_filter"
192 minimalset="chrom, start, end, ref, alt, gene, impact"
193 help=""/>
194 <param name="dgidb" type="hidden" value="" />
195 <expand macro="sorting" />
196 </when>
197 <when value="tped">
198 <param name="header" type="hidden" value="" />
199 <param name="dgidb" type="hidden" value="" />
200 <expand macro="sorting" />
201 </when>
202 </conditional>
203 </section>
204 </when>
205 <when value="advanced">
206 <param argument="-q" name="q" type="text" area="True" size="5x50"
207 label="The query to be issued to the database"
208 help="Formulate your query using SQL syntax.">
209 <expand macro="sanitize_query" />
210 <validator type="expression" message="Query cannot be empty">value.strip()</validator>
211 </param>
212 <expand macro="gt_filter" />
213 <expand macro="sample_filter" />
214 <section name="oformat" title="Output format options" expanded="true">
215 <conditional name="report">
216 <param name="format" type="select"
217 label="Type of report to generate">
218 <option value="default">tabular (GEMINI default)</option>
219 <option value="with_samples">tabular with affected samples</option>
220 <option value="with_samples_flattened">tabular with affected samples flattened</option>
221 <option value="with_families">tabular with affected families</option>
222 <option value="carrier_summary">tabular with carrier summary</option>
223 <option value="vcf">VCF (simplified)</option>
224 <option value="json">JSON</option>
225 <option value="tped">TPED</option>
226 </param>
227 <when value="default">
228 <expand macro="add_header_column" />
229 <expand macro="dgidb_query" />
230 </when>
231 <when value="with_samples">
232 <expand macro="add_header_column" />
233 <expand macro="sample_delimiter" />
234 <expand macro="dgidb_query" />
235 </when>
236 <when value="with_samples_flattened">
237 <expand macro="add_header_column" />
238 <param name="dgidb" type="hidden" value="" />
239 </when>
240 <when value="with_families">
241 <expand macro="add_header_column" />
242 <expand macro="sample_delimiter" />
243 <expand macro="dgidb_query" />
244 </when>
245 <when value="carrier_summary">
246 <expand macro="pheno_strat" />
247 <expand macro="add_header_column" />
248 <expand macro="dgidb_query" />
249 </when>
250 <when value="vcf">
251 <expand macro="add_header_column" />
252 <param name="dgidb" type="hidden" value="" />
253 </when>
254 <when value="json">
255 <param name="header" type="hidden" value="" />
256 <param name="dgidb" type="hidden" value="" />
257 </when>
258 <when value="tped">
259 <param name="header" type="hidden" value="" />
260 <param name="dgidb" type="hidden" value="" />
261 </when>
262 </conditional>
263 </section>
264 </when>
265 </conditional>
89 </inputs> 266 </inputs>
90 <outputs> 267 <outputs>
91 <data name="outfile" format="tabular" /> 268 <data name="outfile" format="tabular">
269 <change_format>
270 <when input="query.oformat.report.format" value="json" format="json" />
271 <when input="query.oformat.report.format" value="vcf" format="vcf" />
272 </change_format>
273 </data>
92 </outputs> 274 </outputs>
93 <tests> 275 <tests>
94 <test> 276 <test>
95 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" /> 277 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
96 <param name="q" value="select chrom,start from variants limit 10" /> 278 <conditional name="query">
97 <param name="header" value="True" /> 279 <param name="interface" value="advanced" />
280 <param name="q" value="select chrom,start from variants limit 10" />
281 </conditional>
98 <output name="outfile"> 282 <output name="outfile">
99 <assert_contents> 283 <assert_contents>
100 <has_line_matching expression="chrom&#009;start" /> 284 <has_line_matching expression="chrom&#009;start" />
101 </assert_contents> 285 </assert_contents>
102 </output> 286 </output>
104 </tests> 288 </tests>
105 <help> 289 <help>
106 <![CDATA[ 290 <![CDATA[
107 **What it does** 291 **What it does**
108 292
109 The real power in the GEMINI framework lies in the fact that all of your genetic variants have been stored in a convenient database in the context of a wealth of genome annotations that facilitate variant interpretation. 293 The real power in the GEMINI framework lies in the fact that all of your
110 The expressive power of SQL allows one to pose intricate questions of one’s variation data. This tool offers you an easy way to query your variants! 294 genetic variants have been stored in a convenient database in the context of a
111 295 wealth of genome annotations that facilitate variant interpretation.
112 http://gemini.readthedocs.org/en/latest/content/querying.html 296 The expressive power of SQL allows one to pose intricate questions of one’s
297 variation data. This tool offers you a flexible, yet relatively easy way
298 to query your variants!
299
300 -----
301
302 *Building your variant query with the Basic variant query constructor*
303
304 This mode tries to break down the complexity of formulating GEMINI queries
305 into more easily digestible parts. In this mode, the tool also prevents you
306 from combining options that are incompatible or not meaningful.
307
308 *Genotype filters*
309
310 These are discussed `here
311 <https://gemini.readthedocs.io/en/latest/content/querying.html#gt-filter-filtering-on-genotypes>`__
312 in the GEMINI documentation.
313
314 The tool supports regular genotype filters like::
315
316 gt.sample1 == HET and gt_depths.sample1 >= 15
317
318 , which would keep only variants for which sample1 is a heterozygous carrier
319 and for which the genomic position in sample1 is covered by at least 15 sequencing
320 reads, as well as GEMINI wildcard filters of the general form
321 *(COLUMN).(SAMPLE_FILTER).(RULE).(RULE_ENFORCEMENT)* like::
322
323 (gt_types).(phenotype==2).(!=HOM_REF).(all)
324
325 , which keeps only variants for which all phenotypically affected samples have a genotype other than homozygous reference.
326
327 *Sample filters*
328
329 Sample filters have the same format as the second component of the genotype
330 wildcard filters above, so::
331
332 phenotype == 2
333
334 would filter for phenotypically affected samples. In this case, however, the
335 filter determines from which samples variants should be reported, i.e., here,
336 only variants found in phenotypically affected samples are considered. You can
337 use the ``--in`` filter to adjust the exact meaning of the sample filter.
338
339 *Region filters*
340
341 They let you restrict your analysis to parts of the genome, which can be useful
342 if you have prior knowledge of the approximate location of a variant of
343 interest.
344
345 If you specify more than one region filter, they get combined with a logical
346 *OR*, meaning variants and genes falling in *any* of the regions are reported.
347
348 *Additional constraints on variants*
349
350 These get translated directly into the WHERE clause of an SQL query and, thus,
351 have to be expressed in valid SQL syntax. As an example you could use::
352
353 is_exonic = 1 and impact_severity != 'LOW'
354
355 to indicate that you are only interested in exonic variants that are not of
356 *LOW* impact severity, *i.e.*, not silent mutations.
357
358 Note that in SQL syntax tests for equality use a single ``=``, while genotype
359 filters (discussed above) follow Python syntax and use ``==`` for the
360 same purpose. Also note that non-numerical values need to be enclosed in
361 single-quotes, *e.g.* ``'LOW'``, but numerical values must *NOT* be.
362
363 -----
364
365 *Building your query with the Advanced query constructor*
366
367 For the sake of simplicity, the basic mode of the tool limits your queries to
368 the variants table of the underlying database. While this still allows many
369 useful queries to be formulated, it prevents you from joining information from
370 other tables (in particular, the gene_detailed table) or from querying a different
371 table directly.
372
373 In advanced mode, you take responsibility for formulating the complete SQL
374 query in correct syntax, which allows you to do anything you could do with the
375 command line tool. Beyond querying other tables, this includes changing output
376 column names, deriving simple statistics on columns using the SQL Min, Max,
377 Count, Avg and Sum functions, and more.
378
379 The price you pay for this extra flexibility is that you will have to make sure
380 that any other tool options you set are compatible with the result of your
381 particular query. For example, most output formats except the tabular default
382 output of GEMINI are incompatible with non-standard queries. Choosing
383 non-compatible options can result in them getting ignored silently, but also
384 in tool errors, or in problems with downstream tools.
385
386 The chapter `Querying the GEMINI database
387 <http://gemini.readthedocs.org/en/latest/content/querying.html>`__ of the
388 GEMINI documentation can get you started with formulating your own queries.
389
390 Note that genotype filters and sample filters cannot be expressed as genuine
391 SQL queries, so even the Advanced query constructor offers them as separate options. Region
392 filters and sort order of rows and columns on the other hand can be controlled
393 through SQL queries, like in this example::
394
395 SELECT gene, chrom, start, end, ref, alt FROM variants WHERE chrom = 'chr1'
396 AND start >= 10000000 AND end <= 20000000 AND is_lof = 1 ORDER BY chrom,
397 start
398
399 , which would report all loss-of-function variants between 10,000,000 and
400 20,000,000 on chr1 and report the selected columns sorted on chromosome, then
401 position.
402
113 ]]> 403 ]]>
114 </help> 404 </help>
115 <expand macro="citations"/> 405 <expand macro="citations"/>
116 </tool> 406 </tool>