Mercurial > repos > iuc > gemini
changeset 0:720cbfb4190d draft
Imported from capsule None
| author | iuc | 
|---|---|
| date | Mon, 25 Aug 2014 17:15:54 -0400 | 
| parents | |
| children | d3c4d0208bb2 | 
| files | gemini_annotate.xml gemini_autosomal_recessive.xml gemini_burden.xml gemini_comp_hets.xml gemini_db_info.xml gemini_de_novo.xml gemini_interactions.xml gemini_load.xml gemini_lof_sieve.xml gemini_macros.xml gemini_pathways.xml gemini_query.xml gemini_region.xml gemini_roh.xml gemini_stats.xml gemini_windower.xml readme.rst repository_dependencies.xml tool-data/gemini_databases.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml | 
| diffstat | 21 files changed, 1260 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
<!--
  gemini_annotate.xml
  Galaxy wrapper for the "gemini annotate" sub-command.
  The selected BED dataset is compressed with bgzip and indexed with tabix,
  then handed to GEMINI together with the name of the column to be added to
  the variant table.
-->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>adding your own custom annotations</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">annotate</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[

## GEMINI expects a bgzip-compressed, tabix-indexed annotation file
bgzip -c "${annotate_source}" > tabixed.gz;
tabix -p bed tabixed.gz;

    gemini @BINARY@
        -f tabixed.gz
        -c $column_name
        -a $a.a_selector
        ## the following options are only offered for the "extract" mode
        #if $a.a_selector == 'extract':
            -t $a.column_type
            -e $a.column_extracts
            -o $a.operation
        #end if

        "${ infile }"
        > "${ outfile }"
]]>

    </command>
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />
        <param name="annotate_source" type="data" format="bed" label="File containing the annotations in BED format" help="(-f)"/>

        <!-- Only letters, digits and "_" are kept; every other character is replaced by invalid_char. -->
        <param name="column_name" size="20" type="text" value=""
            label="The name of the column to be added to the variant table" help="(-c)">
            <sanitizer invalid_char=" ">
                <valid initial="string.letters,string.digits">
                    <add value="_" />
                </valid>
            </sanitizer>
        </param>
        <conditional name="a">
            <param name="a_selector" type="select" label="How should the annotation file be used?" help="(-a)">
                <option value="boolean">Did a variant overlap a region or not? (boolean)</option>
                <option value="count">How many regions did a variant overlap? (count)</option>
                <option value="extract" selected="True">Extract specific values from a BED file. (extract)</option>
            </param>
            <when value="extract">

                <param name="column_extracts" label="Column to extract information from for list annotations"
                    type="data_column" data_ref="annotate_source" force_select="true" help="(-e)"/>


                <param name="column_type" type="select" label="What data type(s) should be used to represent the new values in the database?"
                    help="(-t)">
                    <option value="float">Decimal precision number (float)</option>
                    <option value="integer">Integer number (integer)</option>
                    <option value="text">Text columns such as “valid”, “yes” (text)</option>
                </param>

                <!-- The option values are passed verbatim to "-o", so they must be valid operation names. -->
                <param name="operation" type="select" label="Operation to apply to the extract column values ..."
                    help="in the event that a variant overlaps multiple annotations in your annotation file. (-o)">
                    <option value="mean">Compute the average of the (numeric) values</option>
                    <option value="median">Compute the median of the (numeric) values</option>
                    <option value="min">Compute the minimum of the (numeric) values</option>
                    <option value="max">Compute the maximum of the (numeric) values</option>
                    <option value="mode">Compute the mode (most frequent) of the (numeric) values</option>
                    <option value="first">Use the value from the first record in the annotation file</option>
                    <option value="last">Use the value from the last record in the annotation file</option>
                    <option value="list">Create a comma-separated list of the observed (text) values</option>
                    <option value="uniq_list">Create a comma-separated list of non-redundant observed (text) values</option>
                </param>

            </when>
            <when value="boolean"/>
            <when value="count"/>
        </conditional>

    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

It is inevitable that researchers will want to enhance the gemini framework with their own, custom annotations.
gemini provides a sub-command called annotate for exactly this purpose.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_autosomal_recessive.xml
  Galaxy wrapper that runs either "gemini autosomal_recessive" or
  "gemini autosomal_dominant", depending on the rec_or_dom selection.
-->
<tool id="gemini_recessive_and_dominant" name="GEMINI autosomal recessive/dominant" version="@VERSION@.0">
    <description>Find variants meeting an autosomal recessive/dominant model</description>
    <macros>
        <import>gemini_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini

        #if $rec_or_dom == 'recessive':
            ## start autosomal_recessive
            autosomal_recessive
        #else:
            ## start autosomal_dominant
            autosomal_dominant
        #end if

        #if $report.report_selector != 'all':
            --columns "${report.columns}"
        #end if

        #if $filter.filter_selector == 'yes':
            --filter "${filter.filter}"
        #end if

        -d $d
        ## values of 0 or below (default -1) mean: do not pass the option at all
        #if int($min_kindreds) > 0:
            --min-kindreds $min_kindreds
        #end if

        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>

        <param name="rec_or_dom" type="select" label="Autosomal ..." help="">
            <option value="recessive">recessive</option>
            <option value="dominant">dominant</option>
        </param>

        <param name="infile" type="data" format="sqlite" label="GEMINI database" />
        <expand macro="column_filter" />
        <expand macro="filter" />
        <expand macro="min_sequence_depth" />
        <!-- The attribute was misspelled "lebel", which left this parameter without a label. -->
        <param name="min_kindreds" size="4" type="integer" value="-1" label="The min. number of kindreds that must have a candidate variant in a gene"
            help="-1 means default values (--min-kindreds)" />

    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Assuming you have defined the familial relationships between samples when loading your VCF into GEMINI, one can leverage a
built-in tool for identifying variants that meet an autosomal recessive or dominant inheritance pattern.
The reported variants will be restricted to those variants having the potential to impact the function of affecting protein coding transcripts.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_burden.xml
  Galaxy wrapper for the "gemini burden" sub-command (sample-wise, gene-level
  burden calculations, optionally with the C-alpha association test).
-->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>perform sample-wise gene-level burden calculations</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">burden</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini @BINARY@
        --cases $cases
        --controls $controls
        $save_tscores
        $nonsynonymous
        $calpha
        --permutations $permutations
        ## negative allele frequencies (default -1) mean: do not pass the option
        #if float( str($min_aaf) ) >= 0.0:
            --min-aaf $min_aaf
        #end if
        #if float( str($max_aaf) ) >= 0.0:
            --max-aaf $max_aaf
        #end if
        "${ infile }"
        > "${ outfile }"
]]>

    </command>
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <param name="cases" size="30" type="text" value="" label="Space separated list of cases for association testing" help="(--cases)"/>
        <param name="controls" size="30" type="text" value="" label="Space separated list of controls for association testing" help="(--controls)"/>

        <param name="save_tscores" type="boolean" truevalue="--save_tscores" falsevalue="" checked="False"
            label="Save the permuted T-scores in the output file" help="(--save_tscores)"/>

        <param name="nonsynonymous" type="boolean" truevalue="--nonsynonymous" falsevalue="" checked="False"
            label="Count all nonsynonymous variants as contributing burden" help="(--nonsynonymous)"/>
        <param name="calpha" type="boolean" truevalue="--calpha" falsevalue="" checked="False"
            label="Run the C-alpha association test" help="(--calpha)"/>
        <!-- The range validators below stay disabled: the default of -1 is the "not set" marker checked in the command. -->
        <param name="min_aaf" type="float" value="-1" size="5" label="The min. alt. allele frequency for a variant to be included"
            help="(--min-aaf)">
            <!--validator type="in_range" min="0.0"/-->
        </param>
        <param name="max_aaf" type="float" value="-1" size="5" label="The max. alt. allele frequency for a variant to be included"
            help="(--max-aaf)">
            <!--validator type="in_range" min="0.0"/-->
        </param>

        <param name="permutations" type="integer" value="1000" size="10" label="Number of permutations to run for the C-alpha test"
            help="(--permutations)">
            <validator type="in_range" min="0"/>
        </param>

    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

The burden tool provides a set of utilities to perform burden summaries on a per-gene, per sample basis.
By default, it outputs a table of gene-wise counts of all high impact variants in coding regions for each sample.

::

    $ gemini burden test.burden.db
    gene    M10475  M10478  M10500  M128215
    WDR37   2       2       2       2
    CTBP2   0       0       0       1
    DHODH   1       0       0       0

@CITATION@
    </help>
    <expand macro="citations">
        <citation type="doi">10.1371/journal.pgen.1001322</citation><!-- c-alpha citation -->
    </expand>
</tool>
<!--
  gemini_comp_hets.xml
  Galaxy wrapper for the "gemini comp_hets" sub-command, which reports
  candidate compound heterozygotes.
-->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Identifying potential compound heterozygotes</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">comp_hets</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini @BINARY@

        #if $report.report_selector != 'all':
            --columns "${report.columns}"
        #end if

        #if $filter.filter_selector == 'yes':
            --filter "${filter.filter}"
        #end if
        $only_affected
        $ignore_phasing

        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />
        <!-- NOTE(review): the "header" parameter created by this macro is not referenced in the command above; confirm whether comp_hets accepts a header option before wiring it in. -->
        <expand macro="add_header_column" />
        <expand macro="column_filter" />
        <param name="only_affected" type="boolean" truevalue="--only-affected" falsevalue="" checked="False"
            label="Report solely those compound heterozygotes impacting a sample labeled as affected" help="(--only-affected)"/>
        <param name="ignore_phasing" type="boolean" truevalue="--ignore-phasing" falsevalue="" checked="False"
            label="Ignore phasing when screening for compound hets" help="Candidates are inherently putative. (--ignore-phasing)"/>
        <expand macro="filter" />
    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Many recessive disorders are caused by compound heterozygotes. Unlike canonical recessive sites where the same recessive allele is
inherited from both parents at the *same* site in the gene, compound heterozygotes occur when the individual’s phenotype is caused
by two heterozygous recessive alleles at *different* sites in a particular gene.

So basically, we are looking for two (typically loss-of-function (LoF)) heterozygous variants impacting the same gene at different loci.
The complicating factor is that this is *recessive* and as such, we must also require that the consequential alleles at each heterozygous
site were inherited on different chromosomes (one from each parent). As such, in order to use this tool, we require that all variants are phased.
Once this has been done, the comp_hets tool will provide a report of candidate compound heterozygotes for each sample/gene.


@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_db_info.xml
  Galaxy wrapper for the "gemini db_info" sub-command. The listing printed by
  GEMINI is piped through tr, which squeezes runs of spaces into single tabs
  before the result is written to the tabular output.
-->
<tool version="@VERSION@.0" name="GEMINI @BINARY@" id="gemini_@BINARY@">
    <description>List the gemini database tables and columns</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">db_info</token>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command>
<![CDATA[
    gemini @BINARY@ "${ infile }"
        | tr -s ' ' '\t'
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio"/>
    <inputs>
        <param name="infile" label="GEMINI database" type="data" format="sqlite"/>
    </inputs>
    <outputs>
        <data name="outfile" label="${tool.name} on ${on_string}" format="tabular"/>
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Because of the sheer number of annotations that are stored in gemini, there are admittedly too many columns to remember by rote.
If you can’t recall the name of particular column, just use the db_info tool. It will report all of the tables and all of the columns / types in each table.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_de_novo.xml
  Galaxy wrapper for the "gemini de_novo" sub-command. Column selection, the
  SQL filter and the minimum sequence depth all come from shared macros in
  gemini_macros.xml.
-->
<tool version="@VERSION@.0" name="GEMINI @BINARY@" id="gemini_@BINARY@">
    <description>Identifying potential de novo mutations</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">de_novo</token>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command>
<![CDATA[
    gemini @BINARY@

        ## restrict the report to the chosen columns, if any were chosen
        #if $report.report_selector != 'all':
            --columns "${report.columns}"
        #end if

        ## optional WHERE-clause style constraint
        #if $filter.filter_selector == 'yes':
            --filter "${filter.filter}"
        #end if
        -d $d
        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio"/>
    <inputs>
        <param name="infile" label="GEMINI database" type="data" format="sqlite"/>

        <expand macro="column_filter"/>
        <expand macro="filter"/>
        <expand macro="min_sequence_depth"/>
    </inputs>
    <outputs>
        <data name="outfile" label="${tool.name} on ${on_string}" format="tabular"/>
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Assuming you have defined the familial relationships between samples when loading your VCF into GEMINI,
you can use this tool for identifying de novo (a.k.a spontaneous) mutations that arise in offspring.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_interactions.xml
  Galaxy wrapper that runs "gemini interactions" for a single gene of
  interest, or "gemini lof_interactions" when all loss-of-function variants
  are selected. Both variants receive the same radius and variant-mode
  options.
-->
<tool version="@VERSION@.0" name="GEMINI @BINARY@" id="gemini_@BINARY@">
    <description>Find genes among variants that are interacting partners</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">interactions</token>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command>
<![CDATA[
    gemini
        --annotation-dir ${annotation_databases.fields.path}
        #if $gene.gene_selector == 'lof':
            ## lof interactions is a separate program
            lof_interactions
        #else:
            ## use normal gemini interactions program
            @BINARY@
            -g "${gene.gene}"
        #end if

        -r "${radius}"
        $variant_mode
        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio"/>
    <inputs>
        <param name="infile" label="GEMINI database" type="data" format="sqlite"/>

        <!-- A gene name is only requested when a single gene is studied. -->
        <conditional name="gene">
            <param name="gene_selector" label="Studying" type="select" help="">
                <option value="gene">Interesting gene</option>
                <option value="lof">All loss-of-function variants</option>
            </param>
            <when value="gene">
                <param name="gene" label="Specify gene name" type="text" help="e.g. PTPN22 (-g)"/>
            </when>
            <when value="lof"/>
        </conditional>
        <expand macro="annotation_dir"/>
        <expand macro="radius"/>
        <expand macro="variant_mode"/>
    </inputs>
    <outputs>
        <data name="outfile" label="${tool.name} on ${on_string}" format="tabular"/>
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Integrating the knowledge of the known protein-protein interactions would be useful in explaining variation data.
Meaning to say that a damaging variant in an interacting partner of a potential protein may be equally interesting as the
protein itself. We have used the HPRD_ binary interaction data to build a p-p network graph which can be explored by GEMINI.

.. _HPRD: http://www.ncbi.nlm.nih.gov/pubmed/18988627


@CITATION@
    </help>
    <expand macro="citations">
        <citation type="doi">10.1093/nar/gkn892</citation><!-- HPRD citation -->
    </expand>
</tool>
<!--
  gemini_load.xml
  Galaxy wrapper for the "gemini load" sub-command: loads an annotated VCF
  (plus an optional PED file) into a new GEMINI sqlite database, which is the
  tool output.
-->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Loading a VCF file into GEMINI</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">load</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini
        --annotation-dir ${annotation_databases.fields.path}
        @BINARY@
        -v "${ infile }"
        -t $annotation_type

        ## the PED file is optional
        #if $ped:
            -p "${ped}"
        #end if

        $skip_gerp_bp
        $skip_cadd
        $skip_gene_tables
        $no_load_genotypes
        $no_genotypes
        $passonly
        ## use the number of slots granted by Galaxy, falling back to 4
        --cores \${GALAXY_SLOTS:-4}

        "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="vcf" label="VCF file to be loaded in the GEMINI database" />

        <param name="annotation_type" type="select" label="The annotations to be used with the input vcf" help="(-t)">
            <option value="snpEff">snpEff annotated VCF file</option>
            <option value="VEP">VEP annotated VCF file</option>
        </param>
        <!-- The format was misspelled "tablar", which is not the "tabular" datatype used elsewhere in these wrappers. -->
        <param name="ped" type="data" format="tabular" optional="True" label="Sample information file in PED+ format" help="(-p)" />
        <expand macro="annotation_dir" />

        <param name="skip_gerp_bp" type="boolean" truevalue="--skip-gerp-bp" falsevalue="" checked="False"
            label="Do not load GERP scores at base pair resolution" help="(--skip-gerp-bp)"/>

        <param name="skip_cadd" type="boolean" truevalue="--skip-cadd" falsevalue="" checked="False"
            label="Do not load CADD scores" help="(--skip-cadd)"/>

        <param name="skip_gene_tables" type="boolean" truevalue="--skip-gene-tables" falsevalue="" checked="False"
            label="Do not load gene tables" help="(--skip-gene-tables)"/>

        <param name="no_load_genotypes" type="boolean" truevalue="--no-load-genotypes" falsevalue="" checked="False"
            label="Genotypes exist in the file, but should not be stored" help="(--no-load-genotypes)"/>

        <!-- This parameter previously emitted the same flag as no_load_genotypes; it now emits the flag named in its own help text. -->
        <param name="no_genotypes" type="boolean" truevalue="--no-genotypes" falsevalue="" checked="False"
            label="There are no genotypes in the file" help="e.g. some 1000G VCFs (--no-genotypes)"/>

        <param name="passonly" type="boolean" truevalue="--passonly" falsevalue="" checked="False"
            label="Keep only variants that pass all filters" help="(--passonly)"/>

    </inputs>
    <outputs>
        <data name="outfile" format="sqlite" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Before we can use GEMINI to explore genetic variation, we must first load our VCF file into the GEMINI database framework.
We expect you to have first annotated the functional consequence of each variant in your VCF using either VEP or snpEff.

http://gemini.readthedocs.org/en/latest/content/loading.html

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_lof_sieve.xml
  Galaxy wrapper for the "gemini lof_sieve" sub-command. It takes only the
  GEMINI database as input and writes the report to a tabular dataset.
-->
<!-- The version token must be upper case: gemini_macros.xml defines @VERSION@, and the lower-case spelling was left unexpanded. -->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Filter LoF variants by transcript position and type</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">lof_sieve</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini @BINARY@
        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />
    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Not all candidate LoF variants are created equal. For e.g, a nonsense (stop gain) variant impacting the first 5% of a polypeptide is far
more likely to be deleterious than one affecting the last 5%. Assuming you’ve annotated your VCF with snpEff v3.0+, the lof_sieve tool
reports the fractional position (e.g. 0.05 for the first 5%) of the mutation in the amino acid sequence.
In addition, it also reports the predicted function of the transcript so that one can segregate candidate
LoF variants that affect protein_coding transcripts from processed RNA, etc.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_macros.xml
  Macros and tokens shared by all GEMINI tool wrappers: package requirements,
  version command, error detection, reusable input parameters, the query
  sanitizer, and the citation text.
-->
<macros>
    <!-- Packages needed by every wrapper; a tool may add more through the yield. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="0.10.0">gemini</requirement>
            <requirement type="package" version="0.1.3">grabix</requirement>
            <requirement type="package" version="0.2.6">tabix</requirement>
            <requirement type="package" version="0.1.19">samtools</requirement>
            <requirement type="package" version="2.19.1">bedtools</requirement>
            <yield />
        </requirements>
    </xml>

    <xml name="version_command">
        <version_command>gemini --version</version_command>
    </xml>

    <!-- Non-zero exit codes and output matching "Error:" or "Exception:" are registered as error conditions. -->
    <xml name="stdio">
        <stdio>
            <exit_code range="1:" />
            <exit_code range=":-1" />
            <regex match="Error:" />
            <regex match="Exception:" />
        </stdio>
    </xml>

    <!-- Select list filled from the gemini_databases data table; the wrappers read its "path" field. -->
    <xml name="annotation_dir">
        <param name="annotation_databases" type="select" label="Choose a gemini annotation database">
            <options from_data_table="gemini_databases">
                <filter type="sort_by" column="0" />
                <validator type="no_options" message="No annotation database is available" />
            </options>
        </param>
    </xml>

    <xml name="add_header_column">
        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False"
            label="Add a header of column names to the output" help="(--header)"/>
    </xml>

    <xml name="radius">
        <param name="radius" type="integer" value="3" size="5" label="Set filter for Breadth-first search (BFS) in the Protein-Protein Interaction network" help="(-r)" >
            <validator type="in_range" min="0"/>
        </param>
    </xml>
    <xml name="variant_mode">
        <param name="variant_mode" type="boolean" truevalue="--var" falsevalue="" checked="False"
            label="Returns variant info (e.g. impact, biotype) for interacting genes" help="(--var)"/>
    </xml>

    <!-- Conditional "report": either all columns, or a user-chosen subset. -->
    <xml name="column_filter">
        <conditional name="report">
            <param name="report_selector" type="select" label="Columns to include in the report"
                help="By default, this tool reports all columns in the variants table. One may choose to report only a subset of the columns.">
                <option value="all" selected="True">all</option>
                <option value="column_filter">User given columns</option>
            </param>
            <when value="all"/>
            <when value="column_filter">
                <param name="columns" type="select" display="checkboxes" multiple="True" label="Choose columns to include in the report" help="(--columns)">
                    <option value="gene">gene</option>
                    <option value="chrom">chrom</option>
                    <option value="start">start</option>
                    <option value="end">end</option>
                    <option value="ref">ref</option>
                    <option value="alt">alt</option>
                    <option value="impact">impact</option>
                    <option value="impact_severity">impact_severity</option>
                </param>
            </when>
        </conditional>
    </xml>

    <!-- Conditional "filter": optional free-text SQL constraint, cleaned by sanitize_query. -->
    <xml name="filter">
        <conditional name="filter">
            <param name="filter_selector" type="select" label="Apply additional constraints"
                help="By default, this tool will report all variants regardless of their putative functional impact. In order to apply additional constraints on the variants returned, you can use this optional filter.">
                <option value="no">No additional constraints</option>
                <option value="yes">Apply additional constraints</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="filter" type="text" size="20" label="Constraints in SQL syntax" help="Conditions applied here will become WHERE clauses in the query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'. (--filter)">
                    <expand macro="sanitize_query" />
                </param>
            </when>
        </conditional>
    </xml>

    <!--
      Sanitizer for free-text SQL parameters. The wrappers place these values
      inside double-quoted shell arguments, so the four characters that stay
      special inside double quotes (double quote, dollar sign, backtick and
      backslash) are taken out of the valid set and mapped to their
      backslash-escaped form. Everything else printable passes unchanged, so
      single-quoted SQL literals keep working.
    -->
    <xml name="sanitize_query">
        <sanitizer invalid_char="">
            <valid initial="string.printable">
                <remove value="&quot;" />
                <remove value="$" />
                <remove value="`" />
                <remove value="\" />
            </valid>
            <mapping initial="none">
                <add source="&quot;" target="\&quot;" />
                <add source="$" target="\$" />
                <add source="`" target="\`" />
                <add source="\" target="\\" />
            </mapping>
        </sanitizer>
    </xml>

    <xml name="min_sequence_depth">
        <param name="d" type="integer" value="0" size="5" label="The minimum aligned sequence depth (genotype DP) required for each sample"
            help="default: 0 (-d)">
            <validator type="in_range" min="0"/>
        </param>
    </xml>

    <!-- Wrapper versions are built as this token followed by ".0". -->
    <token name="@VERSION@">0.10.0</token>

    <token name="@CITATION@">------

**Citation**

If you use GEMINI in your research, please cite the following manuscript:

    </token>
    <!-- GEMINI citation; a tool may add more citations through the yield. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1003153</citation>
            <yield />
        </citations>
    </xml>
</macros>
<!--
  gemini_pathways.xml
  Galaxy wrapper for the "gemini pathways" sub-command, which maps genes and
  variants to KEGG pathways for a chosen Ensembl gene version.
-->
<tool version="@VERSION@.0" name="GEMINI @BINARY@" id="gemini_@BINARY@">
    <description>Map genes and variants to KEGG pathways</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">pathways</token>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command>
<![CDATA[
    gemini
        --annotation-dir ${annotation_databases.fields.path}
        @BINARY@
        -v $ensembl
        $lof
        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <expand macro="stdio"/>
    <inputs>
        <param name="infile" label="GEMINI database" type="data" format="sqlite"/>

        <!-- The validator restricts the value to the range given in the help text. -->
        <param name="ensembl" label="Version of ensembl genes to use" type="integer" value="68" size="5"
            help="Supported versions: 66 to 71. use versions that match the VEP/snpEff versions of the annotated vcf for correctness. For e.g VEP v2.6 and snpEff v3.1 use Ensembl 68 version of the genomes. (-v)">
            <validator type="in_range" min="66" max="71"/>
        </param>

        <param name="lof" label="Report only pathways with loss-of-function variants" type="boolean"
            truevalue="--lof" falsevalue="" checked="False" help="(--lof)"/>
        <expand macro="annotation_dir"/>
    </inputs>
    <outputs>
        <data name="outfile" label="${tool.name} on ${on_string}" format="tabular"/>
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

Mapping genes to biological pathways is useful in understanding the function/role played by a gene.
Likewise, genes involved in common pathways is helpful in understanding heterogeneous diseases.
We have integrated the KEGG pathway mapping for gene variants, to explain/annotate variation.

This requires your VCF be annotated with either snpEff/VEP.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!--
  gemini_query.xml
  Galaxy wrapper for the "gemini query" sub-command. The SQL query, genotype
  filter and sample filter are free-text parameters cleaned by the shared
  sanitize_query macro; empty text fields are not passed to GEMINI.
-->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Querying the GEMINI database</description>
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">query</token>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command>
<![CDATA[
    gemini @BINARY@

        --in "${in}"

        #if $gt_filter.strip():
            --gt-filter "${gt_filter}"
        #end if

        #if $sample_filter.strip():
            --sample-filter "${sample_filter}"
        #end if

        $show_samples
        $show_families
        $family_wise
        $header
        $dgidb
        #if $region.strip():
            --region "${region}"
        #end if
        ## values of 0 or below (default -1) mean: do not pass the option at all
        #if int($min_kindreds) > 0:
            --min-kindreds $min_kindreds
        #end if
        ##--format FORMAT Format of output (JSON, TPED or default) # we will take default for the time being
        ## --sample-delim STRING The delimiter to be used with the --show-samples option.

        #if $q.strip():
            -q "${q}"
        #end if

        "${ infile }"
        > "${ outfile }"
]]>
    </command>
    <!--
    ##TODO:
      - -carrier-summary-by-phenotype CARRIER_SUMMARY
                        Output columns of counts of carriers and non-carriers
                        stratified by the given sample phenotype column-->
    <expand macro="stdio" />
    <inputs>
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <param name="q" type="text" area="True" size="5x50" label="The query to be issued to the database" help="(-q)">
            <expand macro="sanitize_query" />
        </param>
        <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filter)">
            <expand macro="sanitize_query" />
        </param>
        <param name="sample_filter" type="text" area="True" size="5x50" label="SQL filter to use to filter the sample table" help="(--sample-filter)">
            <expand macro="sanitize_query" />
        </param>

        <param name="show_samples" type="boolean" truevalue="--show-samples" falsevalue="" checked="False"
            label="Add a column of all sample names with a variant to each variant" help="(--show-samples)"/>

        <param name="show_families" type="boolean" truevalue="--show-families" falsevalue="" checked="False"
            label="Add a column listing all of the families with a variant to each variant" help="(--show-families)"/>

        <param name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False"
            label="Perform the sample-filter on a family-wise basis" help="(--family-wise)"/>

        <expand macro="add_header_column" />

        <!-- TODO: is there any default values set? -->
        <!-- The attribute was misspelled "lebel", which left this parameter without a label. -->
        <param name="min_kindreds" size="4" type="integer" value="-1" label="Minimum number of families for a variant passing a family-wise filter to be in" help="-1 means default values (--min-kindreds)" />

        <param name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False"
            label="Request drug-gene interaction info from DGIdb" help="(--dgidb)"/>

        <!-- NOTE(review): the "any" text was a copy of a stricter mode; the wording below follows the option name and should be confirmed against the GEMINI documentation. -->
        <param name="in" type="select" label="A variant must be in either all, none or any samples passing the sample-query filter" help="(--in)">
            <option value="all">Return a variant if all samples matching the query have the variant. (all)</option>
            <option value="none">Return a variant if the variant does not appear in any of the matching samples. (none)</option>
            <option value="any">Return a variant if at least one of the matching samples has the variant. (any)</option>
            <option value="only">Return a variant if the variant is only in the matching samples and not in any of the non-matching samples. (only)</option>
        </param>

        <param name="region" size="30" type="text" value="" label="Restrict query to this region" help="e.g. chr1:10-20 (--region)"/>


    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
**What it does**

The real power in the GEMINI framework lies in the fact that all of your genetic variants have been stored in a convenient database in the context of a wealth of genome annotations that facilitate variant interpretation.
The expressive power of SQL allows one to pose intricate questions of one’s variation data. This tool offers you an easy way to query your variants!

http://gemini.readthedocs.org/en/latest/content/querying.html

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!-- File: gemini_region.xml
     Galaxy wrapper for "gemini region": reports the variants that fall inside a
     genomic interval or inside a named gene.
     The macros expanded below (requirements, version_command, stdio,
     column_filter, filter, add_header_column, citations) and the @VERSION@ and
     @CITATION@ tokens are not defined in this file; they are expected to come
     from the imported gemini_macros.xml. -->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Extracting variants from specific regions or genes</description>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">region</token>
    </macros>
    <!-- Cheetah template of the command line. Exactly one of the two location
         flags is emitted, and only when the matching text field is non-empty.
         $report, $filter and $header are not declared in this file; presumably
         they are created by the column_filter, filter and add_header_column
         macros (verify against gemini_macros.xml). -->
    <command>
<![CDATA[
        gemini @BINARY@

            #if $region.region_selector == '--reg':
                #if str($region.region) != '':
                    --reg "${region.region}"
                #end if
            #else:
                #if str($region.gene) != '':
                    --gene "${region.gene}"
                #end if
            #end if

            #if $report.report_selector != 'all':
                --columns "${report.columns}"
            #end if

            #if $filter.filter_selector == 'yes':
                --filter "${filter.filter}"
            #end if

            $header
            "${infile}"
            > "${outfile}"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <!-- The GEMINI SQLite database to query. -->
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <!-- Choose between selecting by coordinates and selecting by gene name;
             each branch exposes the single text field read by the command. -->
        <conditional name="region">
            <param name="region_selector" type="select" help="" label="Select by ...?">
                <option value="--reg">genomic coordinates</option>
                <option value="--gene">gene name</option>
            </param>
            <when value="--reg">
                <param name="region" type="text"
                       label="Specify genomic region"
                       help="e.g. chr1:100-200 (--reg)" />
            </when>
            <when value="--gene">
                <param name="gene" type="text"
                       label="Specify gene name"
                       help="e.g. PTPN22 (--gene)" />
            </when>
        </conditional>

        <expand macro="column_filter" />
        <expand macro="filter" />
        <expand macro="add_header_column" />
    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <!-- Placeholder: no functional test is defined yet. -->
    <tests>
        <test/>
    </tests>
    <help>
**What it does**

One often is concerned with variants found solely in a particular gene or genomic region.

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!-- File: gemini_roh.xml
     Galaxy wrapper for "gemini roh": reports runs of homozygosity found in a
     GEMINI database.
     The macros expanded below (requirements, version_command, stdio, citations)
     and the @VERSION@ and @CITATION@ tokens are not defined in this file; they
     are expected to come from the imported gemini_macros.xml. -->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Identifying runs of homozygosity</description>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">roh</token>
    </macros>
    <!-- Cheetah template of the command line. All numeric thresholds are always
         passed; the sample list is passed only when the text field is not blank.
         The field is converted with str() before strip() so the check does not
         depend on the parameter wrapper forwarding string methods. -->
    <command>
<![CDATA[
        gemini @BINARY@
            --min-snps $min_snps
            --min-total-depth $min_total_depth
            --min-gt-depth $min_gt_depth
            --min-size $min_size
            --max-hets $max_hets
            --max-unknowns $max_unknowns
            #if str($samples).strip() != '':
                -s "${samples}"
            #end if
            "${ infile }"
            > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <!-- The GEMINI SQLite database to scan. -->
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <param name="min_snps" type="integer" value="25" size="5"
               label="Minimum number of expected homozygous SNPs"
               help="default: 25 (--min-snps)">
            <validator type="in_range" min="0"/>
        </param>
        <param name="min_total_depth" type="integer" value="20" size="10"
               label="The minimum overall sequencing depth required for a SNP to be considered"
               help="default: 20 (--min-total-depth)">
            <validator type="in_range" min="0"/>
        </param>
        <param name="min_gt_depth" type="integer" value="0" size="10"
               label="The minimum required sequencing depth underlying a given sample's genotype for a SNP to be considered"
               help="default: 0 (--min-gt-depth)">
            <validator type="in_range" min="0"/>
        </param>
        <param name="min_size" type="integer" value="100000" size="10"
               label="Minimum run size in base pairs"
               help="default: 100000 (--min-size)">
            <validator type="in_range" min="1"/>
        </param>
        <param name="max_hets" type="integer" value="1" size="5"
               label="Maximum number of allowed hets in the run"
               help="default: 1 (--max-hets)">
            <validator type="in_range" min="1"/>
        </param>
        <param name="max_unknowns" type="integer" value="3" size="5"
               label="Maximum number of allowed unknowns in the run"
               help="default: 3 (--max-unknowns)">
            <validator type="in_range" min="0"/>
        </param>

        <!-- Optional; when left empty no sample restriction is passed. -->
        <param name="samples" size="30" type="text" value=""
               label="Comma separated list of samples to screen for ROHs"
               help="e.g. S120,S450 (-s)"/>

    </inputs>

    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <!-- Placeholder: no functional test is defined yet. -->
    <tests>
        <test/>
    </tests>
    <help>

**What it does**

===========================================================================
``ROH``: Identifying runs of homozygosity
===========================================================================
Runs of homozygosity are long stretches of homozygous genotypes that reflect
segments shared identically by descent and are a result of consanguinity or
natural selection. Consanguinity elevates the occurrence of rare recessive
diseases (e.g. cystic fibrosis) that represent homozygotes for strongly deleterious
mutations. Hence, the identification of these runs holds medical value.

The 'roh' tool in GEMINI returns runs of homozygosity identified in whole genome data.
The tool basically looks at every homozygous position on the chromosome as a possible
start site for the run and looks for those that could give rise to a potentially long
stretch of homozygous genotypes.

For example, for the genotype run given below, allowing ``1 HET`` genotype (h) and
``2 UKW`` genotypes (u), the possible roh runs (H) would be:


::

    genotype_run = H H H H h H H H H u H H H H H u H H H H H H H h H H H H H h H H H H H
    roh_run1     = H H H H h H H H H u H H H H H u H H H H H H H
    roh_run2     =           H H H H u H H H H H u H H H H H H H h H H H H H
    roh_run3     =                     H H H H H u H H H H H H H h H H H H H
    roh_run4     =                                 H H H H H H H h H H H H H

roh returned for --min-snps = 20 would be:

::

    roh_run1     = H H H H h H H H H u H H H H H u H H H H H H H
    roh_run2     =           H H H H u H H H H H u H H H H H H H h H H H H H


As you can see, the immediate homozygous position right of a break (h or u) would be the possible
start of a new roh run and genotypes to the left of a break are pruned since they cannot
be part of a longer run than we have seen before.


@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!-- File: gemini_stats.xml
     Galaxy wrapper for "gemini stats": runs one selected statistic on a GEMINI
     database and writes the result as a tabular dataset.
     The macros expanded below (requirements, version_command, stdio, citations)
     and the @VERSION@ and @CITATION@ tokens are not defined in this file; they
     are expected to come from the imported gemini_macros.xml. -->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Compute useful variant statistics</description>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">stats</token>
    </macros>
    <!-- The selected option value is inserted verbatim as the command-line
         flag, so every option value below must be spelled exactly like the
         GEMINI flag it stands for. -->
    <command>
<![CDATA[
        gemini @BINARY@
            $stats_type
            "${ infile }"
            > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <!-- The GEMINI SQLite database to analyse. -->
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <!-- The vars-by-sample value is hyphenated so that it agrees with the
             flag shown in its own label; the underscore spelling is not the
             flag name. -->
        <param name="stats_type" type="select" label="Studying ..." help="">
            <option value="--tstv">Compute the transition and transversion ratios for the snps (--tstv)</option>
            <option value="--tstv-coding">Compute the transition/transversion ratios for the snps in the coding regions (--tstv-coding)</option>
            <option value="--tstv-noncoding">Compute the transition/transversion ratios for the snps in the non-coding regions (--tstv-noncoding)</option>
            <option value="--snp-counts">Compute the type and count of the snps (--snp-counts)</option>
            <option value="--sfs">Calculate the site frequency spectrum of the variants (--sfs)</option>
            <option value="--mds">Compute the pair-wise genetic distance between each sample (--mds)</option>
            <option value="--vars-by-sample">Return the total variants per sample, sum of homozygous and heterozygous variants (--vars-by-sample)</option>
        </param>

    </inputs>
    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <!-- Placeholder: no functional test is defined yet. -->
    <tests>
        <test/>
    </tests>
    <help>
**What it does**

The stats tool computes some useful variant statistics for a GEMINI database.

Example of a per-sample summary as printed by GEMINI on the command line
(the ``--summarize`` option used here is not among the statistics selectable above)::

    $ gemini stats --summarize "select * from variants where in_dbsnp=1 and chrom='chr1'" my.db
    sample   total  num_het  num_hom_alt
    M10475   1      1        0
    M128215  1      1        0
    M10478   2      2        0
    M10500   2      1        1



@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
<!-- File: gemini_windower.xml
     Galaxy wrapper for "gemini windower": summarises a per-variant metric over
     windows along the genome.
     The macros expanded below (requirements, version_command, stdio, citations)
     and the @VERSION@ and @CITATION@ tokens are not defined in this file; they
     are expected to come from the imported gemini_macros.xml. -->
<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.0">
    <description>Conducting analyses on genome "windows"</description>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">windower</token>
    </macros>
    <!-- Window size, step size, metric and summary operation are always passed;
         the values of the two select parameters are used verbatim as the
         arguments of -t and -o. -->
    <command>
<![CDATA[
        gemini @BINARY@
            -w $w
            -s $s
            -t $window_analysis
            -o $operation
            "${ infile }"
            > "${ outfile }"
]]>
    </command>
    <expand macro="stdio" />
    <inputs>
        <!-- The GEMINI SQLite database to analyse. -->
        <param name="infile" type="data" format="sqlite" label="GEMINI database" />

        <!-- Option values are unchanged; only the visible labels now spell out
             what the abbreviations stand for. -->
        <param name="window_analysis" type="select" label="The type of window analysis requested?" help="(-t)">
            <option value="nucl_div">Nucleotide diversity (nucl_div)</option>
            <option value="hwe">Hardy-Weinberg equilibrium (hwe)</option>
        </param>

        <param name="operation" type="select" label="The operation that should be applied to the -t values" help="(-o)">
            <option value="mean">mean</option>
            <option value="median">median</option>
            <option value="min">min</option>
            <option value="max">max</option>
            <option value="collapse">collapse</option>
        </param>

        <param name="w" type="integer" value="10000" size="10"
               label="The window size in bp"
               help="(-w)">
            <validator type="in_range" min="0"/>
        </param>

        <param name="s" type="integer" value="1000" size="10"
               label="The step size for the windows in bp"
               help="(-s)">
            <validator type="in_range" min="0"/>
        </param>

    </inputs>

    <outputs>
        <data name="outfile" format="tabular" label="${tool.name} on ${on_string}" />
    </outputs>
    <!-- Placeholder: no functional test is defined yet. -->
    <tests>
        <test/>
    </tests>
    <help>
**What it does**

It computes variation metrics across genomic windows (both fixed and sliding).

@CITATION@
    </help>
    <expand macro="citations"/>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,65 @@ +========================= +Galaxy wrapper for GEMINI +========================= + + +GEMINI: a flexible framework for exploring genome variation + +GEMINI (GEnome MINIng) is designed to be a flexible framework for exploring genetic variation in the context of +the wealth of genome annotations available for the human genome. By placing genetic variants, sample genotypes, +and useful genome annotations into an integrated database framework, GEMINI provides a simple, flexible, yet very +powerful system for exploring genetic variation for disease and population genetics. + +Using the GEMINI framework begins by loading a VCF file into a database. Each variant is automatically +annotated by comparing it to several genome annotations from sources such as ENCODE tracks, UCSC tracks, +OMIM, dbSNP, KEGG, and HPRD. All of this information is stored in a portable SQLite database that allows +one to explore and interpret both coding and non-coding variation using “off-the-shelf” tools or an +enhanced SQL engine. + +Please also see the original `manuscript <http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003153>`_. + + +============ +Installation +============ + +It is recommended to install this wrapper via the `Galaxy Tool Shed`_. + +.. 
_`Galaxy Tool Shed`: https://testtoolshed.g2.bx.psu.edu/view/iuc/gemini + + +======= +History +======= +- 0.9.1: Initial public release + + +==================== +Detailed description +==================== + +View the original GEMINI documentation: http://gemini.readthedocs.org/en/latest/index.html + + +=============================== +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. +
<?xml version="1.0"?>
<!-- File: repository_dependencies.xml
     Declares the one repository this wrapper depends on: the GEMINI data
     manager, pinned to a fixed changeset revision. -->
<repositories description="This requires the GEMINI data manager definition to install all required annotation databases.">
    <repository name="data_manager_gemini_database_downloader"
                owner="iuc"
                toolshed="https://toolshed.g2.bx.psu.edu"
                changeset_revision="fd9e9ac5ecb3" />
</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gemini_databases.loc.sample Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,3 @@ +## GEMINI databases +#Version dbkey Description Path +#08_08_2014 hg19 Database (08-08-2014) /path/to/gemini/data
<!-- File: tool_data_table_conf.xml.sample
     Registers the "gemini_databases" data table. Its rows come from
     tool-data/gemini_databases.loc and have four columns, in this order:
     value, dbkey, name, path. -->
<tables>
    <table comment_char="#" name="gemini_databases">
        <columns>value, dbkey, name, path</columns>
        <file path="tool-data/gemini_databases.loc" />
    </table>
</tables>
<?xml version="1.0"?>
<!-- File: tool_dependencies.xml
     Packages required by the GEMINI wrappers. Each package is resolved from the
     named Tool Shed repository at a pinned changeset revision. Only grabix and
     tabix are flagged prior_installation_required. -->
<tool_dependency>
    <package name="grabix" version="0.1.3">
        <repository name="package_grabix_0_1_3"
                    owner="iuc"
                    toolshed="https://toolshed.g2.bx.psu.edu"
                    changeset_revision="0714d88bd854"
                    prior_installation_required="True" />
    </package>
    <package name="tabix" version="0.2.6">
        <repository name="package_tabix_0_2_6"
                    owner="iuc"
                    toolshed="https://toolshed.g2.bx.psu.edu"
                    changeset_revision="3d6beba7393e"
                    prior_installation_required="True" />
    </package>
    <package name="samtools" version="0.1.19">
        <repository name="package_samtools_0_1_19"
                    owner="iuc"
                    toolshed="https://toolshed.g2.bx.psu.edu"
                    changeset_revision="923adc89c666" />
    </package>
    <package name="bedtools" version="2.19.1">
        <repository name="package_bedtools_2_19"
                    owner="iuc"
                    toolshed="https://toolshed.g2.bx.psu.edu"
                    changeset_revision="fb3a854c7104" />
    </package>
    <package name="gemini" version="0.10.0">
        <repository name="package_gemini_0_10_0"
                    owner="iuc"
                    toolshed="https://toolshed.g2.bx.psu.edu"
                    changeset_revision="42c72725f879" />
    </package>
</tool_dependency>
