# HG changeset patch # User iuc # Date 1409001354 14400 # Node ID 720cbfb4190de12dc510fda2eebd7078d0868a8d Imported from capsule None diff -r 000000000000 -r 720cbfb4190d gemini_annotate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_annotate.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,96 @@ + + adding your own custom annotations + + + + gemini_macros.xml + annotate + + + tabixed.gz; +tabix -p bed tabixed.gz; + + gemini @BINARY@ + -f tabixed.gz + -c $column_name + -a $a.a_selector + #if $a.a_selector == 'extract': + -t $a.column_type + -e $a.column_extracts + -o $a.operation + #end if + + "${ infile }" + > "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +It is inevitable that researchers will want to enhance the gemini framework with their own, custom annotations. gemini provides a sub-command called annotate for exactly this purpose. + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_autosomal_recessive.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_autosomal_recessive.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,70 @@ + + Find variants meeting an autosomal recessive/dominant model + + + + gemini_macros.xml + + + 0: + --min-kindreds $min_kindreds + #end if + + "${ infile }" + > "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Assuming you have defined the familial relationships between samples when loading your VCF into GEMINI, one can leverage a +built-in tool for identifying variants that meet an autosomal recessive or dominant inheritance pattern. +The reported variants will be restricted to those variants having the potential to impact the function of affecting protein coding transcripts. 
+ +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_burden.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_burden.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,82 @@ + + perform sample-wise gene-level burden calculations + + + + gemini_macros.xml + burden + + += 0.0: + --min-aaf $min_aaf + #end if + #if float( str($max_aaf) ) >= 0.0: + --max-aaf $max_aaf + #end if + "${ infile }" + > "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +The burden tool provides a set of utilities to perform burden summaries on a per-gene, per sample basis. +By default, it outputs a table of gene-wise counts of all high impact variants in coding regions for each sample. + +$ gemini burden test.burden.db +gene M10475 M10478 M10500 M128215 +WDR37 2 2 2 2 +CTBP2 0 0 0 1 +DHODH 1 0 0 0 + +@CITATION@ + + + 10.1371/journal.pgen.1001322 + + diff -r 000000000000 -r 720cbfb4190d gemini_comp_hets.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_comp_hets.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,61 @@ + + Identifying potential compound heterozygotes + + + + gemini_macros.xml + comp_hets + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + +**What it does** + +Many recessive disorders are caused by compound heterozygotes. Unlike canonical recessive sites where the same recessive allele is +inherited from both parents at the _same_ site in the gene, compound heterozygotes occur when the individual’s phenotype is caused +by two heterozygous recessive alleles at _different_ sites in a particular gene. + +So basically, we are looking for two (typically loss-of-function (LoF)) heterozygous variants impacting the same gene at different loci. +The complicating factor is that this is _recessive_ and as such, we must also require that the consequential alleles at each heterozygous +site were inherited on different chromosomes (one from each parent). 
As such, in order to use this tool, we require that all variants are phased. +Once this has been done, the comp_hets tool will provide a report of candidate compound heterozygotes for each sample/gene. + + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_db_info.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_db_info.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,36 @@ + + List the gemini database tables and columns + + + + gemini_macros.xml + db_info + + + "${ outfile }" +]]> + + + + + + + + + + + + + +**What it does** + +Because of the sheer number of annotations that are stored in gemini, there are admittedly too many columns to remember by rote. +If you can’t recall the name of a particular column, just use the db_info tool. It will report all of the tables and all of the columns / types in each table. + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_de_novo.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_de_novo.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,49 @@ + + Identifying potential de novo mutations + + + + gemini_macros.xml + de_novo + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + +**What it does** + +Assuming you have defined the familial relationships between samples when loading your VCF into GEMINI, +you can use this tool for identifying de novo (a.k.a. spontaneous) mutations that arise in offspring. + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_interactions.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_interactions.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,68 @@ + + Find genes among variants that are interacting partners + + + + gemini_macros.xml + interactions + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Integrating the knowledge of the known protein-protein interactions would be useful in explaining variation data. 
+That is, a damaging variant in an interacting partner of a potential protein may be just as interesting as the +protein itself. We have used the HPRD_ binary interaction data to build a p-p network graph which can be explored by GEMINI. + +.. _HPRD: http://www.ncbi.nlm.nih.gov/pubmed/18988627 + + +@CITATION@ + + + 10.1093/nar/gkn892 + + diff -r 000000000000 -r 720cbfb4190d gemini_load.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_load.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,80 @@ + + Loading a VCF file into GEMINI + + + + gemini_macros.xml + load + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Before we can use GEMINI to explore genetic variation, we must first load our VCF file into the GEMINI database framework. +We expect you to have first annotated the functional consequence of each variant in your VCF using either VEP or snpEff. + +http://gemini.readthedocs.org/en/latest/content/loading.html + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_lof_sieve.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_lof_sieve.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,39 @@ + + Filter LoF variants by transcript position and type + + + + gemini_macros.xml + lof_sieve + + + "${ outfile }" +]]> + + + + + + + + + + + + + +**What it does** + +Not all candidate LoF variants are created equal. For example, a nonsense (stop gain) variant impacting the first 5% of a polypeptide is far +more likely to be deleterious than one affecting the last 5%. Assuming you’ve annotated your VCF with snpEff v3.0+, the lof_sieve tool +reports the fractional position (e.g. 0.05 for the first 5%) of the mutation in the amino acid sequence. +In addition, it also reports the predicted function of the transcript so that one can segregate candidate +LoF variants that affect protein_coding transcripts from processed RNA, etc. 
+ +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_macros.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,118 @@ + + + + gemini + grabix + tabix + samtools + bedtools + + + + + + gemini --version + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.10.0 + + ------ + +**Citation** + +If you use GEMINI in your research, please cite the following manuscript: + + + + + 10.1371/journal.pcbi.1003153 + + + + diff -r 000000000000 -r 720cbfb4190d gemini_pathways.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_pathways.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,52 @@ + + Map genes and variants to KEGG pathways + + + + gemini_macros.xml + pathways + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + +**What it does** + +Mapping genes to biological pathways is useful in understanding the function/role played by a gene. +Likewise, genes involved in common pathways is helpful in understanding heterogeneous diseases. +We have integrated the KEGG pathway mapping for gene variants, to explain/annotate variation. + +This requires your VCF be annotated with either snpEff/VEP. + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_query.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_query.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,110 @@ + + Querying the GEMINI database + + + + gemini_macros.xml + query + + + 0: + --min-kindreds $min_kindreds + #end if + ##--format FORMAT Format of output (JSON, TPED or default) # we will take default for the time being + ## --sample-delim STRING The delimiter to be used with the --show-samples option. 
+ + #if $q.strip(): + -q "${q}" + #end if + + "${ infile }" + > "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +The real power in the GEMINI framework lies in the fact that all of your genetic variants have been stored in a convenient database in the context of a wealth of genome annotations that facilitate variant interpretation. +The expressive power of SQL allows one to pose intricate questions of one’s variation data. This tool offers you an easy way to query your variants! + +http://gemini.readthedocs.org/en/latest/content/querying.html + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_region.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_region.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,75 @@ + + Extracting variants from specific regions or genes + + + + gemini_macros.xml + region + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +One often is concerned with variants found solely in a particular gene or genomic region. + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_roh.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_roh.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,106 @@ + + Identifying runs of homozygosity + + + + gemini_macros.xml + roh + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +=========================================================================== +``ROH``: Identifying runs of homozygosity +=========================================================================== +Runs of homozygosity are long stretches of homozygous genotypes that reflect +segments shared identically by descent and are a result of consanguinity or +natural selection. Consanguinity elevates the occurrence of rare recessive +diseases (e.g. 
cystic fibrosis) that represent homozygotes for strongly deleterious +mutations. Hence, the identification of these runs holds medical value. + +The 'roh' tool in GEMINI returns runs of homozygosity identified in whole genome data. +The tool basically looks at every homozygous position on the chromosome as a possible +start site for the run and looks for those that could give rise to a potentially long +stretch of homozygous genotypes. + +For e.g. for the given example allowing ``1 HET`` genotype (h) and ``2 UKW`` genotypes (u) +the possible roh runs (H) would be: + + +:: + + genotype_run = H H H H h H H H H u H H H H H u H H H H H H H h H H H H H h H H H H H + roh_run1 = H H H H h H H H H u H H H H H u H H H H H H H + roh_run2 = H H H H u H H H H H u H H H H H H H h H H H H H + roh_run3 = H H H H H u H H H H H H H h H H H H H + roh_run4 = H H H H H H H h H H H H H + +roh returned for --min-snps = 20 would be: + +:: + + roh_run1 = H H H H h H H H H u H H H H H u H H H H H H H + roh_run2 = H H H H u H H H H H u H H H H H H H h H H H H H + + +As you can see, the immediate homozygous position right of a break (h or u) would be the possible +start of a new roh run and genotypes to the left of a break are pruned since they cannot +be part of a longer run than we have seen before. + + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_stats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_stats.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,57 @@ + + Compute useful variant statistics + + + + gemini_macros.xml + stats + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +The stats tool computes some useful variant statistics for a GEMINI database. 
+ + +$ gemini stats --summarize "select * from variants where in_dbsnp=1 and chrom='chr1'" my.db +sample total num_het num_hom_alt +M10475 1 1 0 +M128215 1 1 0 +M10478 2 2 0 +M10500 2 1 1 + + + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d gemini_windower.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_windower.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,64 @@ + + Conducting analyses on genome "windows" + + + + gemini_macros.xml + windower + + + "${ outfile }" +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +It computes variation metrics across genomic windows (both fixed and sliding). + +@CITATION@ + + + diff -r 000000000000 -r 720cbfb4190d readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,65 @@ +========================= +Galaxy wrapper for GEMINI +========================= + + +GEMINI: a flexible framework for exploring genome variation + +GEMINI (GEnome MINIng) is designed to be a flexible framework for exploring genetic variation in the context of +the wealth of genome annotations available for the human genome. By placing genetic variants, sample genotypes, +and useful genome annotations into an integrated database framework, GEMINI provides a simple, flexible, yet very +powerful system for exploring genetic variation for disease and population genetics. + +Using the GEMINI framework begins by loading a VCF file into a database. Each variant is automatically +annotated by comparing it to several genome annotations from sources such as ENCODE tracks, UCSC tracks, +OMIM, dbSNP, KEGG, and HPRD. All of this information is stored in a portable SQLite database that allows +one to explore and interpret both coding and non-coding variation using “off-the-shelf” tools or an +enhanced SQL engine. + +Please also see the original `manuscript <http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003153>`_. 
+ + +============ +Installation +============ + +It is recommended to install this wrapper via the `Galaxy Tool Shed`_. + +.. _`Galaxy Tool Shed`: https://testtoolshed.g2.bx.psu.edu/view/iuc/gemini + + +======= +History +======= +- 0.9.1: Initial public release + + +==================== +Detailed description +==================== + +View the original GEMINI documentation: http://gemini.readthedocs.org/en/latest/index.html + + +=============================== +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. 
+ diff -r 000000000000 -r 720cbfb4190d repository_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,4 @@ + + + + diff -r 000000000000 -r 720cbfb4190d tool-data/gemini_databases.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gemini_databases.loc.sample Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,3 @@ +## GEMINI databases +#Version dbkey Description +#08_08_2014 hg19 Database (08-08-2014) diff -r 000000000000 -r 720cbfb4190d tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,7 @@ + + + value, dbkey, name, path + +
+
+ diff -r 000000000000 -r 720cbfb4190d tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 25 17:15:54 2014 -0400 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + +