comparison gemini_load.xml @ 5:b5b53c27baca draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author iuc
date Fri, 11 Jan 2019 17:50:01 -0500
parents 5c5cdbdc3534
children b2c25142267e
comparison
equal deleted inserted replaced
4:5c5cdbdc3534 5:b5b53c27baca
1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> 1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
2 <description>Loading a VCF file into GEMINI</description> 2 <description>Loading a VCF file into GEMINI</description>
3 <macros> 3 <macros>
4 <import>gemini_macros.xml</import> 4 <import>gemini_macros.xml</import>
5 <token name="@BINARY@">load</token> 5 <token name="@BINARY@">load</token>
6 </macros> 6 </macros>
9 <expand macro="version_command" /> 9 <expand macro="version_command" />
10 <command> 10 <command>
11 <![CDATA[ 11 <![CDATA[
12 @PROVIDE_ANNO_DATA@ 12 @PROVIDE_ANNO_DATA@
13 13
14 ln -s "${ infile }" input.vcf && 14 ln -s '$infile' input.vcf &&
15 bgzip -c input.vcf > input.vcf.gz && 15 bgzip -c input.vcf > input.vcf.gz &&
16 tabix -p vcf input.vcf.gz && 16 tabix -p vcf input.vcf.gz &&
17 17
18 gemini 18 gemini
19 @BINARY@ 19 @BINARY@
20 -v input.vcf.gz 20 -v input.vcf.gz
21 #if str( $annotation_type ) != "None": 21 #if str( $annotation_type ) != "None":
22 -t "$annotation_type" 22 -t $annotation_type
23 #end if 23 #end if
24
25 $has_genotypes
24 26
25 #if $ped: 27 #if $ped:
26 -p $ped 28 -p $ped
27 #end if 29 #end if
28 30
29 $skip_gerp_bp 31 #if 'gerp_bp' not in str($opt_content):
30 $skip_cadd 32 --skip-gerp-bp
31 $skip_gene_tables 33 #end if
32 $no_load_genotypes 34 #if 'cadd' not in str($opt_content):
33 $no_genotypes 35 --skip-cadd
34 $passonly 36 #end if
35 $infostring 37 #if 'gene_tables' not in str($opt_content):
38 --skip-gene-tables
39 #end if
40 #if 'genotypes' not in str($opt_content):
41 --no-load-genotypes
42 #end if
43 #if 'gt_pl' not in str($opt_content):
44 --skip-pls
45 #end if
46 #if 'passonly' in str($opt_content):
47 --passonly
48 #end if
49 #if 'info_string' in str($opt_content):
50 --save-info-string
51 #end if
52
36 --cores \${GALAXY_SLOTS:-4} 53 --cores \${GALAXY_SLOTS:-4}
37 54
38 "${ outfile }" 55 '$outfile'
39 ]]> 56 ]]>
40 </command> 57 </command>
41 <inputs> 58 <inputs>
42 <param name="infile" type="data" format="vcf" label="VCF file to be loaded in the GEMINI database" help="Only build 37 (aka hg19) of the human genome is supported."> 59 <param name="infile" type="data" format="vcf"
60 label="VCF dataset to be loaded in the GEMINI database"
61 help="Only build 37 (aka hg19) of the human genome is supported.">
43 <options> 62 <options>
44 <filter type="add_value" value="hg19" /> 63 <filter type="add_value" value="hg19" />
45 <filter type="add_value" value="Homo_sapiens_nuHg19_mtrCRS" /> 64 <filter type="add_value" value="Homo_sapiens_nuHg19_mtrCRS" />
46 <filter type="add_value" value="hg_g1k_v37" /> 65 <filter type="add_value" value="hg_g1k_v37" />
47 </options> 66 </options>
48 </param> 67 </param>
49 68 <param argument="-t" name="annotation_type" type="select"
50 <param name="annotation_type" type="select" label="The annotations to be used with the input vcf" help="(-t)"> 69 label="The variants in this input are"
51 <option value="None">None (not recommended)</option> 70 help="GEMINI can parse and use annotations generated with either snpEff (both 'EFF'- and 'ANN'-style annotations are supported) or VEP. You can also load unannotated variants, but most of GEMINI's functionality will not be available or not be very useful without annotations.">
52 <option value="snpEff" selected="True">snpEff annotated VCF file</option> 71 <option value="snpEff" selected="True">annotated with snpEff</option>
53 <option value="VEP">VEP annotated VCF file</option> 72 <option value="VEP">annotated with VEP</option>
73 <option value="None">not annotated (not recommended)</option>
54 </param> 74 </param>
55 <param name="ped" type="data" format="tabular" optional="True" label="Sample information file in PED+ format" help="(-p)" /> 75 <param argument="--no-genotypes" name="has_genotypes" type="boolean" falsevalue="--no-genotypes" truevalue="" checked="True"
76 label="This input comes with genotype calls for its samples"
77 help="This is usually the case, but some published datasets, like some 1000G VCFs, are missing genotype information."/>
56 <expand macro="annotation_dir" /> 78 <expand macro="annotation_dir" />
57 79 <param argument="-p" name="ped" type="data" format="tabular" optional="True"
58 <param name="skip_gerp_bp" type="boolean" truevalue="--skip-gerp-bp" falsevalue="" checked="False" 80 label="Sample and family information in PED format"
59 label="Do not load GERP scores at base pair resolution" help="(--skip-gerp-bp)"/> 81 help="The pedigree dataset is optional, but several GEMINI tools require the relationship between samples (i.e., the family structure) and/or the sample phenotype to be defined. The PED format is a simple tabular format (see the tool help below for details). If you choose to not provide sample information now, but later find that you need it for your analysis, you can also add it to an existing GEMINI database by using the GEMINI amend tool." />
60 82 <param name="opt_content" type="select" display="checkboxes" multiple="true" optional="true"
61 <param name="skip_cadd" type="boolean" truevalue="--skip-cadd" falsevalue="" checked="False" 83 label="Load the following optional content into the database"
62 label="Do not load CADD scores" help="(--skip-cadd)"/> 84 help="The preselected defaults should be ok for most use cases. If you are not interested in certain annotations, you can speed up database creation and decrease the resulting database size slightly by not loading them into the database. Note: GERP and CADD scores are optional parts of the annotation source and can only be loaded if available.">
63 85 <option value="gerp_bp" selected="true">GERP scores</option>
64 <param name="skip_gene_tables" type="boolean" truevalue="--skip-gene-tables" falsevalue="" checked="False" 86 <option value="cadd" selected="true">CADD scores</option>
65 label="Do not load gene tables" help="(--skip-gene-tables)"/> 87 <option value="gene_tables" selected="true">Gene tables</option>
66 88 <option value="genotypes" selected="true">Sample genotypes</option>
67 <param name="no_load_genotypes" type="boolean" truevalue="--no-load-genotypes" falsevalue="" checked="False" 89 <option value="gt_pl" selected="true">Genotype likelihoods (sample PLs)</option>
68 label="Genotypes exist in the file, but should not be stored" help="(--no-load-genotypes)"/> 90 <option value="passonly" selected="false">only variants that passed all filters</option>
69 91 <option value="info_string" selected="false">variant INFO field</option>
70 <param name="no_genotypes" type="boolean" truevalue="--no-genotypes" falsevalue="" checked="False" 92 </param>
71 label="There are no genotypes in the file" help="e.g. some 1000G VCFs (--no-genotypes)"/>
72
73 <param name="passonly" type="boolean" truevalue="--passonly" falsevalue="" checked="False"
74 label="Keep only variants that pass all filters" help="e.g. some 1000G VCFs (--passonly)"/>
75
76 <param name="infostring" type="boolean" truevalue="--save-info-string" falsevalue="" checked="False"
77 label="Load INFO string from VCF file" help="(--save-info-string)"/>
78 </inputs> 93 </inputs>
79 <outputs> 94 <outputs>
80 <data name="outfile" format="gemini.sqlite" /> 95 <data name="outfile" format="gemini.sqlite" />
81 </outputs> 96 </outputs>
82 <tests> 97 <tests>
83 <test> 98 <test>
84 <param name="annotation_databases" value="1999-01-01" /> 99 <param name="annotation_databases" value="1999-01-01" />
85 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" /> 100 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
86 <param name="skip_gene_tables" value="False" /> 101 <param name="opt_content" value="gene_tables,genotypes,gt_pl" />
87 <param name="skip_gerp_bp" value="True" />
88 <param name="skip_cadd" value="True" />
89 <param name="no_genotypes" value="False" />
90 <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" /> 102 <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
103 <assert_command>
104 <has_text text="--skip-gerp-bp" />
105 <has_text text="--skip-cadd" />
106 <not_has_text text="--skip-gene-tables" />
107 <not_has_text text="--skip-pls" />
108 <not_has_text text="--no-load-genotypes" />
109 <not_has_text text="--passonly" />
110 <not_has_text text="--save-info-string" />
111 <not_has_text text="--no-genotypes" />
112 </assert_command>
91 </test> 113 </test>
92 <test> 114 <test>
93 <param name="annotation_databases" value="1999-01-01" /> 115 <param name="annotation_databases" value="1999-01-01" />
94 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" /> 116 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
95 <param name="skip_gene_tables" value="False" /> 117 <param name="opt_content" value="gerp_bp,cadd,gene_tables,genotypes,gt_pl" />
96 <param name="skip_gerp_bp" value="False" /> 118 <param name="has_genotypes" value="True" />
97 <param name="skip_cadd" value="False" />
98 <param name="no_genotypes" value="False" />
99 <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" /> 119 <output name="outfile" file="gemini_load_result1.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
100 <assert_stderr> 120 <assert_stderr>
101 <has_text text="CADD scores are not being loaded because the annotation file could not be found." /> 121 <has_text text="CADD scores are not being loaded because the annotation file could not be found." />
102 <has_text text="GERP per bp is not being loaded because the annotation file could not be found." /> 122 <has_text text="GERP per bp is not being loaded because the annotation file could not be found." />
103 </assert_stderr> 123 </assert_stderr>
124 <assert_command>
125 <not_has_text text="--skip-gerp-bp" />
126 <not_has_text text="--skip-cadd" />
127 <not_has_text text="--skip-gene-tables" />
128 <not_has_text text="--skip-pls" />
129 <not_has_text text="--no-load-genotypes" />
130 <not_has_text text="--passonly" />
131 <not_has_text text="--save-info-string" />
132 <not_has_text text="--no-genotypes" />
133 </assert_command>
104 </test> 134 </test>
105 <test> 135 <test>
106 <param name="annotation_databases" value="1999-01-01" /> 136 <param name="annotation_databases" value="1999-01-01" />
107 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" /> 137 <param name="infile" dbkey="hg19" value="gemini_load_input.vcf" ftype="vcf" />
108 <param name="skip_gene_tables" value="True" /> 138 <param name="opt_content" value="genotypes,gt_pl" />
109 <param name="skip_gerp_bp" value="True" /> 139 <param name="has_genotypes" value="False" />
110 <param name="skip_cadd" value="True" />
111 <param name="no_genotypes" value="True" />
112 <output name="outfile" file="gemini_load_result2.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" /> 140 <output name="outfile" file="gemini_load_result2.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
141 <assert_command>
142 <has_text text="--skip-gerp-bp" />
143 <has_text text="--skip-cadd" />
144 <has_text text="--skip-gene-tables" />
145 <not_has_text text="--skip-pls" />
146 <not_has_text text="--no-load-genotypes" />
147 <not_has_text text="--passonly" />
148 <not_has_text text="--save-info-string" />
149 <has_text text="--no-genotypes" />
150 </assert_command>
113 </test> 151 </test>
114 <test> 152 <test>
115 <param name="annotation_databases" value="1999-01-01" /> 153 <param name="annotation_databases" value="1999-01-01" />
116 <param name="infile" dbkey="hg19" value="gemini_amend.vcf" ftype="vcf" /> 154 <param name="infile" dbkey="hg19" value="gemini_amend.vcf" ftype="vcf" />
117 <param name="skip_gene_tables" value="False" /> 155 <param name="opt_content" value="gene_tables,genotypes,gt_pl" />
118 <param name="skip_gerp_bp" value="True" /> 156 <param name="has_genotypes" value="True" />
119 <param name="skip_cadd" value="True" />
120 <param name="no_genotypes" value="False" />
121 <param name="ped" value="gemini_amend.ped" ftype="tabular" /> 157 <param name="ped" value="gemini_amend.ped" ftype="tabular" />
122 <output name="outfile" file="gemini_auto_rec_input.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" /> 158 <output name="outfile" file="gemini_auto_rec_input.db" ftype="gemini.sqlite" compare="sim_size" delta="1000" />
123 </test> 159 </test>
124 </tests> 160 </tests>
125 <help><![CDATA[ 161 <help><![CDATA[
126 **What it does** 162 **What it does**
127 163
128 Before we can use GEMINI to explore genetic variation, we must first load our VCF file into the GEMINI database framework. 164 Before we can use GEMINI to explore genetic variation, we must first load the
129 We expect you to have first annotated the functional consequence of each variant in your VCF using either VEP or snpEff. 165 variant information stored in VCF format into the GEMINI database framework.
166
167 To fully leverage the power of GEMINI, you should first **annotate your VCF
168 dataset** with the functional consequences of the variants using either *VEP*
169 or *snpEff*.
170
171 .. class:: warningmark
172
173 To avoid problems during annotation, but also during later variant queries with
174 GEMINI tools, it is good practice to preprocess your VCF dataset even before
175 annotation to split records with multiple alternate alleles, and to left-align
176 and trim indels. The authors of GEMINI recommend the tool *vt* for this purpose;
177 an equivalently good option is *bcftools norm*, and Galaxy wrappers exist for
178 both tools.
179
180 In addition, you are encouraged to provide **family and sample phenotype
181 information in PED format**, if you are planning to use GEMINI for any kind of
182 variant identification based on inheritance patterns.
183
184 A PED file is simply a tabular text file (columns can be separated by either
185 spaces or TABs, but not a mixture of the two within the same file) with the
186 header::
187
188 #family_id name paternal_id maternal_id sex phenotype
189
190 and optional additional columns. The actual column names in the header are not
191 fixed, but there have to be at least six columns that are interpreted as
192 detailed next.
193
194 Subsequent lines describe one sample from the VCF input dataset each, where
195
196 - *family_id* is an alphanumeric identifier of a family
197
198 If the family to which the sample belongs is unknown, a placeholder of
199 ``0``, ``-9`` or ``None`` can be used to indicate this fact.
200
201 - *name* is the identifier of the sample described by the line
202
203 - *paternal_id* is the identifier of the sample's father
204
205 If the sample's father is not available in the VCF, a placeholder of
206 ``0``, ``-9`` or ``None`` can be used to indicate this fact.
207
208 - *maternal_id* is the identifier of the sample's mother
209
210 If the sample's mother is not available in the VCF, a placeholder of
211 ``0``, ``-9`` or ``None`` can be used to indicate this fact.
212
213 - *sex* is a numeric code for the sample's sex
214 (1=male, 2=female, any other number=unknown sex)
215
216 - *phenotype* is a numeric code for the sample's phenotypic affection status
217 (1=unaffected, 2=affected)
218
219 If the sample's phenotype is unknown, a placeholder of ``0`` or ``-9`` can be
220 used to indicate this fact.
221
222 - Optional additional columns can have any column name you like, and accept any
223 per-sample value. The data from such extra columns will be added to the
224 samples table of the GEMINI database so you can use them in queries. Extra
225 columns can be used, *e.g.*, to describe additional phenotypes.
226
227 - If no extra columns are present in a PED file, then the header line is
228 optional.
229
230 Here are two examples of valid PED file contents::
231
232 #family_id name paternal_id maternal_id sex phenotype hair_color
233 1 M10475 -9 -9 1 1 brown
234 1 M10478 M10475 M10500 2 2 brown
235 1 M10500 -9 -9 2 2 black
236 1 M128215 M10475 M10500 1 1 blue
237
238 This describes a family with two kids, in which mother and daughter, but not
239 father and son, are phenotypically affected. The file also stores the hair color
240 of all family members.
241
242 ::
243
244 #family_id name paternal_id maternal_id sex phenotype
245 0 M10475 0 0 -1 1
246 0 M10478 0 0 -1 2
247 0 M10500 0 0 -1 2
248 0 M128215 0 0 -1 1
249
250 This describes the same samples as above, but without recording family
251 structure, sex or additional traits. Only the sample phenotypes are provided.
252 In this case (no extra columns), the header line could be omitted.
130 253
131 ]]></help> 254 ]]></help>
132 <expand macro="citations"/> 255 <expand macro="citations"/>
133 </tool> 256 </tool>