0
|
1 <tool id="gatk_variant_select" name="Select Variants" version="0.0.2">
|
|
2 <description>from VCF files</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="1.4">gatk</requirement>
|
|
5 </requirements>
|
|
6 <macros>
|
|
7 <import>gatk_macros.xml</import>
|
|
8 </macros>
|
|
<command interpreter="python">gatk_wrapper.py
    ## Cheetah template that assembles the argument list for gatk_wrapper.py.
    ## hexlify is imported so the free-text expressions further down can be hex-encoded.
    #from binascii import hexlify
    --max_jvm_heap_fraction "1"
    --stdout "${output_log}"
    -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
    ## Base GATK invocation. \$ keeps the shell variables out of Cheetah substitution.
    -p 'java
        -jar "\$JAVA_JAR_PATH/GenomeAnalysisTK.jar"
        -T "SelectVariants"
        --num_threads \${GALAXY_SLOTS:-4}
        -et "NO_ET" ##ET no phone home
        -o "${output_vcf}"

        #if $reference_source.reference_source_selector != "history":
            -R "${reference_source.ref_file.fields.path}"
        #end if
    '
    ## NOTE(review): no -R is emitted here for a history reference; presumably the
    ## included standard options supply it. Confirm in gatk_macros.xml.

    ## Basic selection options: comparison tracks, sample names, filtered loci.
    -p '
        #if $input_concordance:
            --concordance "${input_concordance}"
        #end if
        #if $input_discordance:
            --discordance "${input_discordance}"
        #end if

        #for $exclude_sample_name in $exclude_sample_name_repeat:
            --exclude_sample_name "${exclude_sample_name.exclude_sample_name}"
        #end for

        ${exclude_filtered}

        #for $sample_name in $sample_name_repeat:
            --sample_name "${sample_name.sample_name}"
        #end for

    '

    ## Each JEXL expression is hex-encoded before being passed with the wrapper -o option.
    #for $select_expressions in $select_expressions_repeat:
        #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) )
        -o '${ hexlify( $select_expression ) }'
    #end for

    ##start tool specific options
    #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced':
        -p '
            #for $exclude_sample_file in $analysis_param_type.exclude_sample_file_repeat:
                --exclude_sample_file "${exclude_sample_file.exclude_sample_file}"
            #end for

            ## The placeholder must use the loop variable sample_file declared on the #for line.
            #for $sample_file in $analysis_param_type.sample_file_repeat:
                --sample_file "${sample_file.sample_file}"
            #end for

            #if $analysis_param_type.input_keep_ids:
                --keepIDs "${analysis_param_type.input_keep_ids}"
            #end if

            ${analysis_param_type.keep_original_AC}

            ${analysis_param_type.mendelian_violation}

            --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}"

            --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}"

            --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}"

            #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction':
                --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}"
            #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number':
                --select_random_number "${analysis_param_type.select_random_type.select_random_number}"
            #end if

            ## The multi-select value is a comma separated string; emit one flag per chosen type.
            #if $analysis_param_type.select_type_to_include:
                #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ):
                    --selectTypeToInclude "${type_to_include}"
                #end for
            #end if

            ${analysis_param_type.exclude_non_variants}
        '

        ## Sample regular expressions are hex-encoded the same way as the JEXL expressions.
        #for $sample_expressions in $analysis_param_type.sample_expressions_repeat:
            #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) )
            -o '${ hexlify( $sample_expression ) }'
        #end for

    #end if
    ##end tool specific options

    #include source=$standard_gatk_options#


</command>
|
|
<inputs>
    <!-- Reference genome source. Both branches also declare the variant file to subset. -->
    <conditional name="reference_source">
        <expand macro="reference_source_selector_param" />
        <when value="cached">
            <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &lt;variant&gt;" />
            <param name="ref_file" type="select" label="Using reference genome" help="-R,--reference_sequence &lt;reference_sequence&gt;">
                <options from_data_table="gatk_picard_indexes">
                    <filter type="data_meta" key="dbkey" ref="input_variant" column="dbkey"/>
                </options>
                <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
            </param>
        </when>
        <when value="history"> <!-- FIX ME!!!! -->
            <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &lt;variant&gt;" />
            <param name="ref_file" type="data" format="fasta" label="Using reference file" help="-R,--reference_sequence &lt;reference_sequence&gt;" />
        </when>
    </conditional>

    <!-- JEXL selection criteria. Single quotes are stripped because the command template wraps the value in them. -->
    <repeat name="select_expressions_repeat" title="Criteria to use when selecting the data" help="-select,--select_expressions &lt;select_expressions&gt;">
        <param name="select_expressions" type="text" label="JEXL expression">
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="'"/>
                </valid>
                <mapping initial="none"/>
            </sanitizer>
        </param>
    </repeat>

    <!-- Optional comparison tracks. -->
    <param name="input_concordance" type="data" format="vcf" label="Output variants that were also called in this comparison track" optional="True" help="-conc,--concordance &lt;concordance&gt;"/>
    <param name="input_discordance" type="data" format="vcf" label="Output variants that were not called in this comparison track" optional="True" help="-disc,--discordance &lt;discordance&gt;"/>

    <!-- Sample inclusion and exclusion by name. -->
    <repeat name="sample_name_repeat" title="Include Samples by name" help="-sn,--sample_name &lt;sample_name&gt;">
        <param name="sample_name" type="text" label="Include genotypes from this sample"/>
    </repeat>

    <repeat name="exclude_sample_name_repeat" title="Exclude Samples by name" help="-xl_sn,--exclude_sample_name &lt;exclude_sample_name&gt;">
        <param name="exclude_sample_name" type="text" label="Exclude genotypes from this sample"/>
    </repeat>

    <param name="exclude_filtered" type="boolean" truevalue="--excludeFiltered" falsevalue="" label="Don't include filtered loci in the analysis" help="-ef,--excludeFiltered" />

    <expand macro="gatk_param_type_conditional" />


    <!-- Advanced, tool specific options; the command template reads them under analysis_param_type. -->
    <expand macro="analysis_type_conditional">

        <repeat name="exclude_sample_file_repeat" title="Exclude Samples by file" help="-xl_sf,--exclude_sample_file &lt;exclude_sample_file&gt;">
            <param name="exclude_sample_file" type="data" format="txt" label="File containing a list of samples (one per line) to exclude"/>
        </repeat>

        <repeat name="sample_file_repeat" title="Samples by file" help="-sf,--sample_file &lt;sample_file&gt;">
            <param name="sample_file" type="data" format="txt" label="File containing a list of samples (one per line) to include" />
        </repeat>

        <!-- Uses the txt datatype extension, consistent with the sample list files above. -->
        <param name="input_keep_ids" type="data" format="txt" label="Only emit sites whose ID is found in this file" optional="True" help="-IDs,--keepIDs &lt;keepIDs&gt;"/>

        <param name="keep_original_AC" type="boolean" truevalue="--keepOriginalAC" falsevalue="" label="Don't update the AC, AF, or AN values in the INFO field after selecting" help="-keepOriginalAC,--keepOriginalAC" />

        <param name="mendelian_violation" type="boolean" truevalue="--mendelianViolation" falsevalue="" label="output mendelian violation sites only" help="-mv,--mendelianViolation" />

        <param name="mendelian_violation_qual_threshold" type="float" label="Minimum genotype QUAL score for each trio member required to accept a site as a mendelian violation" value="0" help="-mvq,--mendelianViolationQualThreshold &lt;mendelianViolationQualThreshold&gt;" />

        <param name="remove_fraction_genotypes" type="float" label="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall" value="0" min="0" max="1" help="-fractionGenotypes,--remove_fraction_genotypes &lt;remove_fraction_genotypes&gt;" />

        <param name="restrict_alleles_to" type="select" label="Select only variants of a particular allelicity" help="-restrictAllelesTo,--restrictAllelesTo &lt;restrictAllelesTo&gt;">
            <option value="ALL" selected="True">ALL</option>
            <option value="MULTIALLELIC">MULTIALLELIC</option>
            <option value="BIALLELIC">BIALLELIC</option>
        </param>

        <!-- Sample regular expressions. Single quotes are stripped for the same reason as the JEXL expressions. -->
        <repeat name="sample_expressions_repeat" title="Regular expression to select many samples from the ROD tracks provided" help="-se,--sample_expressions &lt;sample_expressions&gt;">
            <param name="sample_expressions" type="text" label="Regular expression">
                <sanitizer>
                    <valid initial="string.printable">
                        <remove value="'"/>
                    </valid>
                    <mapping initial="none"/>
                </sanitizer>
            </param>
        </repeat>

        <!-- Random subsetting: either a fraction or an absolute count of variants. -->
        <conditional name="select_random_type">
            <param name="select_random_type_selector" type="select" label="Select a random subset of variants">
                <option value="select_all" selected="True">Use all variants</option>
                <option value="select_random_fraction">Select random fraction</option>
                <option value="select_random_number">Select random number</option>
            </param>
            <when value="select_all">
                <!-- Do nothing here -->
            </when>
            <when value="select_random_fraction">
                <param name="select_random_fraction" type="float" value="0" label="Fraction" min="0" max="1" help="-fraction,--select_random_fraction &lt;select_random_fraction&gt;"/>
            </when>
            <when value="select_random_number">
                <param name="select_random_number" type="integer" value="0" label="Count" help="-number,--select_random_number &lt;select_random_number&gt;" />
            </when>
        </conditional>

        <param name="exclude_non_variants" type="boolean" truevalue="--excludeNonVariants" falsevalue="" label="Don't include loci found to be non-variant after the subsetting procedure" help="-env,--excludeNonVariants" />

        <!-- Multi-select; the command template splits the chosen values on commas. -->
        <param name="select_type_to_include" type="select" label="Select only a certain type of variants from the input file" multiple="True" display="checkboxes" help="-selectType,--selectTypeToInclude &lt;selectTypeToInclude&gt;">
            <option value="INDEL">INDEL</option>
            <option value="SNP">SNP</option>
            <option value="MIXED">MIXED</option>
            <option value="MNP">MNP</option>
            <option value="SYMBOLIC">SYMBOLIC</option>
            <option value="NO_VARIATION">NO_VARIATION</option>
        </param>
    </expand>

</inputs>
|
|
<outputs>
    <!-- VCF passed to the GATK -o argument in the command template. -->
    <data name="output_vcf"
          format="vcf"
          label="${tool.name} on ${on_string} (Variant File)" />
    <!-- Text log passed to the wrapper stdout option in the command template. -->
    <data name="output_log"
          format="txt"
          label="${tool.name} on ${on_string} (log)" />
</outputs>
|
|
<tests>
    <!-- Single test: history reference, basic parameter sets, no optional inputs or repeats. -->
    <test>
        <!-- Reference and variant input -->
        <param name="reference_source_selector" value="history" />
        <param name="ref_file" value="phiX.fasta" ftype="fasta" />
        <param name="input_variant" value="gatk/gatk_variant_annotator/gatk_variant_annotator_out_1.vcf" ftype="vcf" />

        <!-- Selection options left empty -->
        <param name="select_expressions_repeat" value="0" />
        <param name="input_concordance" />
        <param name="input_discordance" />
        <param name="exclude_sample_name_repeat" value="0" />
        <param name="exclude_filtered" />
        <param name="sample_name_repeat" value="0" />

        <!-- Parameter set selectors -->
        <param name="gatk_param_type_selector" value="basic" />
        <param name="analysis_param_type_selector" value="basic" />

        <!-- Expected outputs -->
        <output name="output_vcf"
                file="gatk/gatk_variant_select/gatk_variant_select_out_1.vcf"
                lines_diff="4" />
        <output name="output_log"
                file="gatk/gatk_variant_select/gatk_variant_select_out_1.log.contains"
                compare="contains" />
    </test>
</tests>
|
|
235 <help>
|
|
236 **What it does**
|
|
237
|
|
Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements; displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, e.g. "DP &gt; 1000" (depth of coverage greater than 1000x) or "AF &lt; 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions). One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
|
|
239
|
|
For more information on using the SelectVariants module, see this `tool specific page &lt;http://www.broadinstitute.org/gsa/wiki/index.php/SelectVariants&gt;`_.
|
|
241
|
|
To learn about best practices for variant detection using GATK, see this `overview &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3&gt;`_.
|
|
243
|
|
If you encounter errors, please view the `GATK FAQ &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions&gt;`_.
|
|
245
|
|
246 ------
|
|
247
|
|
248 **Inputs**
|
|
249
|
|
250 GenomeAnalysisTK: SelectVariants accepts a VCF input file.
|
|
251
|
|
252
|
|
253 **Outputs**
|
|
254
|
|
255 The output is in VCF format.
|
|
256
|
|
257
|
|
Go `here &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK&gt;`_ for details on GATK file formats.
|
|
259
|
|
260 -------
|
|
261
|
|
262 **Settings**::
|
|
263
|
|
264
|
|
265 out VCFWriter stdout File to which variants should be written
|
|
266 variant RodBinding[VariantContext] NA Input VCF file
|
|
267 concordance RodBinding[VariantContext] none Output variants that were also called in this comparison track
|
|
268 discordance RodBinding[VariantContext] none Output variants that were not called in this comparison track
|
|
269 exclude_sample_file Set[File] [] File containing a list of samples (one per line) to exclude. Can be specified multiple times
|
|
270 exclude_sample_name Set[String] [] Exclude genotypes from this sample. Can be specified multiple times
|
|
271 excludeFiltered boolean false Don't include filtered loci in the analysis
|
|
272 excludeNonVariants boolean false Don't include loci found to be non-variant after the subsetting procedure
|
|
273 keepIDs File NA Only emit sites whose ID is found in this file (one ID per line)
|
|
274 keepOriginalAC boolean false Don't update the AC, AF, or AN values in the INFO field after selecting
|
|
275 mendelianViolation Boolean false output mendelian violation sites only
|
|
276 mvq double 0.0 Minimum genotype QUAL score for each trio member required to accept a site as a violation
|
|
277 remove_fraction_genotypes double 0.0 Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall
|
|
278 restrictAllelesTo NumberAlleleRestriction ALL Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC
|
|
279 sample_expressions Set[String] NA Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times
|
|
280 sample_file Set[File] NA File containing a list of samples (one per line) to include. Can be specified multiple times
|
|
281 sample_name Set[String] [] Include genotypes from this sample. Can be specified multiple times
|
|
282 select_expressions ArrayList[String] [] One or more criteria to use when selecting the data
|
|
283 select_random_fraction double 0.0 Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track
|
|
284 select_random_number int 0 Selects a number of variants at random from the variant track
|
|
285 selectTypeToInclude List[Type] [] Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times
|
|
286
|
|
287 @CITATION_SECTION@
|
|
288 </help>
|
|
289 </tool>
|