comparison beagle.xml @ 0:553b27c30eb8 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
author iuc
date Sat, 03 Jul 2021 23:33:37 +0000
parents
children f75bf16ac901
comparison
equal deleted inserted replaced
-1:000000000000 0:553b27c30eb8
<tool id="beagle" name="Beagle" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
    <description>phasing genotypes and imputing ungenotyped markers</description>
    <!-- The version tokens above and the macros expanded below are presumably defined in macros.xml (not shown here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <!--
        Command template (Cheetah).
        Builds one beagle invocation from the tool parameters and then uncompresses the
        resulting out.vcf.gz so that the outputs section can collect out.vcf.
        A non-zero exit code marks the job as failed.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Prefix of the files written by Beagle; the outputs section collects out.vcf and out.log.
        #set out_prefix='out'
        ## A bref3 reference is linked under a name ending in .bref3, because Beagle recognises
        ## the format by file name. The guard is the same one used for the ref argument below,
        ## so nothing is linked when no reference panel was supplied.
        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
            ln -s '${optional_inputs.ref}' ref.bref3 &&
        #end if
        beagle
            gt='${gt}'
            #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
                ref=ref.bref3
            #else if $optional_inputs.ref
                ref='${optional_inputs.ref}'
            #end if
            #if $optional_inputs.map
                map='${optional_inputs.map}'
            #end if
            #if $chrom
                chrom='${chrom}'
            #end if
            #if $optional_inputs.excludesamples
                excludesamples='${optional_inputs.excludesamples}'
            #end if
            #if $optional_inputs.excludemarkers
                excludemarkers='${optional_inputs.excludemarkers}'
            #end if
            ne=$ne
            window=$window
            overlap=$overlap
            ## Optional numbers are compared with the empty string instead of being tested for
            ## truthiness: a value of 0 is accepted by the parameter definitions and must not be dropped.
            #if str($seed) != ''
                seed=$seed
            #end if
            #if str($err) != ''
                err=$err
            #end if
            burnin=$phasing_parameters.burnin
            iterations=$phasing_parameters.iterations
            phase-states=$phasing_parameters.phase_states
            impute=$imputation_parameters.impute
            imp-states=$imputation_parameters.imp_states
            imp-segment=$imputation_parameters.imp_segment
            imp-step=$imputation_parameters.imp_step
            cluster=$imputation_parameters.cluster
            ap=$imputation_parameters.ap
            gp=$imputation_parameters.gp
            out=$out_prefix
            nthreads=\${GALAXY_SLOTS:-1}
        && gunzip 'out.vcf.gz'
    ]]></command>
    <inputs>
        <!-- Study genotypes: the only mandatory input. -->
        <param argument="gt" type="data" format="vcf" label="VCF file"
            help="It specifies a VCF file containing genotypes for the study samples.
                  Each VCF record must contain a GT (genotype) format field"/>
        <!-- Optional datasets; each one is passed to Beagle only when it is set. -->
        <section name="optional_inputs" title="Optional input files" expanded="true">
            <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes"
                help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the
                      phased allele separator '|' must be used"/>
            <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units"
                help="Beagle uses linear interpolation to estimate genetic positions between map positions. If
                      no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/>
            <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude"
                help="It specifies a file containing samples (one sample identifier per line) to be excluded
                      from the analysis" />
            <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude"
                help="It specifies a file containing markers (one marker per line) to be excluded from the
                      analysis. Each line of the file can be either an identifier from a VCF record’s ID field
                      or a genomic coordinate in the format: CHROM:POS" />
        </section>
        <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval"
            help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be
                  specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively">
            <!-- Characters outside the whitelist are removed before the value reaches the command line. -->
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                    <add value=":" />
                    <add value="-" />
                </valid>
            </sanitizer>
            <!-- Anchored on both ends so that a value containing other characters is rejected
                 instead of being silently rewritten by the sanitizer into a different interval. -->
            <validator type="regex" message="Only letters, digits, ':' and '-' are allowed">^[0-9a-zA-Z:-]+$</validator>
        </param>
        <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size"
            help="The default value is suitable for a large, outbred population. An appropriate effective
                  population size must be specified if you are imputing ungenotyped markers in a small
                  or inbred population"/>
        <param argument="window" type="float" min="0" value="40.0" label="Window length in cM"
            help="The window parameter must be at least 1.1 times as large as the overlap parameter.
                  The window parameter controls the amount of memory required for the analysis"/>
        <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM"
            help="It specifies the cM length of overlap between adjacent sliding windows"/>
        <param argument="err" type="float" min="0" max="1" optional="true"
            label="Allele mismatch probability for the hidden Markov model"
            help="If no err parameter is specified, the err parameter will be set equal to θ/(2(θ + H))
                  where θ = 1/(0.5 + ln H) and H is the number of haplotypes"/>
        <param argument="seed" type="integer" value="" optional="true" label="Random seed"
            help="A random seed is a number used to initialize a pseudorandom number generator" />
        <!-- Wrapper-only switch: controls whether the log_file output is created. -->
        <param name="output_log" type="boolean" checked="false" label="Output a log file"/>
        <section name="phasing_parameters" title="Phasing parameters">
            <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations"
                help="It is the maximum number of burnin iterations used to estimate an initial haplotype
                      frequency model for inferring genotype phase" />
            <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations"
                help="It is the number of iterations used to estimate genotype phase. Increasing this
                      parameter will trade increased computation time for increased phasing accuracy" />
            <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing"
                help="It is the number of model states used to estimate genotype phase" />
        </section>
        <section name="imputation_parameters" title="Imputation parameters">
            <param argument="impute" type="boolean" truevalue="true" falsevalue="false"
                checked="true" label="Impute ungenotyped markers"
                help="It specifies whether markers that are present in the reference panel but absent in
                      the target will be imputed. This option has no effect if no reference panel is specified"/>
            <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation"
                help="It is the number of model states used to impute ungenotyped markers" />
            <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments"
                help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state
                      space for a target haplotype." />
            <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments"
                help="It is the length in cM of the step used for detecting short IBS segments" />
            <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster"
                help="It specifies the maximum cM distance between individual markers that are combined
                      into an aggregate marker when imputing ungenotyped markers" />
            <param argument="ap" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior allele probabilities"
                help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output
                      VCF file when imputing ungenotyped markers" />
            <param argument="gp" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior genotype probabilities"
                help="It specifies whether a GP (genotype probability) format field will be included in the output
                      VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele
                      probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype
                      with highest genotype probability may occasionally be different than the genotype obtained by
                      taking the allele with highest probability on each haplotype, which is the genotype reported
                      in the GT format field" />
        </section>
    </inputs>
    <outputs>
        <!-- out.vcf is what remains after the command gunzips Beagle's out.vcf.gz. -->
        <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/>
        <!-- The log is only added to the history when the output_log switch is enabled. -->
        <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file">
            <filter>output_log</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test default values -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="chrom" value="22:100-"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="err" value="0.02"/>
            <param name="seed" value="1"/>
            <param name="output_log" value="true"/>
            <section name="phasing_parameters">
                <param name="burnin" value="3"/>
                <param name="iterations" value="12"/>
                <param name="phase_states" value="280"/>
            </section>
            <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/>
            <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/>
        </test>
        <!-- Test plink file-->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="30.0"/>
            <param name="overlap" value="3.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="map" value="plink.map"/>
            </section>
            <section name="phasing_parameters">
                <param name="burnin" value="4"/>
                <param name="iterations" value="10"/>
                <param name="phase_states" value="250"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="181272"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1586" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref VCF input -->
        <test expect_num_outputs="2">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.vcf.gz"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1801" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref bref3 input -->
        <test expect_num_outputs="1">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.bref3"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

Beagle is a program for phasing and imputing missing genotypes. Sporadic missing
genotypes are imputed during phasing. If a reference panel of phased genotypes is specified
with the ref argument, ungenotyped markers that are present in the reference panel can also
be imputed.

Beagle version 5.2 provides significantly faster genotype phasing than version 5.1.
Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but
Beagle versions 4.0 and 4.1 have this capability.

----

.. class:: infomark

**HapMap genetic maps**

HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available
at `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_

----

.. class:: infomark

**Input files**

Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_
(VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal
X-chromosome genotypes must be in separate input files and analysed separately unless male
haploid genotypes are coded as homozygous diploid genotypes.

In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window,
Beagle will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/").
Beagle assumes that a VCF file that has a name ending in ".gz" is compressed with gzip or bgzip,
and that a reference VCF file that has a name ending in ".bref3" is compressed with bref version 3.

----

.. class:: infomark

**Output files**

There are up to two output files. The log file, which is produced only when the
"Output a log file" option is enabled, gives a summary of the analysis that includes the
Beagle version, the command line arguments, and compute time.

The VCF file contains phased, non-missing genotypes for all non-reference samples.
Beagle writes this file bgzip-compressed (vcf.gz); this tool uncompresses it with the
unix gunzip utility, so the dataset added to the history is a plain VCF file.

If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO
field will contain:

::

    - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose.
    - An "AF" subfield with the estimated alternate allele frequencies in the target samples.
    - The "IMP" flag if the marker is imputed.

    ]]></help>
    <expand macro="citations" />
</tool>