Mercurial > repos > iuc > beagle
comparison beagle.xml @ 0:553b27c30eb8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
author | iuc |
---|---|
date | Sat, 03 Jul 2021 23:33:37 +0000 |
parents | |
children | f75bf16ac901 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:553b27c30eb8 |
---|---|
1 <tool id='beagle' name='Beagle' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='20.01'> | |
2 <description>phasing genotypes and imputing ungenotyped markers</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro='edam_ontology' /> | |
7 <expand macro='requirements' /> | |
8 <command detect_errors='exit_code'><![CDATA[ | |
9 #set out_prefix='out' | |
10 #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3' | |
11 ln -s '${optional_inputs.ref}' ref.bref3 && | |
12 #end if | |
13 beagle | |
14 gt='${gt}' | |
15 #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3' | |
16 ref=ref.bref3 | |
17 #else if $optional_inputs.ref | |
18 ref='${optional_inputs.ref}' | |
19 #end if | |
20 #if $optional_inputs.map | |
21 map='${optional_inputs.map}' | |
22 #end if | |
23 #if $chrom | |
24 chrom='${chrom}' | |
25 #end if | |
26 #if $optional_inputs.excludesamples | |
27 excludesamples='${optional_inputs.excludesamples}' | |
28 #end if | |
29 #if $optional_inputs.excludemarkers | |
30 excludemarkers='${optional_inputs.excludemarkers}' | |
31 #end if | |
32 ne=$ne | |
33 window=$window | |
34 overlap=$overlap | |
35 #if $seed | |
36 seed=$seed | |
37 #end if | |
38 #if $err | |
39 err=$err | |
40 #end if | |
41 burnin=$phasing_parameters.burnin | |
42 iterations=$phasing_parameters.iterations | |
43 phase-states=$phasing_parameters.phase_states | |
44 impute=$imputation_parameters.impute | |
45 imp-states=$imputation_parameters.imp_states | |
46 imp-segment=$imputation_parameters.imp_segment | |
47 imp-step=$imputation_parameters.imp_step | |
48 cluster=$imputation_parameters.cluster | |
49 ap=$imputation_parameters.ap | |
50 gp=$imputation_parameters.gp | |
51 out=$out_prefix | |
52 nthreads=\${GALAXY_SLOTS:-1} | |
53 && gunzip '${out_prefix}.vcf.gz' | |
54 ]]> </command> | |
55 <inputs> | |
56 <param argument="gt" type="data" format="vcf" label="VCF file" | |
57 help="It specifies a VCF file containing genotypes for the study samples. | |
58 Each VCF record must contain a GT (genotype) format field"/> | |
59 <section name="optional_inputs" title="Optional input files" expanded="true"> | |
60 <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes" | |
61 help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the | |
62 phased allele separator '|' must be used"/> | |
63 <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units" | |
64 help="Beagle uses linear interpolation to estimate genetic positions between map positions. If | |
65 no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/> | |
66 <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude" | |
67 help="It specifies a file containing samples (one sample identifier per line) to be excluded | |
68 from the analysis" /> | |
69 <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude" | |
70 help="It specifies a file containing markers (one marker per line) to be excluded from the | |
71 analysis. Each line of the file can be either an identifier from a VCF record's ID field | |
72 or a genomic coordinate in the format: CHROM:POS" /> | |
73 </section> | |
74 <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval" | |
75 help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be | |
76 specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively"> | |
77 <sanitizer invalid_char=""> | |
78 <valid initial="string.letters,string.digits"> | |
79 <add value=":" /> | |
80 <add value="-" /> | |
81 </valid> | |
82 </sanitizer> | |
83 <validator type="regex">[0-9a-zA-Z:-]+</validator> | |
84 </param> | |
85 <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size" | |
86 help="The default value is suitable for a large, outbred population. You should specify an | |
87 appropriate effective population size if you are imputing ungenotyped markers in a small | |
88 or inbred population"/> | |
89 <param argument="window" type="float" min="0" value="40.0" label="Window length in cM" | |
90 help="The window parameter must be at least 1.1 times as large as the overlap parameter. | |
91 The window parameter controls the amount of memory required for the analysis"/> | |
92 <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM" | |
93 help="It specifies the cM length of overlap between adjacent sliding windows"/> | |
94 <param argument="err" type="float" min="0" max="1" optional="true" | |
95 label="Allele mismatch probability for the hidden Markov model" | |
96 help="If no err parameter is specified, the err parameter will be set equal to θ/(2(θ + H)) | |
97 where θ = 1/(0.5 + ln H) and H is the number of haplotypes"/> | |
98 <param argument="seed" type="integer" value="" optional="true" label="Random seed" | |
99 help="A random seed is a number used to initialize a pseudorandom number generator" /> | |
100 <param name="output_log" type="boolean" checked="false" label="Output a log file"/> | |
101 <section name="phasing_parameters" title="Phasing parameters"> | |
102 <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations" | |
103 help="It is the maximum number of burnin iterations used to estimate an initial haplotype | |
104 frequency model for inferring genotype phase" /> | |
105 <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations" | |
106 help="It is the number of iterations used to estimate genotype phase. Increasing this | |
107 parameter will trade increased computation time for increased phasing accuracy" /> | |
108 <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing" | |
109 help="It is the number of model states used to estimate genotype phase" /> | |
110 </section> | |
111 <section name="imputation_parameters" title="Imputation parameters"> | |
112 <param argument="impute" type="boolean" truevalue="true" falsevalue="false" | |
113 checked="true" label="Impute ungenotyped markers" | |
114 help="It specifies whether markers that are present in the reference panel but absent in | |
115 the target will be imputed. This option has no effect if no reference panel is specified"/> | |
116 <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation" | |
117 help="It is the number of model states used to impute ungenotyped markers" /> | |
118 <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments" | |
119 help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state | |
120 space for a target haplotype." /> | |
121 <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments" | |
122 help="It is the length in cM of the step used for detecting short IBS segments" /> | |
123 <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster" | |
124 help="It specifies the maximum cM distance between individual markers that are combined | |
125 into an aggregate marker when imputing ungenotyped markers" /> | |
126 <param argument="ap" type="boolean" truevalue="true" falsevalue="false" | |
127 checked="false" label="Include posterior allele probabilities" | |
128 help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output | |
129 VCF file when imputing ungenotyped markers" /> | |
130 <param argument="gp" type="boolean" truevalue="true" falsevalue="false" | |
131 checked="false" label="Include posterior genotype probabilities" | |
132 help="It specifies whether a GP (genotype probability) format field will be included in the output | |
133 VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele | |
134 probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype | |
135 with highest genotype probability may occasionally be different than the genotype obtained by | |
136 taking the allele with highest probability on each haplotype, which is the genotype reported | |
137 in the GT format field" /> | |
138 </section> | |
139 </inputs> | |
140 <outputs> | |
141 <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/> | |
142 <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file"> | |
143 <filter>output_log</filter> | |
144 </data> | |
145 </outputs> | |
146 <tests> | |
147 <!-- Test default values --> | |
148 <test expect_num_outputs="2"> | |
149 <param name="gt" value="test.vcf.gz"/> | |
150 <param name="chrom" value="22:100-"/> | |
151 <param name="ne" value="1000000"/> | |
152 <param name="window" value="40.0"/> | |
153 <param name="overlap" value="2.0"/> | |
154 <param name="err" value="0.02"/> | |
155 <param name="seed" value="1"/> | |
156 <param name="output_log" value="true"/> | |
157 <section name="phasing_parameters"> | |
158 <param name="burnin" value="3"/> | |
159 <param name="iterations" value="12"/> | |
160 <param name="phase_states" value="280"/> | |
161 </section> | |
162 <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/> | |
163 <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/> | |
164 </test> | |
165 <!-- Test plink file--> | |
166 <test expect_num_outputs="2"> | |
167 <param name="gt" value="test.vcf.gz"/> | |
168 <param name="ne" value="1000000"/> | |
169 <param name="window" value="30.0"/> | |
170 <param name="overlap" value="3.0"/> | |
171 <param name="output_log" value="true"/> | |
172 <section name="optional_inputs"> | |
173 <param name="map" value="plink.map"/> | |
174 </section> | |
175 <section name="phasing_parameters"> | |
176 <param name="burnin" value="4"/> | |
177 <param name="iterations" value="10"/> | |
178 <param name="phase_states" value="250"/> | |
179 </section> | |
180 <output name="vcf_file" ftype="vcf"> | |
181 <assert_contents> | |
182 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
183 <has_size value="181272"/> | |
184 </assert_contents> | |
185 </output> | |
186 <output name="log_file" ftype="txt"> | |
187 <assert_contents> | |
188 <has_text text="Reference markers: 223"/> | |
189 <has_size value="1586" delta="10"/> | |
190 </assert_contents> | |
191 </output> | |
192 </test> | |
193 <!-- Test ref VCF input --> | |
194 <test expect_num_outputs="2"> | |
195 <param name="gt" value="target.vcf.gz"/> | |
196 <param name="ne" value="1000000"/> | |
197 <param name="window" value="40.0"/> | |
198 <param name="overlap" value="2.0"/> | |
199 <param name="output_log" value="true"/> | |
200 <section name="optional_inputs"> | |
201 <param name="ref" value="ref.vcf.gz"/> | |
202 </section> | |
203 <section name="imputation_parameters"> | |
204 <param name="impute" value="true"/> | |
205 <param name="imp_states" value="1600"/> | |
206 <param name="imp_segment" value="6.0"/> | |
207 <param name="imp_step" value="0.1"/> | |
208 <param name="cluster" value="0.005"/> | |
209 <param name="ap" value="true"/> | |
210 <param name="gp" value="true"/> | |
211 </section> | |
212 <output name="vcf_file" ftype="vcf"> | |
213 <assert_contents> | |
214 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
215 <has_size value="18635"/> | |
216 </assert_contents> | |
217 </output> | |
218 <output name="log_file" ftype="txt"> | |
219 <assert_contents> | |
220 <has_text text="Reference markers: 223"/> | |
221 <has_size value="1801" delta="10"/> | |
222 </assert_contents> | |
223 </output> | |
224 </test> | |
225 <!-- Test ref bref3 input --> | |
226 <test expect_num_outputs="1"> | |
227 <param name="gt" value="target.vcf.gz"/> | |
228 <param name="ne" value="1000000"/> | |
229 <param name="window" value="40.0"/> | |
230 <param name="overlap" value="2.0"/> | |
231 <section name="optional_inputs"> | |
232 <param name="ref" value="ref.bref3"/> | |
233 </section> | |
234 <section name="imputation_parameters"> | |
235 <param name="impute" value="true"/> | |
236 <param name="imp_states" value="1600"/> | |
237 <param name="imp_segment" value="6.0"/> | |
238 <param name="imp_step" value="0.1"/> | |
239 <param name="cluster" value="0.005"/> | |
240 <param name="ap" value="true"/> | |
241 <param name="gp" value="true"/> | |
242 </section> | |
243 <output name="vcf_file" ftype="vcf"> | |
244 <assert_contents> | |
245 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
246 <has_size value="18635"/> | |
247 </assert_contents> | |
248 </output> | |
249 </test> | |
250 </tests> | |
251 <help><![CDATA[ | |
252 .. class:: infomark | |
253 | |
254 **Purpose** | |
255 | |
256 Beagle is a program for phasing and imputing missing genotypes. Sporadic missing | |
257 genotypes are imputed during phasing. If a reference panel of phased genotypes is specified | |
258 with the ref argument, ungenotyped markers that are present in the reference panel can also | |
259 be imputed. | |
260 | |
261 Beagle version 5.2 provides significantly faster genotype phasing than version 5.1. | |
262 Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but | |
263 Beagle versions 4.0 and 4.1 have this capability. | |
264 | |
265 ---- | |
266 | |
267 .. class:: infomark | |
268 | |
269 **HapMap genetic maps** | |
270 | |
271 HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available | |
272 in `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_ | |
273 | |
274 ---- | |
275 | |
276 .. class:: infomark | |
277 | |
278 **Input files** | |
279 | |
280 Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_ | |
281 (VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal | |
282 X-chromosome genotypes must be in separate input files and analysed separately unless male | |
283 haploid genotypes are coded as homozygous diploid genotypes. | |
284 | |
285 In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window, | |
286 it will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/"). | |
287 Beagle assumes that any VCF file that has a name ending in ".gz" is compressed with gzip or bgzip, | |
288 and that a reference VCF file that has a name ending in ".bref3" is compressed with bref version 3. | |
289 | |
290 ---- | |
291 | |
292 .. class:: infomark | |
293 | |
294 **Output files** | |
295 | |
296 There are two output files. The log file gives a summary of the analysis that includes the | |
297 Beagle version, the command line arguments, and compute time. | |
298 | |
299 The VCF file contains phased, non-missing genotypes for all non-reference samples. | |
300 Beagle writes it as a bgzip-compressed vcf.gz file, which this tool uncompresses with the | |
301 unix gunzip utility, so the VCF output is returned uncompressed. | |
302 | |
303 If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO | |
304 field will contain: | |
305 | |
306 :: | |
307 | |
308 - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose. | |
309 - An "AF" subfield with the estimated alternate allele frequencies in the target samples. | |
310 - The "IMP" flag if the marker is imputed. | |
311 | |
312 ]]> </help> | |
313 <expand macro="citations" /> | |
314 </tool> |