Mercurial > repos > iuc > beagle
view beagle.xml @ 0:553b27c30eb8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
author | iuc |
---|---|
date | Sat, 03 Jul 2021 23:33:37 +0000 |
parents | |
children | f75bf16ac901 |
line wrap: on
line source
<!--
    Galaxy wrapper for Beagle (phasing genotypes and imputing ungenotyped markers).
    Version tokens, requirements, EDAM annotations and citations are pulled in from macros.xml.
-->
<tool id="beagle" name="Beagle" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
    <description>phasing genotypes and imputing ungenotyped markers</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Prefix handed to Beagle via out=; the run is expected to leave
        ## out.vcf.gz and out.log in the working directory (see gunzip and <outputs>).
        #set out_prefix='out'

        ## A bref3 reference is linked to a name ending in ".bref3" so that Beagle
        ## recognises the format from the file name (see the help section).
        ## The reference is optional, so check that it was supplied before reading its extension.
        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
            ln -s '${optional_inputs.ref}' ref.bref3 &&
        #end if

        beagle
            gt='${gt}'
            #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
                ref=ref.bref3
            #else if $optional_inputs.ref
                ref='${optional_inputs.ref}'
            #end if
            #if $optional_inputs.map
                map='${optional_inputs.map}'
            #end if
            #if $chrom
                chrom='${chrom}'
            #end if
            #if $optional_inputs.excludesamples
                excludesamples='${optional_inputs.excludesamples}'
            #end if
            #if $optional_inputs.excludemarkers
                excludemarkers='${optional_inputs.excludemarkers}'
            #end if
            ne=$ne
            window=$window
            overlap=$overlap
            ## Compare the string form with '' instead of using a truth test:
            ## a truth test would silently drop an explicit value of 0.
            #if str($seed) != ''
                seed=$seed
            #end if
            #if str($err) != ''
                err=$err
            #end if
            burnin=$phasing_parameters.burnin
            iterations=$phasing_parameters.iterations
            phase-states=$phasing_parameters.phase_states
            impute=$imputation_parameters.impute
            imp-states=$imputation_parameters.imp_states
            imp-segment=$imputation_parameters.imp_segment
            imp-step=$imputation_parameters.imp_step
            cluster=$imputation_parameters.cluster
            ap=$imputation_parameters.ap
            gp=$imputation_parameters.gp
            out=$out_prefix
            nthreads=\${GALAXY_SLOTS:-1}
        ## Decompress the result so that out.vcf can be collected as a plain VCF dataset.
        && gunzip 'out.vcf.gz'
    ]]></command>
    <inputs>
        <param argument="gt" type="data" format="vcf" label="VCF file"
               help="It specifies a VCF file containing genotypes for the study samples. Each VCF record must contain a GT (genotype) format field"/>
        <!-- Optional files; each one is only added to the command line when a dataset is selected -->
        <section name="optional_inputs" title="Optional input files" expanded="true">
            <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes"
                   help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the phased allele separator must be used '|'"/>
            <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units"
                   help="Beagle uses linear interpolation to estimate genetic positions between map positions. If no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/>
            <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude"
                   help="It specifies a file containing samples (one sample identifier per line) to be excluded from the analysis" />
            <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude"
                   help="It specifies a file containing markers (one marker per line) to be excluded from the analysis. Each line of the file can be either an identifier from a VCF record's ID field or a genomic coordinate in the format: CHROM:POS" />
        </section>
        <!-- Free-text interval; the sanitizer removes every character other than letters, digits, ':' and '-' -->
        <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval"
               help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively">
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                    <add value=":" />
                    <add value="-" />
                </valid>
            </sanitizer>
            <validator type="regex">[0-9a-zA-Z:-]+</validator>
        </param>
        <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size"
               help="The default value is suitable for a large, outbred population. It is needed to specify an appropriate effective populations size if you are imputing ungenotyped markers in a small or inbred population"/>
        <param argument="window" type="float" min="0" value="40.0" label="Window length in cM"
               help="The window parameter must be at least 1.1 times as large as the overlap parameter. The window parameter controls the amount of memory required for the analysis"/>
        <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM"
               help="It specifies the cM length of overlap between adjacent sliding windows"/>
        <param argument="err" type="float" min="0" max="1" optional="true" label="Allele mismatch probability for the hidden Markov model"
               help="If no err parameter is specified, the err parameter will be set equal to θ/(2(θ + H)) where θ = 1/(0.5 + ln H) and H is the number of haplotypes"/>
        <param argument="seed" type="integer" value="" optional="true" label="Random seed"
               help="A random seed is a number used to initialize a pseudorandom number generator" />
        <!-- Controls whether the log_file output is created (see the filter in <outputs>) -->
        <param name="output_log" type="boolean" checked="false" label="Output a log file"/>
        <section name="phasing_parameters" title="Phasing parameters">
            <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations"
                   help="It is the maximum number of burnin iterations used to estimate an initial haplotype frequency model for inferring genotype phase" />
            <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations"
                   help="It is the number of iterations used to estimate genotype phase. Increasing this parameter will trade increased computation time for increased phasing accuracy" />
            <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing"
                   help="It is the number of model states used to estimate genotype phase" />
        </section>
        <section name="imputation_parameters" title="Imputation parameters">
            <param argument="impute" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Impute ungenotyped markers"
                   help="It specifies whether markers that are present in the reference panel but absent in that target will be imputed. This option has no effect if no reference panel is specified"/>
            <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation"
                   help="It is the number of model states used to impute ungenotyped markers" />
            <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments"
                   help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state space for a target haplotype." />
            <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments"
                   help="It is the length in cM of the step used for detecting short IBS segments" />
            <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster"
                   help="It specifies the maximum cM distance between individual markers that are combined into an aggregate marker when imputing ungenotyped markers" />
            <param argument="ap" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Include posterior allele probabilities"
                   help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output VCF file when imputing ungenotyped markers" />
            <param argument="gp" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Include posterior genotype probabilities"
                   help="It specifies whether a GP (genotype probability) format field will be included in the output VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype with highest genotype probability may occasionally be different than the genotype obtained by taking the allele with highest probability on each haplotype, which is the genotype reported in the GT format field" />
        </section>
    </inputs>
    <outputs>
        <!-- out.vcf is produced by the gunzip step at the end of the command -->
        <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/>
        <!-- The log is only collected when the "Output a log file" option is enabled -->
        <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file">
            <filter>output_log</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test default values -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="chrom" value="22:100-"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="err" value="0.02"/>
            <param name="seed" value="1"/>
            <param name="output_log" value="true"/>
            <section name="phasing_parameters">
                <param name="burnin" value="3"/>
                <param name="iterations" value="12"/>
                <param name="phase_states" value="280"/>
            </section>
            <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/>
            <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/>
        </test>
        <!-- Test plink file-->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="30.0"/>
            <param name="overlap" value="3.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="map" value="plink.map"/>
            </section>
            <section name="phasing_parameters">
                <param name="burnin" value="4"/>
                <param name="iterations" value="10"/>
                <param name="phase_states" value="250"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="181272"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1586" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref VCF input -->
        <test expect_num_outputs="2">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.vcf.gz"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1801" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref bref3 input -->
        <test expect_num_outputs="1">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.bref3"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

Beagle is a program for phasing and imputing missing genotypes. Sporadic missing genotypes are imputed during phasing. If a reference panel of phased genotypes is specified with the ref argument, ungenotyped markers that are present in the reference panel can also be imputed. Beagle version 5.2 provides significantly faster genotype phasing than version 5.1. Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but Beagle versions 4.0 and 4.1 have this capability.

----

.. class:: infomark

**HapMap genetic maps**

HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available in `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_

----

.. class:: infomark

**Input files**

Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_ (VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal X-chromosome genotypes must be in separate input files and analysed separately unless male haploid genotypes are coded as homozygous diploid genotypes.

In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window, it will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/").

Beagle assumes that a VCF file that has a name ending in ".gz" is compressed with gzip or bgzip, and that a reference VCF file that has a name ending in ".bref3" is compressed with bref version 3.

----

.. class:: infomark

**Output files**

There are two output files. The log file gives a summary of the analysis that includes the Beagle version, the command line arguments, and compute time. The vcf.gz file is a bgzip-compressed VCF file that contains phased, non-missing genotypes for all non-reference samples. The output vcf.gz file can be uncompressed with the unix gunzip utility.

If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO field will contain:

::

    - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose.
    - An "AF" subfield with the estimated alternate allele frequencies in the target samples.
    - The "IMP" flag if the marker is imputed.

    ]]></help>
    <expand macro="citations" />
</tool>