Mercurial > repos > iuc > beagle

<tool id='beagle' name='Beagle' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='20.01'>
    <description>phasing genotypes and imputing ungenotyped markers</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro='edam_ontology' />
    <expand macro='requirements' />
    <!--
        Command template (Cheetah).
        1. If the reference panel is a bref3 dataset, link it under a name ending in
           ".bref3" (the help section notes that Beagle recognises the format by name).
        2. Run Beagle with the selected parameters, writing to the "out" prefix.
        3. Gunzip the resulting VCF so it can be collected as a plain "vcf" dataset.
    -->
    <command detect_errors='exit_code'><![CDATA[
        ## Prefix of the files collected by the outputs section (out.vcf and out.log)
        #set $out_prefix = 'out'
        ## Same presence guard as the ref= argument below, so that an unset optional
        ## reference is never inspected for its extension
        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
            ln -s '${optional_inputs.ref}' ref.bref3 &&
        #end if
        beagle
        gt='${gt}'
        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
            ref=ref.bref3
        #else if $optional_inputs.ref
            ref='${optional_inputs.ref}'
        #end if
        #if $optional_inputs.map
            map='${optional_inputs.map}'
        #end if
        #if $chrom
            chrom='${chrom}'
        #end if
        #if $optional_inputs.excludesamples
            excludesamples='${optional_inputs.excludesamples}'
        #end if
        #if $optional_inputs.excludemarkers
            excludemarkers='${optional_inputs.excludemarkers}'
        #end if
        ne=$ne
        window=$window
        overlap=$overlap
        ## Compare against the empty string instead of testing truthiness, so that an
        ## explicit value of 0 is still passed on to Beagle
        #if str($seed) != ''
            seed=$seed
        #end if
        #if str($err) != ''
            err=$err
        #end if
        burnin=$phasing_parameters.burnin
        iterations=$phasing_parameters.iterations
        phase-states=$phasing_parameters.phase_states
        impute=$imputation_parameters.impute
        imp-states=$imputation_parameters.imp_states
        imp-segment=$imputation_parameters.imp_segment
        imp-step=$imputation_parameters.imp_step
        cluster=$imputation_parameters.cluster
        ap=$imputation_parameters.ap
        gp=$imputation_parameters.gp
        out=$out_prefix
        nthreads=\${GALAXY_SLOTS:-1}
        && gunzip '${out_prefix}.vcf.gz'
    ]]></command>
    <inputs>
        <!-- Study genotypes: the only mandatory input -->
        <param argument="gt" type="data" format="vcf" label="VCF file"
            help="It specifies a VCF file containing genotypes for the study samples.
                Each VCF record must contain a GT (genotype) format field"/>
        <!-- Optional datasets; each one is only added to the command line when set -->
        <section name="optional_inputs" title="Optional input files" expanded="true">
            <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes"
                help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the
                    phased allele separator must be used '|'"/>
            <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units"
                help="Beagle uses linear interpolation to estimate genetic positions between map positions. If
                no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/>
            <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude"
                help="It specifies a file containing samples (one sample identifier per line) to be excluded
                    from the analysis" />
            <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude"
                help="It specifies a file containing markers (one marker per line) to be excluded from the
                    analysis. Each line of the file can be either an identifier from a VCF record’s ID field
                    or a genomic coordinate in the format: CHROM:POS" />
        </section>
        <!--
            Characters outside the whitelist are dropped (invalid_char="") before the value
            reaches the command line. "_" and "." are whitelisted in addition to ":" and "-"
            so that contig names such as chr1_KI270706v1_random or GL000207.1 are passed
            through unchanged; the value is single-quoted in the command.
        -->
        <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval"
            help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be
                specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively">
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                    <add value=":" />
                    <add value="-" />
                    <add value="_" />
                    <add value="." />
                </valid>
            </sanitizer>
            <validator type="regex">[0-9a-zA-Z_.:-]+</validator>
        </param>
        <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size"
            help="The default value is suitable for a large, outbred population. It is needed to specify an
                appropriate effective population size if you are imputing ungenotyped markers in a small
                or inbred population"/>
        <param argument="window" type="float" min="0" value="40.0" label="Window length in cM"
            help="The window parameter must be at least 1.1 times as large as the overlap parameter.
                The window parameter controls the amount of memory required for the analysis"/>
        <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM"
            help="It specifies the cM length of overlap between adjacent sliding windows"/>
        <param argument="err" type="float" min="0" max="1" optional="true"
            label="Allele mismatch probability for the hidden Markov model"
            help="If no err parameter is specified, the err parameter will be set equal to 𝜃/(2(𝜃 + 𝐻))
                where 𝜃 = 1/(0.5 + ln 𝐻) and 𝐻 is the number of haplotypes"/>
        <param argument="seed" type="integer" value="" optional="true" label="Random seed"
            help="A random seed is a number used to initialize a pseudorandom number generator" />
        <!-- Not a Beagle argument: it only controls whether the log_file output is kept -->
        <param name="output_log" type="boolean" checked="false" label="Output a log file"/>
        <section name="phasing_parameters" title="Phasing parameters">
            <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations"
                help="It is the maximum number of burnin iterations used to estimate an initial haplotype
                    frequency model for inferring genotype phase" />
            <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations"
                help="It is the number of iterations used to estimate genotype phase. Increasing this
                    parameter will trade increased computation time for increased phasing accuracy" />
            <!-- Referenced as phase_states in the command template and in the tests -->
            <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing"
                help="It is the number of model states used to estimate genotype phase" />
        </section>
        <section name="imputation_parameters" title="Imputation parameters">
            <param argument="impute" type="boolean" truevalue="true" falsevalue="false"
                checked="true" label="Impute ungenotyped markers"
                help="It specifies whether markers that are present in the reference panel but absent in
                    the target will be imputed. This option has no effect if no reference panel is specified"/>
            <!-- The hyphenated arguments below are referenced as imp_states, imp_segment and imp_step -->
            <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation"
                help="It is the number of model states used to impute ungenotyped markers" />
            <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments"
                help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state
                    space for a target haplotype." />
            <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments"
                help="It is the length in cM of the step used for detecting short IBS segments" />
            <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster"
                help="It specifies the maximum cM distance between individual markers that are combined
                    into an aggregate marker when imputing ungenotyped markers" />
            <param argument="ap" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior allele probabilities"
                help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output
                    VCF file when imputing ungenotyped markers" />
            <param argument="gp" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior genotype probabilities"
                help="It specifies whether a GP (genotype probability) format field will be included in the output
                VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele
                probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype
                with highest genotype probability may occasionally be different than the genotype obtained by
                taking the allele with highest probability on each haplotype, which is the genotype reported
                in the GT format field" />
        </section>
    </inputs>
    <outputs>
        <!-- Phased/imputed genotypes: the command gunzips out.vcf.gz, leaving out.vcf in the work dir -->
        <data name="vcf_file"
              format="vcf"
              label="${tool.name} on ${on_string}: VCF file"
              from_work_dir="out.vcf"/>
        <!-- Run log, collected only when the output_log checkbox is enabled -->
        <data name="log_file"
              format="txt"
              label="${tool.name} on ${on_string}: log file"
              from_work_dir="out.log">
            <filter>output_log</filter>
        </data>
    </outputs>
    <tests>
        <!--
            Phasing run restricted to the interval 22:100- with an explicit err and seed;
            the remaining values equal the parameter defaults. Both outputs are compared
            with stored files, tolerating a limited number of differing lines (lines_diff).
        -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="chrom" value="22:100-"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="err" value="0.02"/>
            <param name="seed" value="1"/>
            <param name="output_log" value="true"/>
            <section name="phasing_parameters">
                <param name="burnin" value="3"/>
                <param name="iterations" value="12"/>
                <param name="phase_states" value="280"/>
            </section>
            <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/>
            <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/>
        </test>
        <!--
            PLINK genetic map supplied through optional_inputs.map, combined with
            non-default window, overlap and phasing parameters. Outputs are checked
            with content and size assertions instead of stored files.
        -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="30.0"/>
            <param name="overlap" value="3.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="map" value="plink.map"/>
            </section>
            <section name="phasing_parameters">
                <param name="burnin" value="4"/>
                <param name="iterations" value="10"/>
                <param name="phase_states" value="250"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="181272"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers:                  223"/>
                    <has_size value="1586" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!--
            Imputation against a reference panel given as a VCF file, with the AP and GP
            fields requested; the log output is requested as well.
        -->
        <test expect_num_outputs="2">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.vcf.gz"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers:                  223"/>
                    <has_size value="1801" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!--
            Same imputation settings with the reference panel given as a bref3 file.
            output_log is left at its default (unchecked), so a single output is expected.
        -->
        <test expect_num_outputs="1">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.bref3"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

Beagle is a program for phasing and imputing missing genotypes. Sporadic missing
genotypes are imputed during phasing. If a reference panel of phased genotypes is specified
with the ref argument, ungenotyped markers that are present in the reference panel can also
be imputed.

Beagle version 5.2 provides significantly faster genotype phasing than version 5.1.
Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but
Beagle versions 4.0 and 4.1 have this capability.

----

.. class:: infomark

**HapMap genetic maps**

HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available
at `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_.

----

.. class:: infomark

**Input files**

Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_
(VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal
X-chromosome genotypes must be in separate input files and analysed separately unless male
haploid genotypes are coded as homozygous diploid genotypes.

In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window,
Beagle will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/").
Beagle assumes that a VCF file whose name ends in ".gz" is compressed with gzip or bgzip,
and that a reference VCF file whose name ends in ".bref3" is compressed with bref version 3.

----

.. class:: infomark

**Output files**

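Beagle writes two output files: a log file and a vcf.gz file. The log file gives a summary
of the analysis that includes the Beagle version, the command line arguments, and compute time.
In this Galaxy tool, the log file is only kept as an output when "Output a log file" is enabled.

The vcf.gz file is a bgzip-compressed VCF file that contains phased, non-missing
genotypes for all non-reference samples. The output vcf.gz file can be uncompressed with the
unix gunzip utility; this Galaxy tool already does so, and the VCF dataset it returns is uncompressed.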

If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO
field will contain:

    ::

        - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose.
        - An "AF" subfield with the estimated alternate allele frequencies in the target samples.
        - The "IMP" flag if the marker is imputed.

    ]]>    </help>
    <expand macro="citations" />
</tool>
author	iuc
date	Sat, 03 Jul 2021 23:33:37 +0000
parents
children	f75bf16ac901