view usher.xml @ 2:8f6c7638eab2 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/usher commit 59c78982b11e280895f2f51c25ada4613968dc79"
author iuc
date Sun, 19 Dec 2021 15:53:23 +0000
parents 335665e15630
children 746b434c8fbb
line wrap: on
line source

<tool id='usher' name='UShER' version='@TOOL_VERSION@+@GALAXY_TOOL_VERSION@' profile='20.01'>
    <description>ultrafast sample placement on existing trees</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro='edam_ontology' />
    <expand macro='requirements' />
    <version_command>usher --version</version_command>
    <command detect_errors='exit_code'><![CDATA[
        ## Link the VCF into the job directory under a fixed name. A bgzipped VCF is
        ## linked together with its tabix index so that both sit side by side.
        #if $vcf.ext == 'vcf_bgzip':
            ln -s '$vcf' input.vcf.gz &&
            ln -s '$vcf.metadata.tabix_index' input.vcf.gz.tbi &&
            #set $input_vcf = 'input.vcf.gz'
        #else:
            ln -s '$vcf' input.vcf &&
            #set $input_vcf = 'input.vcf'
        #end if

        usher
        -v '$input_vcf'
        ## Both tree inputs are optional datasets; each is passed only when supplied.
        #if $tree
            -t '${tree}'
        #end if
        #if $load_mutation_annotated_tree
            -i '${load_mutation_annotated_tree}'
        #end if
        ## Boolean and select parameters render directly as their flag text (or as nothing).
        $output_options.save_mutation_annotated_tree
        $sort_mode
        $collapse_options.collapse_tree
        $collapse_options.collapse_output_tree
        ## Compare against the empty string instead of relying on truthiness, so that
        ## an explicit 0 (permitted by min=0 on the parameter) is still passed to usher.
        #if str($max_uncertainty_per_sample) != ''
            -e $max_uncertainty_per_sample
        #end if
        $output_options.write_uncondensed_final_tree
        ## Truthiness checks below: an unset value and 0 both skip the option, which is
        ## the same test the corresponding output filters apply.
        #if $output_options.write_subtrees_size
            -k $output_options.write_subtrees_size
        #end if
        #if $output_options.write_single_subtree
            -K $output_options.write_single_subtree
        #end if
        $output_options.write_parsimony_scores_per_node
        #if $output_options.multiple_placements
            -M $output_options.multiple_placements
        #end if
        $retain_input_branch_lengths
        -T \${GALAXY_SLOTS:-1}
        -d ./
        ## Move the per-subtree files into one directory per output collection.
        #if $output_options.write_subtrees_size
            && mkdir -p out_subtrees out_subtrees_expanded out_subtrees_mutations
            && mv subtree*expanded.txt ./out_subtrees_expanded
            && mv subtree*mutations.txt ./out_subtrees_mutations
            && mv subtree*.nh ./out_subtrees
        #end if
    ]]>    </command>
    <inputs>
        <!-- Primary inputs: the VCF is required; the Newick tree and the mutation-annotated tree are both optional. -->
        <param name="vcf" type="data" format="vcf,vcf_bgzip" label="VCF file" help="Input VCF file."/>
        <param name="tree" type="data" format="newick" optional="True" label="Tree file" help="Input tree file in Newick format."/>
        <param argument="--load-mutation-annotated-tree" type="data" format="protobuf3" optional="True" label="Mutation-annotated tree object" help="Load a mutation annotated tree file, in protocol-buffers format (protobuf3)."/>
        <!-- Each option value is the literal flag text inserted into the command line. -->
        <param name="sort_mode" type="select" label="Mode of sorting new samples">
            <option value="">Default sorting</option>
            <option value="--sort-before-placement-1">Sort based on computed parsimony score and then number of optimal placements before the actual placement (-s)</option>
            <option value="--sort-before-placement-1 --reverse-sort">Sort based on computed parsimony score and then number of optimal placements before the actual placement [reversed] (-s -r)</option>
            <option value="--sort-before-placement-2">Sort based on the number of optimal placements and then the parsimony score before the actual placement (-S)</option>
            <option value="--sort-before-placement-2 --reverse-sort">Sort based on the number of optimal placements and then the parsimony score before the actual placement [reversed] (-S -r)</option>
            <option value="--sort-before-placement-3">Sort based on the number of ambiguous bases (-A)</option>
        </param>
        <param argument="--max-uncertainty-per-sample" type="integer" optional="True" min="0" max="10000000" value="" label="Maximum number of equally parsimonious placements" help="Maximum number of equally parsimonious placements allowed per sample beyond which the sample is ignored." />
        <param argument="--retain-input-branch-lengths" type="boolean" truevalue="--retain-input-branch-lengths" falsevalue="" label="Retain the branch lengths" help="Retain the branch lengths from the input tree in the output Newick files instead of using the number of mutations for the branch lengths." />
        <section name="collapse_options" title="Collapse options" expanded="True">
            <param argument="--collapse-tree" type="boolean" truevalue="--collapse-tree" falsevalue="" label="Collapse internal nodes of the input tree" help="Collapse internal nodes of the input tree with no mutations and condense identical sequences in polytomies into a single node, and then save the tree to file condensed-tree.nh in outdir." />
            <param argument="--collapse-output-tree" type="boolean" truevalue="--collapse-output-tree" falsevalue="" label="Collapse internal nodes of the output tree" help="Collapse internal nodes of the output tree with no mutations before saving the tree to file final-tree.nh in outdir." />
        </section>
        <section name="output_options" title="Output tree options" expanded="True">
            <!-- The truevalue carries both the flag and the output file name that the annotated_tree output collects. -->
            <param argument="--save-mutation-annotated-tree" type="boolean" truevalue="--save-mutation-annotated-tree annotated-tree.pb" falsevalue="" label="Save the new mutation-annotated tree object" help="Save output mutation-annotated tree object to the file annotated-tree.pb." />
            <param argument="--write-uncondensed-final-tree" type="boolean" truevalue="--write-uncondensed-final-tree" falsevalue="" label="Uncondensed final tree" help="Write the final tree in uncondensed format and save to file uncondensed-final-tree.nh in outdir." />
            <param argument="--write-subtrees-size" type="integer" optional="True" min="0" max="100" value="" label="Generate multiple subtrees" help="Write minimum set of subtrees covering the newly added samples of size equal to this value." />
            <param argument="--write-single-subtree" type="integer" optional="True" min="0" max="100" value="" label="Generate a single subtree" help="Similar to write-subtrees-size but produces a single subtree with all newly added samples along with random samples up to the value specified by this argument." />
            <param argument="--write-parsimony-scores-per-node" type="boolean" truevalue="--write-parsimony-scores-per-node" falsevalue="" label="Generate parsimony scores per node" help="Write the parsimony scores for adding new samples at each existing node in the tree without modifying the tree in a file named parsimony-scores.tsv in outdir." />
            <param argument="--multiple-placements" type="integer" optional="True" min="0" max="100" value="" label="Multiple placements" help="Create a new tree up to this limit for each possibility of parsimony-optimal placement." />
        </section>
    </inputs>
    <outputs>
        <!-- Tree outputs; each one is kept or dropped by a filter over the boolean options. -->
        <data name="annotated_tree" format="protobuf3" from_work_dir="annotated-tree.pb" label="${tool.name} on ${on_string}: mutation annotated tree">
            <filter>output_options['save_mutation_annotated_tree'] is True</filter>
        </data>
        <data name="condensed_tree" format="newick" from_work_dir="condensed-tree.nh" label="${tool.name} on ${on_string}: condensed tree">
            <filter>collapse_options['collapse_tree'] is True</filter>
        </data>
        <data name="final_tree" format="newick" from_work_dir="final-tree.nh" label="${tool.name} on ${on_string}: final tree">
            <filter>output_options['write_uncondensed_final_tree'] is False and output_options['write_parsimony_scores_per_node'] is False</filter>
        </data>
        <data name="uncondensed_tree" format="newick" from_work_dir="uncondensed-final-tree.nh" label="${tool.name} on ${on_string}: uncondensed tree">
            <filter>output_options['write_uncondensed_final_tree'] is True</filter>
        </data>
        <!-- Text reports; omitted when either the annotated tree or the parsimony scores are requested. -->
        <data name="mutation_paths" format="txt" from_work_dir="mutation-paths.txt" label="${tool.name} on ${on_string}: mutation paths">
            <filter>output_options['save_mutation_annotated_tree'] is False and output_options['write_parsimony_scores_per_node'] is False</filter>
        </data>
        <data name="clades" format="txt" from_work_dir="clades.txt" label="${tool.name} on ${on_string}: clades">
            <filter>output_options['save_mutation_annotated_tree'] is False and output_options['write_parsimony_scores_per_node'] is False</filter>
        </data>
        <!-- Subtree collections; discovered from the directories the command section moves the subtree files into. -->
        <collection name="subtrees" type="list" label="${tool.name} on ${on_string}: subtrees">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.nh" format="newick" directory="out_subtrees"/>
            <filter>output_options['write_subtrees_size']</filter>
        </collection>
        <collection name="subtrees_expanded" type="list" label="${tool.name} on ${on_string}: subtrees expanded">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" format="txt" directory="out_subtrees_expanded"/>
            <filter>output_options['write_subtrees_size']</filter>
        </collection>
        <collection name="subtrees_mutations" type="list" label="${tool.name} on ${on_string}: subtrees mutations">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" format="txt" directory="out_subtrees_mutations"/>
            <filter>output_options['write_subtrees_size']</filter>
        </collection>
        <!-- Outputs produced only when parsimony scores per node are requested. -->
        <data name="current_tree" format="newick" from_work_dir="current-tree.nh" label="${tool.name} on ${on_string}: current tree">
            <filter>output_options['write_parsimony_scores_per_node'] is True</filter>
        </data>
        <data name="parsimony_scores" format="tabular" from_work_dir="parsimony-scores.tsv" label="${tool.name} on ${on_string}: parsimony scores per node">
            <filter>output_options['write_parsimony_scores_per_node'] is True</filter>
        </data>
        <!-- Outputs produced only when a single subtree size is given. -->
        <data name="single_subtree" format="newick" from_work_dir="single-subtree.nh" label="${tool.name} on ${on_string}: single subtree">
            <filter>output_options['write_single_subtree']</filter>
        </data>
        <data name="single_subtree_expanded" format="txt" from_work_dir="single-subtree-expanded.txt" label="${tool.name} on ${on_string}: single subtree expanded">
            <filter>output_options['write_single_subtree']</filter>
        </data>
        <data name="single_subtree_mutations" format="txt" from_work_dir="single-subtree-mutations.txt" label="${tool.name} on ${on_string}: single subtree mutations">
            <filter>output_options['write_single_subtree']</filter>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="3">
            <!-- Build from a bgzipped VCF plus a Newick tree, collapsing the input tree and saving the annotated tree. -->
            <param name="vcf" value="global_samples.vcf.gz" ftype="vcf_bgzip" />
            <param name="tree" value="global_phylo.nh" ftype="newick"/>
            <section name="collapse_options">
                <param name="collapse_tree" value="true"/>
            </section>
            <section name="output_options">
                <!-- Boolean parameter: it takes no ftype, which only applies to data inputs. -->
                <param name="save_mutation_annotated_tree" value="true"/>
            </section>
            <output name="annotated_tree" file="test_01_annotated_tree.pb" ftype="protobuf3">
                <assert_contents>
                    <has_size value="26100" delta="100"/>
                </assert_contents>
            </output>
            <output name="condensed_tree" file="test_01_condensed_tree.nh" ftype="newick"/>
            <output name="final_tree" file="test_01_final_tree.nh" ftype="newick"/>
        </test>
        <test expect_num_outputs="3">
            <!-- Place new samples on a previously saved mutation-annotated tree and request the uncondensed final tree; the clades output is expected to be empty. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <section name="output_options">
                <param name="write_uncondensed_final_tree" value="true"/>
            </section>
            <output name="uncondensed_tree" file="test_02_uncondensed_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_02_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0" delta="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="4">
            <!-- VCF plus Newick tree with sort mode 1 and input-tree collapsing; mutation_paths and clades are expected to be empty. -->
            <param name="vcf" value="global_samples.vcf.gz" ftype="vcf_bgzip" />
            <param name="tree" value="global_phylo.nh" ftype="newick"/>
            <param name="sort_mode" value="--sort-before-placement-1"/>
            <section name="collapse_options">
                <param name="collapse_tree" value="true"/>
            </section>   
            <output name="condensed_tree" file="test_03_condensed_tree.nh" ftype="newick"/>
            <output name="final_tree" file="test_03_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="6">
            <!-- Subtree generation with write_subtrees_size=20: three data outputs plus three collections, each collection expected to hold exactly one element. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <section name="output_options">
                <param name="write_uncondensed_final_tree" value="true"/>
                <param name="write_subtrees_size" value="20"/>
            </section>
            <output name="uncondensed_tree" file="test_04_uncondensed_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_04_mutation_paths.txt" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <output_collection name="subtrees" type="list" count="1">
                <element name="subtree-1" file="test_04_subtree_1.nh"/>
            </output_collection>
            <output_collection name="subtrees_expanded" type="list" count="1">
                <element name="subtree-1-expanded" file="test_04_subtree_1_expanded.txt" ftype="txt"/>
            </output_collection>
            <output_collection name="subtrees_mutations" type="list" count="1">
                <element name="subtree-1-mutations" file="test_04_subtree_1_mutations.txt" ftype="txt"/>
            </output_collection>
        </test>
        <test expect_num_outputs="2">
            <!-- Parsimony-score mode: only the current tree and the per-node score table are expected, since the other data outputs are filtered out by this option. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <section name="output_options">
                <param name="write_parsimony_scores_per_node" value="true"/>
            </section>
            <output name="current_tree" file="test_05_current_tree.nw" ftype="newick"/>
            <output name="parsimony_scores" file="test_05_parsimony_scores.tabular" ftype="tabular"/>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement with multiple_placements=2; the clades output is expected to be empty. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>  
            <section name="output_options">
                <param name="multiple_placements" value="2"/>
            </section>
            <output name="final_tree" file="test_06_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_06_mutation_paths.txt" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="6">
            <!-- Single-subtree mode with write_single_subtree=2: the three regular outputs plus the three single-subtree outputs. -->
            <!-- NOTE(review): final_tree and mutation_paths are compared against the test_06 expected files; confirm this reuse is intentional. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>  
            <section name="output_options">
                <param name="write_single_subtree" value="2"/>
            </section>
            <output name="final_tree" file="test_06_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_06_mutation_paths.txt" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <output name="single_subtree" file="test_07_single_subtree.nh" ftype="newick"/>
            <output name="single_subtree_mutations" file="test_07_single_subtree_mutations.txt" ftype="txt"/>
            <output name="single_subtree_expanded" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement on a saved annotated tree using sort mode 2 (optimal placements first, then parsimony score). -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <param name="sort_mode" value="--sort-before-placement-2"/>
            <output name="final_tree" file="test_08_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_08_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement on a saved annotated tree using sort mode 3 (number of ambiguous bases). -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <param name="sort_mode" value="--sort-before-placement-3"/>
            <output name="final_tree" file="test_09_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_09_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement on a saved annotated tree using sort mode 1 combined with the reverse-sort flag. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <param name="sort_mode" value="--sort-before-placement-1 --reverse-sort"/>
            <output name="final_tree" file="test_10_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_10_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement on a saved annotated tree using sort mode 2 combined with the reverse-sort flag. -->
            <param name="vcf" value="new_samples.vcf.gz"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <param name="sort_mode" value="--sort-before-placement-2 --reverse-sort"/>
            <output name="final_tree" file="test_11_final_tree.nh" ftype="newick"/>
            <output name="mutation_paths" file="test_11_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Placement on a saved annotated tree while retaining the input branch lengths. -->
            <!-- The parameter name must match the input derived from its argument (retain_input_branch_lengths); the previous name matched no input, so the option was never exercised. -->
            <!-- NOTE(review): regenerate test_12_final_tree.nh if it was produced while the flag was silently unset. -->
            <param name="vcf" value="new_samples.vcf.gz" ftype="vcf_bgzip"/>
            <param name="load_mutation_annotated_tree" value="test_01_annotated_tree.pb" ftype="protobuf3"/>
            <param name="retain_input_branch_lengths" value="true"/>
            <output name="final_tree" file="test_12_final_tree.nh" ftype="newick" lines_diff="2"/>
            <output name="mutation_paths" file="test_12_mutation_path.nh" ftype="txt"/>
            <output name="clades" ftype="txt">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

UShER is a program for rapid, accurate placement of samples to existing phylogenies. While not restricted to SARS-CoV-2 phylogenetic analyses, it has enabled real-time phylogenetic analyses and genomic contact tracing in that its placement is orders of magnitude faster and more memory-efficient than previous methods, and is being widely used by several SARS-CoV-2 research groups, including the UCSC Genome Browser team and Rob Lanfear’s global phylogeny releases.

----
                    
.. class:: infomark
                                        
**How does UShER work?**

Given existing samples, whose genotypes and phylogenetic tree are known, and the genotypes of new samples, UShER aims to incorporate new samples into the phylogenetic tree while preserving the topology of existing samples and maximizing parsimony. UShER’s algorithm consists of two phases: (i) the pre-processing phase and (ii) the placement phase.

In the pre-processing phase, UShER accepts the phylogenetic tree of existing samples in a Newick format and their genotypes, specified as a set of single-nucleotide variants with respect to a reference sequence (UShER currently ignores indels), in a VCF format. For each site in the VCF, UShER uses the `Fitch-Sankoff algorithm <https://evolution.gs.washington.edu/gs541/2010/lecture1.pdf>`_ to find the most parsimonious nucleotide assignment for every node of the tree (UShER automatically labels internal tree nodes).

When a sample contains ambiguous genotypes, multiple nucleotides may be most parsimonious at a node. To resolve these, UShER assigns it any one of the most parsimonious nucleotides with preference, when possible, given to the reference base. UShER also allows the VCF to specify ambiguous bases in samples using `IUPAC format <https://www.bioinformatics.org/sms/iupac.html>`_, which are also resolved to a unique base using the above strategy. When a node is found to carry a mutation, i.e. the base assigned to the node differs from its parent, the mutation gets added to a list of mutations corresponding to that node. Finally, UShER uses `protocol buffers <https://developers.google.com/protocol-buffers>`_ to store in a file the Newick string corresponding to the input tree and a list of lists of node mutations, which we refer to as the mutation-annotated tree object.

The mutation-annotated tree object carries sufficient information to derive parsimony-resolved genotypes for any tip of the tree using the sequence of mutations from the root to that tip. Compared to other tools that use full multiple-sequence alignment (MSA) to guide the placement, UShER's mutation-annotated tree object is compact and is what helps make it fast.

In the placement phase, UShER loads the pre-processed mutation-annotated tree object and the genotypes of new samples in a VCF format and sequentially adds the new samples to the tree. For each new sample, UShER computes the additional parsimony score required for placing it at every node in the current tree while considering the full path of mutations from the root of the tree to that node. 

Next, UShER places the new sample at the node that results in the smallest additional parsimony score. When multiple node placements are equally parsimonious, UShER picks the node with a greater number of descendant leaves for placement. If the choice is between a parent and its child node, the parent node would always be selected by this rule. However, a more accurate placement should reflect the number of leaves uniquely attributable to the child versus parent node. Therefore, in these cases, UShER picks the parent node if the number of descendant leaves of the parent that are not shared with the child node exceeds the number of descendant leaves of the child.

UShER also automatically imputes and reports ambiguous genotypes for the newly added samples and ignores missing bases, such as 'N' or '.' (i.e. missing bases never contribute to the parsimony score).

At the end of the placement phase, UShER allows the user to create another protocol-buffer (protobuf) file containing the mutation-annotated tree object for the newly generated tree including added samples. This allows for another round of placements to be carried out over and above the newly added samples.


    ]]>    </help>
    <expand macro="citations" />
</tool>