view progressivemauve.xml @ 0:74093fb62bdf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author iuc
date Wed, 17 Aug 2016 14:46:55 -0400
parents
children bca52822843e
line wrap: on
line source

<?xml version="1.0"?>
<tool id="progressivemauve" name="progressiveMauve" version="@WRAPPER_VERSION@.0">
  <!-- One-line summary of what the tool does. -->
  <description>constructs multiple genome alignments</description>
  <!-- Shared definitions are imported from macros.xml (not visible here). The macros
       expanded below, and presumably the @WRAPPER_VERSION@ token used in the tool
       version, are expected to be defined there; confirm against that file. -->
  <macros>
    <import>macros.xml</import>
  </macros>
  <expand macro="requirements"/>
  <expand macro="stdio"/>
  <!-- Command run to report the version of the underlying progressiveMauve program. -->
  <version_command>progressiveMauve --version</version_command>
  <!-- Cheetah template that is rendered into the shell command line. Lines starting
       with "##" are template comments and are not emitted. Each selected sequence
       dataset is symlinked into the working directory and handed to progressiveMauve
       by its basename; optional flags are only emitted when their parameter is set. -->
  <command><![CDATA[
## Symlink files in with correct extensions
## Paths are quoted so that unusual characters in a dataset path cannot split the argument.
#for $file in $sequences:
    ln -s '$file' "`basename '$file'`";
#end for

progressiveMauve
## Input Options

#if $apply_backbone:
    --apply-backbone='$apply_backbone'
#end if
--island-gap-size=$island_gap_size
$mums

#if $seed_weight:
    --seed-weight=$seed_weight
#end if

#if $max_gapped_aligner_length:
    --max-gapped-aligner-length=$max_gapped_aligner_length
#end if

#if $match_input:
    --match-input='$match_input'
#end if

## Optional user-supplied guide tree; previously this input was accepted but never passed on.
#if $input_guide_tree:
    --input-guide-tree='$input_guide_tree'
#end if

$collinear
--scoring-scheme=$scoring_scheme
$no_weight_scaling

--max-breakpoint-distance-scale=$max_breakpoint_distance_scale
--conservation-distance-scale=$conservation_distance_scale
$skip_refinement
$skip_gapped_alignment

#if $bp_dist_estimate_min_score:
    --bp-dist-estimate-min-score=$bp_dist_estimate_min_score
#end if

#if $gap_open:
    --gap-open=$gap_open
#end if

## The repeat penalty select was previously ignored. Only a non-default choice is
## forwarded, so runs that keep the default ("negative") produce the same command
## line as before.
#if str($repeat_penalty) != "negative":
    --repeat-penalty=$repeat_penalty
#end if

#if $gap_extend:
    --gap-extend=$gap_extend
#end if

#if $weight:
    --weight=$weight
#end if

#if $min_scaled_penalty:
    --min-scaled-penalty=$min_scaled_penalty
#end if

--hmm-p-go-homologous=$hmm_p_go_homologous
--hmm-p-go-unrelated=$hmm_p_go_unrelated
--hmm-identity=$hmm_identity

$seed_family
$solid_seeds
$coding_seeds
$no_recursion
$disable_backbone

## Outputs
--output='$output'
#if $output_guide_tree:
    --output-guide-tree='$output_guide_tree_file'
#end if

#if $output_backbone:
    --backbone-output='$output_backbone_file'
#end if

## Sequences
## Refer to the symlinks created above (same basename computation as the ln step).
#for $file in $sequences:
    "`basename '$file'`"
#end for

]]></command>
  <inputs>
    <!-- Sequences to align and backbone handling -->
    <param name="sequences" type="data" format="fasta" multiple="True"
        label="Select sequences to align"
        help="in fasta format"/>
    <param name="apply_backbone" type="data" format="xmfa" optional="True"
        label="Apply Backbone"
        help="Read an existing sequence alignment in XMFA format and apply backbone statistics to it (--apply-backbone)"/>

    <param name="island_gap_size" type="integer" value="20"
        label="Island gap size"
        help="Alignment gaps above this size in nucleotides are considered to be islands (--island-gap-size)"/>

    <param name="disable_backbone" type="boolean" truevalue="--disable-backbone" falsevalue=""
        label="Disable backbone"
        help="Disable backbone detection (--disable-backbone)"/>

    <!-- Switches that request the optional output datasets -->
    <param name="output_guide_tree" type="boolean" truevalue="True" falsevalue=""
        label="Output Guide Tree"
        help="Write out the guide tree used for alignment to a file (--output-guide-tree)"/>

    <param name="output_backbone" type="boolean" truevalue="True" falsevalue=""
        label="Output Backbone"
        help="Write out the backbone to a file (--backbone-output)"/>

    <!-- Match finding and anchoring -->
    <param name="mums" type="boolean" truevalue="--mums" falsevalue=""
        label="MUMs"
        help="Find MUMs only, do not attempt to determine locally collinear blocks (LCBs) (--mums)"/>

    <param name="seed_weight" type="integer" value="0" optional="True"
        label="Seed weight"
        help="Use the specified seed weight for calculating initial anchors (--seed-weight)"/>

    <param name="match_input" type="data" format="tabular" optional="True"
        label="Match Input"
        help="Use specified match file instead of searching for matches (--match-input)"/>

    <!--<param type="file" label="input-id-matrix" help="An identity matrix describing similarity among all pairs of input sequences/alignments (- -input-id-matrix)" />-->
    <param name="max_gapped_aligner_length" type="integer" value="0" optional="True"
        label="Max gapped aligner length"
        help="Maximum number of base pairs to attempt aligning with the gapped aligner (--max-gapped-aligner-length)"/>

    <param name="input_guide_tree" type="data" format="nhx" optional="True"
        label="input-guide-tree"
        help="A phylogenetic guide tree in Newick format that describes the order in which sequences will be aligned (--input-guide-tree)"/>

    <param name="collinear" type="boolean" truevalue="--collinear" falsevalue=""
        label="Collinear inputs"
        help="Assume that input sequences are collinear--they have no rearrangements (--collinear)"/>

    <!-- Scoring and weight scaling -->
    <param name="scoring_scheme" type="select"
        label="Scoring scheme"
        help="Selects the anchoring score function. (--scoring-scheme)">
        <option value="sp" selected="True">Extant sum-of-pairs (sp)</option>
        <option value="ancestral_sp">Sum-of-pairs + Ancestral (ancestral_sp)</option>
        <option value="ancestral">Ancestral (ancestral)</option>
    </param>

    <param name="no_weight_scaling" type="boolean" truevalue="--no-weight-scaling" falsevalue=""
        label="No weight scaling"
        help="Don't scale LCB weights by conservation distance and breakpoint distance (--no-weight-scaling)"/>

    <param name="max_breakpoint_distance_scale" type="float" min="0" max="1" value="0.5"
        label="max-breakpoint-distance-scale"
        help="Set the maximum weight scaling by breakpoint distance. (--max-breakpoint-distance-scale)"/>

    <param name="conservation_distance_scale" type="float" min="0" max="1" value="0.5"
        label="conservation-distance-scale"
        help="Scale conservation distances by this amount. (--conservation-distance-scale)"/>

    <!-- Refinement and gapped alignment -->
    <param name="skip_refinement" type="boolean" truevalue="--skip-refinement" falsevalue=""
        label="Skip refinement"
        help="Do not perform iterative refinement (--skip-refinement)"/>
    <param name="skip_gapped_alignment" type="boolean" truevalue="--skip-gapped-alignment" falsevalue=""
        label="Skip gapped alignment"
        help="Do not perform gapped alignment (--skip-gapped-alignment)"/>
    <param name="bp_dist_estimate_min_score" type="integer" value="0" optional="True"
        label="BP dist estimate min score"
        help="Minimum LCB score for estimating pairwise breakpoint distance (--bp-dist-estimate-min-score)"/>

    <!-- Gap and repeat penalties -->
    <param name="gap_open" type="integer" value="0" optional="True"
        label="Gap open"
        help="Gap open penalty (--gap-open)"/>

    <param name="repeat_penalty" type="select"
        label="Repeat penalty"
        help="Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. (--repeat-penalty)">
        <option value="negative" selected="True">Negative</option>
        <option value="zero">Zero</option>
    </param>

    <param name="gap_extend" type="integer" value="0" optional="True"
        label="Gap extend"
        help="Gap extend penalty (--gap-extend)"/>

    <!--<param type="data" label="Substitution matrix" -->
        <!--help="Nucleotide substitution matrix in NCBI format (- -substitution-matrix)" />-->

    <param name="weight" type="integer" value="0" optional="True"
        label="Weight"
        help="Minimum pairwise LCB score (--weight)"/>
    <param name="min_scaled_penalty" type="integer" value="0" optional="True"
        label="Min scaled penalty"
        help="Minimum breakpoint penalty after scaling the penalty by expected divergence (--min-scaled-penalty)"/>

    <!-- Homology HMM parameters -->
    <param name="hmm_p_go_homologous" type="float" min="0" max="1" value="0.00001"
        label="HMM p go homologous"
        help="Probability of transitioning from the unrelated to the homologous state (--hmm-p-go-homologous)"/>
    <param name="hmm_p_go_unrelated" type="float" min="0" max="1" value="0.000000001"
        label="HMM p go unrelated"
        help="Probability of transitioning from the homologous to the unrelated state (--hmm-p-go-unrelated)"/>
    <param name="hmm_identity" type="float" min="0" max="1" value="0.7"
        label="HMM identity"
        help="Expected level of sequence identity among pairs of sequences(--hmm-identity)"/>

    <!-- Seed pattern selection and recursion -->
    <param name="seed_family" type="boolean" truevalue="--seed-family" falsevalue=""
        label="Seed family"
        help="Use a family of spaced seeds to improve sensitivity (--seed-family)"/>
    <param name="solid_seeds" type="boolean" truevalue="--solid-seeds" falsevalue=""
        label="Solid seeds"
        help="Use solid seeds. Do not permit substitutions in anchor matches. (--solid-seeds)"/>
    <param name="coding_seeds" type="boolean" truevalue="--coding-seeds" falsevalue=""
        label="Coding seeds"
        help="Use coding pattern seeds. Useful to generate matches coding regions with 3rd codon position degeneracy. (--coding-seeds)"/>
    <param name="no_recursion" type="boolean" truevalue="--no-recursion" falsevalue=""
        label="No recursion"
        help="Disable recursive anchor search (--no-recursion)"/>
  </inputs>
  <!-- The main alignment dataset is always produced; its datatype switches to tabular
       when the MUMs-only option is enabled. The guide tree and backbone datasets are
       optional: a filter element (not a when element) is what makes an output
       conditional, so each is only created when its matching boolean input is set. -->
  <outputs>
    <data format="xmfa" name="output" label="${tool.name} alignment of ${on_string}">
       <change_format>
           <when input="mums" value="--mums" format="tabular" />
       </change_format>
    </data>
    <data format="nhx" name="output_guide_tree_file" label="${tool.name} alignment of ${on_string}: Guide tree">
        <filter>output_guide_tree</filter>
    </data>
    <data format="tabular" name="output_backbone_file" label="${tool.name} alignment of ${on_string}: Backbone">
        <filter>output_backbone</filter>
    </data>
  </outputs>
  <tests>
    <!-- Two FASTA datasets supplied together -->
    <test>
      <param name="sequences" value="phagey.fa,karma.fa"/>
      <output name="output" file="1.xmfa" lines_diff="20"/>
    </test>
    <!-- A single FASTA dataset -->
    <test>
      <param name="sequences" value="merged.fa"/>
      <output name="output" file="1.xmfa" lines_diff="20"/>
    </test>
    <!-- Alignment plus the optional guide tree output -->
    <test>
      <param name="sequences" value="merged.fa"/>
      <param name="output_guide_tree" value="True"/>
      <output name="output" file="1.xmfa" lines_diff="20"/>
      <output name="output_guide_tree_file" file="1.nhx"/>
    </test>
    <!-- MUMs-only mode, compared by approximate size -->
    <test>
      <param name="sequences" value="merged.fa"/>
      <param name="mums" value="True"/>
      <output name="output" file="1.mums" compare="sim_size" delta="1000"/>
    </test>
    <!-- Alignment driven by a previously computed match file -->
    <test>
      <param name="sequences" value="merged.fa"/>
      <param name="match_input" value="1.mums"/>
      <output name="output" file="1.xmfa" lines_diff="24"/>
    </test>
  </tests>
  <help><![CDATA[
What it does
============

Mauve is a system for efficiently constructing multiple genome alignments in
the presence of large-scale evolutionary events such as rearrangement and
inversion. Multiple genome alignment provides a basis for research into
comparative genomics and the study of evolutionary dynamics. Aligning whole
genomes is a fundamentally different problem than aligning short sequences.

Mauve has been developed with the idea that a multiple genome aligner should
require only modest computational resources. It employs algorithmic techniques
that scale well in the amount of sequence being aligned. For example, a pair of
Y. pestis genomes can be aligned in under a minute, while a group of 9
divergent Enterobacterial genomes can be aligned in a few hours.

progressiveMauve XMFA alignment visualized with the Mauve tool:

.. image:: $PATH_TO_IMAGES/hemolysin.jpg

Example Usage
=============

+-----------------------------------+-------------+
| Usage                             | Notes       |
+===================================+=============+
| Align genomes                     |Simply       |
|                                   |select as    |
|                                   |many fasta   |
|                                   |files with   |
|                                   |one or more  |
|                                   |sequences as |
|                                   |necessary    |
+-----------------------------------+-------------+
| Align genomes but also save       |Use the      |
| the guide tree and produce a      |**Output     |
| backbone file                     |Guide Tree** |
|                                   |and **Output |
|                                   |Backbone**   |
|                                   |options      |
+-----------------------------------+-------------+
| Align genomes, but do not         |Use the      |
| detect forced alignment of        |**Disable    |
| unrelated sequences               |backbone**   |
|                                   |option       |
+-----------------------------------+-------------+
| Detect forced alignment of        |Use the      |
| unrelated sequence in the         |**Apply      |
| alignment produced                |Backbone**   |
| in previous example, use          |option and   |
| custom Homology HMM transition    |specify the  |
| parameters.                       |XMFA file    |
|                                   |produced     |
|                                   |in the       |
|                                   |previous     |
|                                   |example      |
+-----------------------------------+-------------+
| Compute ungapped                  |Use the      |
| local-multiple alignments among   |**MUMs**     |
| the input sequences               |option       |
+-----------------------------------+-------------+
| Compute an alignment of the       |Set the      |
| same genomes, using previously    |**Match      |
| computed local-multiple           |Input** to   |
| alignments                        |the tabular  |
|                                   |MUMs file    |
|                                   |produced in  |
|                                   |the previous |
|                                   |example      |
+-----------------------------------+-------------+
| Set a minimum scaled              |Use the      |
| breakpoint penalty to cope with   |**Min Scaled |
| the case where most genomes       |Penalty** and|
| are aligned correctly, but manual |set to a     |
| inspection reveals that           |value like   |
| a divergent genome has too        |5000         |
| many predicted rearrangements.    |             |
+-----------------------------------+-------------+
| Globally align a set of           |Use the      |
| collinear virus                   |**Collinear**|
| genomes, using seed families      |and **Seed   |
| to improve anchoring sensitivity  |Family**     |
| in regions below 70% sequence     |options      |
| identity.                         |             |
+-----------------------------------+-------------+


The progressiveMauve algorithm: addressing limitations of the original algorithm
================================================================================

Comparative genomics has revealed that closely-related bacteria often have
highly divergent gene content. While the original Mauve algorithm could align
regions conserved among all organisms, the portion of the genome conserved
among all taxa (the core genome) shrinks as more taxa are added to the
analysis. As such, the original Mauve algorithm did not scale well to large
numbers of taxa because it could not align regions conserved among subsets of
the genomes under study. progressiveMauve employs a different algorithmic
approach to scoring alignments that allows alignment of segments conserved
among subsets of taxa. The progressiveMauve algorithm has been described in
Aaron Darling's Ph.D. Thesis, and is also the subject of a manuscript published
in PLoS ONE. A brief overview is given here.

Finding initial local multiple alignments
-----------------------------------------

progressiveMauve elaborates on the original algorithm for finding local
multiple alignments. Instead of using a single seed pattern for match
filtration, progressiveMauve uses a combination of three seed patterns for
improved sensitivity. The palindromic seed patterns have been described in
Darling et al. 2006 "Procrastination leads to efficient filtration for local
multiple alignment".

Seed matches which represent a unique subsequence shared by two or more input
genomes are subjected to ungapped extension until the seed pattern no longer
matches. The result is an ungapped local multiple alignment with at most one
component from each of the input genome sequences.

Computing a pairwise genome content distance matrix and guide tree
------------------------------------------------------------------

progressiveMauve builds up genome alignments progressively according to a guide
tree. The guide tree is computed based on an estimate of the shared gene
content among each pair of input genomes. For a pair of input genomes, gx and
gy, shared gene content is estimated by counting the number of nucleotides in
gx and gy aligned to each other in the initial set of local multiple
alignments. The count is normalized to a similarity value between 0 and 1 by
dividing by the average size of gx and gy. The similarity value is subtracted
from 1 to arrive at a distance estimate. Neighbor joining is then applied to
the matrix of distance estimates to yield a guide tree topology. Note that the
guide tree is not intended to be a phylogeny indicative of the genealogy of
input genomes. It is merely a computational crutch for progressive genome
alignment. Also note that alignments are later refined independently of a
single guide tree topology to avoid biasing later phylogenetic inference.

Computing a pairwise breakpoint distance matrix
-----------------------------------------------

Prior to alignment, progressiveMauve attempts to compute a conservative
estimate of the number of rearrangement breakpoints among any pair of genomes.
For each pair of genomes, pairwise alignments are created from the
local-multiple alignments and the pairwise alignments are subjected to greedy
breakpoint elimination. The breakpoint penalty used for greedy breakpoint
elimination is set high for closely related genomes and scaled downward
according to the estimate of genomic content distance. Because the breakpoint
penalty is high, the resulting set of locally collinear blocks represent
robustly supported segmental homology, and a conservative estimate of the
breakpoint distance can be made on this basis. The conservative estimate of
breakpoint distance is used later during progressive alignment to scale
breakpoint penalties.

Progressive genome alignment
----------------------------

A genome alignment is progressively built up according to the guide tree. At
each step of the progressive genome alignment, alignment anchors are selected
from the initial set of local multiple alignments. Anchors are selected so that
they maximize a Sum-of-pairs scoring scheme which applies a penalty for
predicting breakpoints among any pair of genomes. Because rates of genomic
rearrangement are highly variable, especially in some bacterial pathogens, some
genomes may be expected to exhibit greater rearrangement than others. As such,
a single choice of scoring penalty is unlikely to yield accurate alignments for
all genomes. To cope with this phenomenon, progressiveMauve scales the
breakpoint penalty according to the expected level of sequence divergence and
the number of well-supported genomic rearrangements among the pair of input
genomes. These scaling values are taken from the distance matrices computed
earlier in the algorithm.

Anchored alignment
------------------

Once anchors have been computed at a node in the guide tree, a global alignment
is computed on the basis of the anchors. Given a set of anchors among two
genomes, a genome and an alignment, or a pair of alignments, a modified MUSCLE
global alignment algorithm is applied to compute an anchored profile-profile
alignment. MUSCLE is then used to perform tree-independent iterative refinement
on the global genome alignment.

Rejecting alignment of unrelated sequence
-----------------------------------------

Although we compute a global alignment among sequences, genomes often contain
lineage-specific sequence and are thus not globally related. The global
alignment will often contain forced alignment of unrelated sequence. A simple
hidden Markov model structure is used to detect forced alignment of unrelated
sequence, which are then removed from the alignment.

Strengths of the progressiveMauve algorithm
-------------------------------------------

-  It can be applied to a much larger number of genomes than the original Mauve
   algorithm
-  It can align more divergent genomes than the original algorithm. Genomes
   with as little as 50% nucleotide identity can be aligned
-  Manual adjustment of the alignment scoring parameters is usually not
   necessary
-  It aligns the pan-genome, e.g. regions conserved among subsets of the input
   genomes
-  It is more accurate than the previous Mauve algorithm

Notes on Reproducibility
------------------------

The command line programme progressiveMauve seems to behave differently when::

    --max-breakpoint-distance-scale=0.5 --conservation-distance-scale=0.5

are passed to the tool, compared to when those options are not passed. This
means that if you wish to precisely replicate the results you see in Galaxy at
the command line, you'll need to pass these flags with their "default" values.

@ATTRIBUTION@
]]></help>
  <expand macro="citation" />
</tool>