Mercurial > repos > iuc > progressivemauve
diff progressivemauve.xml @ 0:74093fb62bdf draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author | iuc |
---|---|
date | Wed, 17 Aug 2016 14:46:55 -0400 |
parents | |
children | bca52822843e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/progressivemauve.xml Wed Aug 17 14:46:55 2016 -0400 @@ -0,0 +1,439 @@ +<?xml version="1.0"?> +<tool id="progressivemauve" name="progressiveMauve" version="@WRAPPER_VERSION@.0"> + <description>constructs multiple genome alignments</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="stdio"/> + <version_command>progressiveMauve --version</version_command> + <command><![CDATA[ +## Symlink files in with correct extensions +#for $file in $sequences: + ln -s $file `basename $file`; +#end for + +progressiveMauve +## Input Options + +#if $apply_backbone: + --apply-backbone=$apply_backbone +#end if +--island-gap-size=$island_gap_size +$mums + +#if $seed_weight: + --seed-weight=$seed_weight +#end if + +#if $max_gapped_aligner_length: + --max-gapped-aligner-length=$max_gapped_aligner_length +#end if + +#if $match_input: + --match-input=$match_input +#end if + +$collinear +--scoring-scheme=$scoring_scheme +$no_weight_scaling + +--max-breakpoint-distance-scale=$max_breakpoint_distance_scale +--conservation-distance-scale=$conservation_distance_scale +$skip_refinement +$skip_gapped_alignment + +#if $bp_dist_estimate_min_score: + --bp-dist-estimate-min-score=$bp_dist_estimate_min_score +#end if + +#if $gap_open: + --gap-open=$gap_open +#end if + +#if $gap_extend: + --gap-extend=$gap_extend +#end if + +#if $weight: + --weight=$weight +#end if + +#if $min_scaled_penalty: + --min-scaled-penalty=$min_scaled_penalty +#end if + +--hmm-p-go-homologous=$hmm_p_go_homologous +--hmm-p-go-unrelated=$hmm_p_go_unrelated +--hmm-identity=$hmm_identity + +$seed_family +$solid_seeds +$coding_seeds +$no_recursion +$disable_backbone + +## Outputs +--output=$output +#if $output_guide_tree: + --output-guide-tree=$output_guide_tree_file +#end if + +#if $output_backbone: + --backbone-output=$output_backbone_file +#end if + +## Sequences +#for file in $sequences: + `basename "${file}"` +#end for + +]]></command> + <inputs> + <param type="data" format="fasta" name="sequences" multiple="True" + label="Select sequences to align" help="in fasta format" /> + <param type="data" format="xmfa" label="Apply Backbone" name="apply_backbone" optional="True" + help="Read an existing sequence alignment in XMFA format and apply backbone statistics to it (--apply-backbone)" /> + + <param type="integer" label="Island gap size" value="20" name="island_gap_size" + help="Alignment gaps above this size in nucleotides are considered to be islands (--island-gap-size)"/> + + <param type="boolean" truevalue="--disable-backbone" falsevalue="" name="disable_backbone" + label="Disable backbone" help="Disable backbone detection (--disable-backbone)" /> + + <param type="boolean" truevalue="True" falsevalue="" name="output_guide_tree" + label="Output Guide Tree" help="Write out the guide tree used for alignment to a file (--output-guide-tree)" /> + + <param type="boolean" truevalue="True" falsevalue="" name="output_backbone" + label="Output Backbone" help="Write out the backbone to a file (--backbone-output)" /> + + <param type="boolean" truevalue="--mums" falsevalue="" label="MUMs" name="mums" + help="Find MUMs only, do not attempt to determine locally collinear blocks (LCBs) (--mums)" /> + + <param type="integer" label="Seed weight" name="seed_weight" value="0" optional="True" + help="Use the specified seed weight for calculating initial anchors (--seed-weight)" /> + + <param type="data" format="tabular" label="Match Input" name="match_input" optional="True" + help="Use specified match file instead of searching for matches (--match-input)" /> + + <!--<param type="file" label="input-id-matrix" help="An identity matrix describing similarity among all pairs of input sequences/alignments (- -input-id-matrix)" />--> + <param type="integer" label="Max gapped aligner length" value="0" optional="True" name="max_gapped_aligner_length" + help="Maximum number of base pairs to attempt aligning with the gapped aligner (--max-gapped-aligner-length)" /> + + <param type="data" format="nhx" label="input-guide-tree" optional="True" name="input_guide_tree" + help="A phylogenetic guide tree in Newick format that describes the order in which sequences will be aligned (--input-guide-tree)" /> + + <param type="boolean" truevalue="--collinear" falsevalue="" label="Collinear inputs" name="collinear" + help="Assume that input sequences are collinear--they have no rearrangements (--collinear)" /> + + <param type="select" label="Scoring scheme" name="scoring_scheme" help="Selects the anchoring score function. (--scoring-scheme)" > + <option value="sp" selected="True">Extant sum-of-pairs (sp)</option> + <option value="ancestral_sp">Sum-of-pairs + Ancestral (ancestral_sp)</option> + <option value="ancestral">Ancestral (ancestral)</option> + </param> + + <param type="boolean" truevalue="--no-weight-scaling" falsevalue="" label="No weight scaling" name="no_weight_scaling" + help="Don't scale LCB weights by conservation distance and breakpoint distance (--no-weight-scaling)" /> + + <param type="float" min="0" max="1" label="max-breakpoint-distance-scale" value="0.5" name="max_breakpoint_distance_scale" + help="Set the maximum weight scaling by breakpoint distance. (--max-breakpoint-distance-scale)" /> + + <param type="float" min="0" max="1" label="conservation-distance-scale" value="0.5" name="conservation_distance_scale" + help="Scale conservation distances by this amount. (--conservation-distance-scale)" /> + + <param type="boolean" truevalue="--skip-refinement" falsevalue="" label="Skip refinement" name="skip_refinement" + help="Do not perform iterative refinement (--skip-refinement)" /> + <param type="boolean" truevalue="--skip-gapped-alignment" falsevalue="" label="Skip gapped alignment" name="skip_gapped_alignment" + help="Do not perform gapped alignment (--skip-gapped-alignment)" /> + <param type="integer" label="BP dist estimate min score" name="bp_dist_estimate_min_score" value="0" optional="True" + help="Minimum LCB score for estimating pairwise breakpoint distance (--bp-dist-estimate-min-score)" /> + + <param type="integer" label="Gap open" name="gap_open" value="0" optional="True" + help="Gap open penalty (--gap-open)" /> + + <param type="select" label="Repeat penalty" name="repeat_penalty" + help="Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. (--repeat-penalty)"> + <option value="negative" selected="True">Negative</option> + <option value="zero">Zero</option> + </param> + + <param type="integer" label="Gap extend" name="gap_extend" value="0" optional="True" + help="Gap extend penalty (--gap-extend)" /> + + <!--<param type="data" label="Substitution matrix" --> + <!--help="Nucleotide substitution matrix in NCBI format (- -substitution-matrix)" />--> + + <param type="integer" label="Weight" name="weight" value="0" optional="True" + help="Minimum pairwise LCB score (--weight)" /> + <param type="integer" label="Min scaled penalty" name="min_scaled_penalty" value="0" optional="True" + help="Minimum breakpoint penalty after scaling the penalty by expected divergence (--min-scaled-penalty)" /> + + <param type="float" label="HMM p go homologous" name="hmm_p_go_homologous" min="0" max="1" value="0.00001" + help="Probability of transitioning from the unrelated to the homologous state (--hmm-p-go-homologous)" /> + <param type="float" label="HMM p go unrelated" name="hmm_p_go_unrelated" min="0" max="1" value="0.000000001" + help="Probability of transitioning from the homologous to the unrelated state (--hmm-p-go-unrelated)" /> + <param type="float" label="HMM identity" name="hmm_identity" min="0" max="1" value="0.7" + help="Expected level of sequence identity among pairs of sequences(--hmm-identity)" /> + + <param type="boolean" truevalue="--seed-family" falsevalue="" label="Seed family" name="seed_family" + help="Use a family of spaced seeds to improve sensitivity (--seed-family)" /> + <param type="boolean" truevalue="--solid-seeds" falsevalue="" label="Solid seeds" name="solid_seeds" + help="Use solid seeds. Do not permit substitutions in anchor matches. (--solid-seeds)" /> + <param type="boolean" truevalue="--coding-seeds" falsevalue="" label="Coding seeds" name="coding_seeds" + help="Use coding pattern seeds. Useful to generate matches coding regions with 3rd codon position degeneracy. (--coding-seeds)" /> + <param type="boolean" truevalue="--no-recursion" falsevalue="" label="No recursion" name="no_recursion" + help="Disable recursive anchor search (--no-recursion)" /> + </inputs> + <outputs> + <data format="xmfa" name="output" label="${tool.name} alignment of ${on_string}"> + <change_format> + <when input="mums" value="--mums" format="tabular" /> + </change_format> + </data> + <data format="nhx" name="output_guide_tree_file" label="${tool.name} alignment of ${on_string}: Guide tree"> + <when>output_guide_tree</when> + </data> + <data format="tabular" name="output_backbone_file" label="${tool.name} alignment of ${on_string}: Backbone"> + <when>output_backbone</when> + </data> + </outputs> + <tests> + <test> + <param name="sequences" value="phagey.fa,karma.fa" /> + <output name="output" file="1.xmfa" lines_diff="20"/> + </test> + <test> + <param name="sequences" value="merged.fa" /> + <output name="output" file="1.xmfa" lines_diff="20"/> + </test> + <test> + <param name="sequences" value="merged.fa" /> + <param name="output_guide_tree" value="True" /> + <output name="output" file="1.xmfa" lines_diff="20"/> + <output name="output_guide_tree_file" file="1.nhx" /> + </test> + <test> + <param name="sequences" value="merged.fa" /> + <param name="mums" value="True" /> + <output name="output" file="1.mums" compare="sim_size" delta="1000"/> + </test> + <test> + <param name="sequences" value="merged.fa" /> + <param name="match_input" value="1.mums" /> + <output name="output" file="1.xmfa" lines_diff="24"/> + </test> + </tests> + <help><![CDATA[ +What it does +============ + +Mauve is a system for efficiently constructing multiple genome alignments in +the presence of large-scale evolutionary events such as rearrangement and +inversion. Multiple genome alignment provides a basis for research into +comparative genomics and the study of evolutionary dynamics. Aligning whole +genomes is a fundamentally different problem than aligning short sequences. + +Mauve has been developed with the idea that a multiple genome aligner should +require only modest computational resources. It employs algorithmic techniques +that scale well in the amount of sequence being aligned. For example, a pair of +Y. pestis genomes can be aligned in under a minute, while a group of 9 +divergent Enterobacterial genomes can be aligned in a few hours. + +progressiveMauve XMFA alignment visualized with the Mauve tool: + +.. image:: $PATH_TO_IMAGES/hemolysin.jpg + +Example Usage +============= + ++-----------------------------------+-------------+ +| Usage | Notes | ++===================================+=============+ +| Align genomes |Simply | +| |select as | +| |many fasta | +| |files with | +| |one or more | +| |sequences as | +| |necessary | ++-----------------------------------+-------------+ +| Align genomes but also save |Use the | +| the guide tree and produce a |**Output | +| backbone file |Guide Tree** | +| |and **Output | +| |Backbone** | +| |options | ++-----------------------------------+-------------+ +| Align genomes, but do not |Use the | +| detect forced alignment of |**Disable | +| unrelated sequences |backbone** | +| |option | ++-----------------------------------+-------------+ +| Detect forced alignment of |Use the | +| unrelated sequence in the |**Apply | +| alignment produced |Backbone** | +| in previous example, use |option and | +| custom Homology HMM transition |specify the | +| parameters. |XMFA file | +| |produced | +| |in the | +| |previous | +| |example | ++-----------------------------------+-------------+ +| Compute ungapped |Use the | +| local-multiple alignments among |**MUMs** | +| the input sequences |option | ++-----------------------------------+-------------+ +| Compute an alignment of the |Set the | +| same genomes, using previously |**Match | +| computed local-multiple |Input** to | +| alignments |the tabular | +| |MUMs file | +| |produced in | +| |the previous | +| |example | ++-----------------------------------+-------------+ +| Set a minimum scaled |Use the | +| breakpoint penalty to cope with |**Min Scaled | +| the case where most genomes |Penalty** and| +| are aligned correctly, but manual |set to a | +| inspection reveals that |value like | +| a divergent genome has too |5000 | +| many predicted rearrangements. | | ++-----------------------------------+-------------+ +| Globally align a set of |Use the | +| collinear virus |**Colinear**,| +| genomes, using seed families |**Seed | +| to improve anchoring sensitivity |Family** | +| in regions below 70% sequence |options | +| identity. | | ++-----------------------------------+-------------+ + + +The progressiveMauve algorithm: addressing limitations of the original algorithm +================================================================================ + +Comparative genomics has revealed that closely-related bacteria often have +highly divergent gene content. While the original Mauve algorithm could align +regions conserved among all organisms, the portion of the genome conserved +among all taxa (the core genome) shrinks as more taxa are added to the +analysis. As such, the original Mauve algorithm did not scale well to large +numbers of taxa because it could not align regions conserved among subsets of +the genomes under study. progressiveMauve employs a different algorithmic +approach to scoring alignments that allows alignment of segments conserved +among subsets of taxa. The progressiveMauve algorithm has been described in +Aaron Darling's Ph.D. Thesis, and is also the subject of a manuscript published +in PLoS ONE. A brief overview is given here. + +Finding initial local multiple alignments +----------------------------------------- + +progressiveMauve elaborates on the original algorithm for finding local +multiple alignments. Instead of using a single seed pattern for match +filtration, progressiveMauve uses a combination of three seed patterns for +improved sensitivity. The palindromic seed patterns have been described in +Darling et al. 2006 "Procrastination leads to efficient filtration for local +multiple alignment" + +Seed matches which represent a unique subsequence shared by two or more input +genomes are subjected to ungapped extension until the seed pattern no longer +matches. The result is an ungapped local multiple alignment with at most one +component from each of the input genome sequences. + +Computing a pairwise genome content distance matrix and guide tree +------------------------------------------------------------------ + +progressiveMauve builds up genome alignments progressively according to a guide +tree. The guide tree is computed based on an estimate of the shared gene +content among each pair of input genomes. For a pair of input genomes, g.x and +g.y, shared gene content is estimated by counting the number of nucleotides in +gx and gy aligned to each other in the initial set of local multiple +alignments. The count is normalized to a similarity value between 0 and 1 by +dividing by the average size of gx and gy. The similarity value is subtracted +from 1 to arrive at a distance estimate. Neighbor joining is then applied to +the matrix of distance estimates to yield a guide tree topology. Note that the +guide tree is not intended to be a phylogeny indicative of the genealogy of +input genomes. It is merely a computational crutch for progressive genome +alignment. Also note that alignments are later refined independently of a +single guide tree toplogy to avoid biasing later phylogenetic inference. + +Computing a pairwise breakpoint distance matrix +----------------------------------------------- + +Prior to alignment, progressiveMauve attempts to compute a conservative +estimate of the number of rearrangement breakpoints among any pair of genomes. +For each pair of genomes, pairwise alignments are created from the +local-multiple alignments and the pairwise alignments are subjected to greedy +breakpoint elimination. The breakpoint penalty used for greedy breakpoint +elimination is set high for closely related genomes and scaled downward +according to the estimate of genomic content distance. Because the breakpoint +penalty is high, the resulting set of locally collinear blocks represent +robustly supported segmental homology, and a conservative estimate of the +breakpoint distance can be made on this basis. The conservative estimate of +breakpoint distance is used later during progressive alignment to scale +breakpoint penalties. + +Progressive genome alignment +---------------------------- + +A genome alignment is progressively built up according to the guide tree. At +each step of the progressive genome alignment, alignment anchors are selected +from the initial set of local multiple alignments. Anchors are selected so that +they maximize a Sum-of-pairs scoring scheme which applies a penalty for +predicting breakpoints among any pair of genomes. Because rates of genomic +rearrangement are highly variable, especially in some bacterial pathogens, some +genomes may be expected to exhibit greater rearrangement than others. As such, +a single choice of scoring penalty is unlikely to yield accurate alignments for +all genomes. To cope with this phenomenon, progressiveMauve scales the +breakpoint penalty according to the expected level of sequence divergence and +the number of well-supported genomic rearrangements among the pair of input +genomes. These scaling values are taken from the distance matrices computed +earlier in the algorithm. + +Anchored alignment +------------------ + +Once anchors have been computed at a node in the guide tree, a global alignment +is computed on the basis of the anchors. Given a set of anchors among two +genomes, a genome and an alignment, or a pair of alignments, a modified MUSCLE +global alignment algorithm is applied to compute an anchored profile-profile +alignment. MUSCLE is then used to perform tree-independent iterative refinement +on the global genome alignment. + +Rejecting alignment of unrelated sequence +----------------------------------------- + +Although we compute a global alignment among sequences, genomes often contain +lineage-specific sequence and are thus not globally related. The global +alignment will often contain forced alignment of unrelated sequence. A simple +hidden Markov model structure is used to detect forced alignment of unrelated +sequence, which are then removed from the alignment. + +Strengths of the progressiveMauve algorithm +------------------------------------------- + +- It can be applied to a much larger number of genomes than the original Mauve + algorithm +- It can align more divergent genomes than the original algorithm. Genomes + with as little as 50% nucleotide identity can be alignable +- Manual adjustment of the alignment scoring parameters is usually not + necessary +- It aligns the pan-genome, e.g. regions conserved among subsets of the input + genomes +- It is more accurate than the previous Mauve algorithm + +Notes on Reproducibility +------------------------ + +The command line programme progressiveMauve seems to behave differently when:: + + --max-breakpoint-distance-scale=0.5 --conservation-distance-scale=0.5 + +are passed to the tool, compared to when those options are not passed. This +means that if you wish to precisely replicate the results you see in Galaxy at +the command line, you'll need to pass these flags with their "default" values. + +@ATTRIBUTION@ +]]></help> + <expand macro="citation" /> +</tool>