Mercurial > repos > iuc > progressivemauve
comparison progressivemauve.xml @ 0:74093fb62bdf draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author | iuc |
---|---|
date | Wed, 17 Aug 2016 14:46:55 -0400 |
parents | |
children | bca52822843e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:74093fb62bdf |
---|---|
1 <?xml version="1.0"?> | |
2 <tool id="progressivemauve" name="progressiveMauve" version="@WRAPPER_VERSION@.0"> | |
3 <description>constructs multiple genome alignments</description> | |
4 <macros> | |
5 <import>macros.xml</import> | |
6 </macros> | |
7 <expand macro="requirements"/> | |
8 <expand macro="stdio"/> | |
9 <version_command>progressiveMauve --version</version_command> | |
<command><![CDATA[
## Symlink every input dataset into the working directory under its own
## basename; progressiveMauve is pointed at these local names below.
#for $file in $sequences:
    ln -s "${file}" `basename "${file}"`;
#end for

progressiveMauve
    ## Input options. Optional parameters are wrapped in #if so that the
    ## flag is only emitted when the parameter evaluates true; boolean
    ## parameters expand to their literal flag or to an empty string.
    #if $apply_backbone:
        --apply-backbone=$apply_backbone
    #end if
    --island-gap-size=$island_gap_size
    $mums

    #if $seed_weight:
        --seed-weight=$seed_weight
    #end if

    #if $max_gapped_aligner_length:
        --max-gapped-aligner-length=$max_gapped_aligner_length
    #end if

    #if $match_input:
        --match-input=$match_input
    #end if

    ## The guide tree parameter is declared in the inputs section; forward it
    ## when a dataset was supplied instead of silently ignoring it.
    #if $input_guide_tree:
        --input-guide-tree=$input_guide_tree
    #end if

    $collinear
    --scoring-scheme=$scoring_scheme
    $no_weight_scaling

    --max-breakpoint-distance-scale=$max_breakpoint_distance_scale
    --conservation-distance-scale=$conservation_distance_scale
    $skip_refinement
    $skip_gapped_alignment

    #if $bp_dist_estimate_min_score:
        --bp-dist-estimate-min-score=$bp_dist_estimate_min_score
    #end if

    #if $gap_open:
        --gap-open=$gap_open
    #end if

    #if $gap_extend:
        --gap-extend=$gap_extend
    #end if

    ## Only a non-default choice is forwarded, so the command line produced
    ## for the preselected "negative" option is the same as before.
    #if str($repeat_penalty) != "negative":
        --repeat-penalty=$repeat_penalty
    #end if

    #if $weight:
        --weight=$weight
    #end if

    #if $min_scaled_penalty:
        --min-scaled-penalty=$min_scaled_penalty
    #end if

    --hmm-p-go-homologous=$hmm_p_go_homologous
    --hmm-p-go-unrelated=$hmm_p_go_unrelated
    --hmm-identity=$hmm_identity

    $seed_family
    $solid_seeds
    $coding_seeds
    $no_recursion
    $disable_backbone

    ## Outputs. The guide tree and backbone files are only requested when the
    ## matching checkbox is ticked.
    --output=$output
    #if $output_guide_tree:
        --output-guide-tree=$output_guide_tree_file
    #end if

    #if $output_backbone:
        --backbone-output=$output_backbone_file
    #end if

    ## Sequences: the symlinked names created at the top, in selection order.
    #for $file in $sequences:
        `basename "${file}"`
    #end for

]]></command>
<inputs>
    <!-- Sequence data and backbone handling -->
    <param name="sequences" type="data" format="fasta" multiple="True"
           label="Select sequences to align"
           help="in fasta format" />
    <param name="apply_backbone" type="data" format="xmfa" optional="True"
           label="Apply Backbone"
           help="Read an existing sequence alignment in XMFA format and apply backbone statistics to it (--apply-backbone)" />

    <param name="island_gap_size" type="integer" value="20"
           label="Island gap size"
           help="Alignment gaps above this size in nucleotides are considered to be islands (--island-gap-size)" />

    <param name="disable_backbone" type="boolean" truevalue="--disable-backbone" falsevalue=""
           label="Disable backbone"
           help="Disable backbone detection (--disable-backbone)" />

    <!-- Switches for the optional output datasets; the command template
         tests them, they are not inserted into the command line verbatim. -->
    <param name="output_guide_tree" type="boolean" truevalue="True" falsevalue=""
           label="Output Guide Tree"
           help="Write out the guide tree used for alignment to a file (--output-guide-tree)" />

    <param name="output_backbone" type="boolean" truevalue="True" falsevalue=""
           label="Output Backbone"
           help="Write out the backbone to a file (--backbone-output)" />

    <!-- Match finding and anchoring -->
    <param name="mums" type="boolean" truevalue="--mums" falsevalue=""
           label="MUMs"
           help="Find MUMs only, do not attempt to determine locally collinear blocks (LCBs) (--mums)" />

    <param name="seed_weight" type="integer" value="0" optional="True"
           label="Seed weight"
           help="Use the specified seed weight for calculating initial anchors (--seed-weight)" />

    <param name="match_input" type="data" format="tabular" optional="True"
           label="Match Input"
           help="Use specified match file instead of searching for matches (--match-input)" />

    <!--<param type="file" label="input-id-matrix" help="An identity matrix describing similarity among all pairs of input sequences/alignments (- -input-id-matrix)" />-->
    <param name="max_gapped_aligner_length" type="integer" value="0" optional="True"
           label="Max gapped aligner length"
           help="Maximum number of base pairs to attempt aligning with the gapped aligner (--max-gapped-aligner-length)" />

    <param name="input_guide_tree" type="data" format="nhx" optional="True"
           label="input-guide-tree"
           help="A phylogenetic guide tree in Newick format that describes the order in which sequences will be aligned (--input-guide-tree)" />

    <param name="collinear" type="boolean" truevalue="--collinear" falsevalue=""
           label="Collinear inputs"
           help="Assume that input sequences are collinear--they have no rearrangements (--collinear)" />

    <!-- Scoring and weight scaling -->
    <param name="scoring_scheme" type="select"
           label="Scoring scheme"
           help="Selects the anchoring score function. (--scoring-scheme)">
        <option value="sp" selected="True">Extant sum-of-pairs (sp)</option>
        <option value="ancestral_sp">Sum-of-pairs + Ancestral (ancestral_sp)</option>
        <option value="ancestral">Ancestral (ancestral)</option>
    </param>

    <param name="no_weight_scaling" type="boolean" truevalue="--no-weight-scaling" falsevalue=""
           label="No weight scaling"
           help="Don't scale LCB weights by conservation distance and breakpoint distance (--no-weight-scaling)" />

    <param name="max_breakpoint_distance_scale" type="float" min="0" max="1" value="0.5"
           label="max-breakpoint-distance-scale"
           help="Set the maximum weight scaling by breakpoint distance. (--max-breakpoint-distance-scale)" />

    <param name="conservation_distance_scale" type="float" min="0" max="1" value="0.5"
           label="conservation-distance-scale"
           help="Scale conservation distances by this amount. (--conservation-distance-scale)" />

    <!-- Refinement, gapped alignment and penalties -->
    <param name="skip_refinement" type="boolean" truevalue="--skip-refinement" falsevalue=""
           label="Skip refinement"
           help="Do not perform iterative refinement (--skip-refinement)" />
    <param name="skip_gapped_alignment" type="boolean" truevalue="--skip-gapped-alignment" falsevalue=""
           label="Skip gapped alignment"
           help="Do not perform gapped alignment (--skip-gapped-alignment)" />
    <param name="bp_dist_estimate_min_score" type="integer" value="0" optional="True"
           label="BP dist estimate min score"
           help="Minimum LCB score for estimating pairwise breakpoint distance (--bp-dist-estimate-min-score)" />

    <param name="gap_open" type="integer" value="0" optional="True"
           label="Gap open"
           help="Gap open penalty (--gap-open)" />

    <param name="repeat_penalty" type="select"
           label="Repeat penalty"
           help="Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. (--repeat-penalty)">
        <option value="negative" selected="True">Negative</option>
        <option value="zero">Zero</option>
    </param>

    <param name="gap_extend" type="integer" value="0" optional="True"
           label="Gap extend"
           help="Gap extend penalty (--gap-extend)" />

    <!--<param type="data" label="Substitution matrix" -->
    <!--help="Nucleotide substitution matrix in NCBI format (- -substitution-matrix)" />-->

    <param name="weight" type="integer" value="0" optional="True"
           label="Weight"
           help="Minimum pairwise LCB score (--weight)" />
    <param name="min_scaled_penalty" type="integer" value="0" optional="True"
           label="Min scaled penalty"
           help="Minimum breakpoint penalty after scaling the penalty by expected divergence (--min-scaled-penalty)" />

    <!-- Homology HMM parameters -->
    <param name="hmm_p_go_homologous" type="float" min="0" max="1" value="0.00001"
           label="HMM p go homologous"
           help="Probability of transitioning from the unrelated to the homologous state (--hmm-p-go-homologous)" />
    <param name="hmm_p_go_unrelated" type="float" min="0" max="1" value="0.000000001"
           label="HMM p go unrelated"
           help="Probability of transitioning from the homologous to the unrelated state (--hmm-p-go-unrelated)" />
    <param name="hmm_identity" type="float" min="0" max="1" value="0.7"
           label="HMM identity"
           help="Expected level of sequence identity among pairs of sequences(--hmm-identity)" />

    <!-- Seed pattern selection and anchor search -->
    <param name="seed_family" type="boolean" truevalue="--seed-family" falsevalue=""
           label="Seed family"
           help="Use a family of spaced seeds to improve sensitivity (--seed-family)" />
    <param name="solid_seeds" type="boolean" truevalue="--solid-seeds" falsevalue=""
           label="Solid seeds"
           help="Use solid seeds. Do not permit substitutions in anchor matches. (--solid-seeds)" />
    <param name="coding_seeds" type="boolean" truevalue="--coding-seeds" falsevalue=""
           label="Coding seeds"
           help="Use coding pattern seeds. Useful to generate matches coding regions with 3rd codon position degeneracy. (--coding-seeds)" />
    <param name="no_recursion" type="boolean" truevalue="--no-recursion" falsevalue=""
           label="No recursion"
           help="Disable recursive anchor search (--no-recursion)" />
</inputs>
<outputs>
    <!-- Primary result. Declared as XMFA; switched to tabular when the
         "mums" parameter carries its true value. -->
    <data format="xmfa" name="output" label="${tool.name} alignment of ${on_string}">
        <change_format>
            <when input="mums" value="--mums" format="tabular" />
        </change_format>
    </data>
    <!-- Optional results. A filter element (not a bare "when", which is only
         meaningful inside change_format) makes each dataset conditional on
         its checkbox, so it is not created when the option is off. -->
    <data format="nhx" name="output_guide_tree_file" label="${tool.name} alignment of ${on_string}: Guide tree">
        <filter>output_guide_tree</filter>
    </data>
    <data format="tabular" name="output_backbone_file" label="${tool.name} alignment of ${on_string}: Backbone">
        <filter>output_backbone</filter>
    </data>
</outputs>
<tests>
    <!-- Two separate FASTA inputs, default settings. -->
    <test>
        <param name="sequences" value="phagey.fa,karma.fa" />
        <output name="output" file="1.xmfa" lines_diff="20" />
    </test>

    <!-- Single multi-record FASTA input, compared against the same alignment. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <output name="output" file="1.xmfa" lines_diff="20" />
    </test>

    <!-- Guide tree output requested alongside the alignment. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="output_guide_tree" value="True" />
        <output name="output" file="1.xmfa" lines_diff="20" />
        <output name="output_guide_tree_file" file="1.nhx" />
    </test>

    <!-- MUMs-only mode; output is compared by size rather than content. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="mums" value="True" />
        <output name="output" file="1.mums" compare="sim_size" delta="1000" />
    </test>

    <!-- Alignment seeded from a previously computed match file. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="match_input" value="1.mums" />
        <output name="output" file="1.xmfa" lines_diff="24" />
    </test>
</tests>
226 <help><![CDATA[ | |
227 What it does | |
228 ============ | |
229 | |
230 Mauve is a system for efficiently constructing multiple genome alignments in | |
231 the presence of large-scale evolutionary events such as rearrangement and | |
232 inversion. Multiple genome alignment provides a basis for research into | |
233 comparative genomics and the study of evolutionary dynamics. Aligning whole | |
234 genomes is a fundamentally different problem than aligning short sequences. | |
235 | |
236 Mauve has been developed with the idea that a multiple genome aligner should | |
237 require only modest computational resources. It employs algorithmic techniques | |
238 that scale well in the amount of sequence being aligned. For example, a pair of | |
239 Y. pestis genomes can be aligned in under a minute, while a group of 9 | |
240 divergent Enterobacterial genomes can be aligned in a few hours. | |
241 | |
242 progressiveMauve XMFA alignment visualized with the Mauve tool: | |
243 | |
244 .. image:: $PATH_TO_IMAGES/hemolysin.jpg | |
245 | |
246 Example Usage | |
247 ============= | |
248 | |
249 +-----------------------------------+-------------+ | |
250 | Usage | Notes | | |
251 +===================================+=============+ | |
252 | Align genomes |Simply | | |
253 | |select as | | |
254 | |many fasta | | |
255 | |files with | | |
256 | |one or more | | |
257 | |sequences as | | |
258 | |necessary | | |
259 +-----------------------------------+-------------+ | |
260 | Align genomes but also save |Use the | | |
261 | the guide tree and produce a |**Output | | |
262 | backbone file |Guide Tree** | | |
263 | |and **Output | | |
264 | |Backbone** | | |
265 | |options | | |
266 +-----------------------------------+-------------+ | |
267 | Align genomes, but do not |Use the | | |
268 | detect forced alignment of |**Disable | | |
269 | unrelated sequences |backbone** | | |
270 | |option | | |
271 +-----------------------------------+-------------+ | |
272 | Detect forced alignment of |Use the | | |
273 | unrelated sequence in the |**Apply | | |
274 | alignment produced |Backbone** | | |
275 | in previous example, use |option and | | |
276 | custom Homology HMM transition |specify the | | |
277 | parameters. |XMFA file | | |
278 | |produced | | |
279 | |in the | | |
280 | |previous | | |
281 | |example | | |
282 +-----------------------------------+-------------+ | |
283 | Compute ungapped |Use the | | |
284 | local-multiple alignments among |**MUMs** | | |
285 | the input sequences |option | | |
286 +-----------------------------------+-------------+ | |
287 | Compute an alignment of the |Set the | | |
288 | same genomes, using previously |**Match | | |
289 | computed local-multiple |Input** to | | |
290 | alignments |the tabular | | |
291 | |MUMs file | | |
292 | |produced in | | |
293 | |the previous | | |
294 | |example | | |
295 +-----------------------------------+-------------+ | |
296 | Set a minimum scaled |Use the | | |
297 | breakpoint penalty to cope with |**Min Scaled | | |
298 | the case where most genomes |Penalty** and| | |
299 | are aligned correctly, but manual |set to a | | |
300 | inspection reveals that |value like | | |
301 | a divergent genome has too |5000 | | |
302 | many predicted rearrangements. | | | |
303 +-----------------------------------+-------------+ | |
304 | Globally align a set of |Use the | | |
305 | collinear virus |**Collinear**| |
306 | genomes, using seed families |and **Seed | |
307 | to improve anchoring sensitivity |Family** | | |
308 | in regions below 70% sequence |options | | |
309 | identity. | | | |
310 +-----------------------------------+-------------+ | |
311 | |
312 | |
313 The progressiveMauve algorithm: addressing limitations of the original algorithm | |
314 ================================================================================ | |
315 | |
316 Comparative genomics has revealed that closely-related bacteria often have | |
317 highly divergent gene content. While the original Mauve algorithm could align | |
318 regions conserved among all organisms, the portion of the genome conserved | |
319 among all taxa (the core genome) shrinks as more taxa are added to the | |
320 analysis. As such, the original Mauve algorithm did not scale well to large | |
321 numbers of taxa because it could not align regions conserved among subsets of | |
322 the genomes under study. progressiveMauve employs a different algorithmic | |
323 approach to scoring alignments that allows alignment of segments conserved | |
324 among subsets of taxa. The progressiveMauve algorithm has been described in | |
325 Aaron Darling's Ph.D. Thesis, and is also the subject of a manuscript published | |
326 in PLoS ONE. A brief overview is given here. | |
327 | |
328 Finding initial local multiple alignments | |
329 ----------------------------------------- | |
330 | |
331 progressiveMauve elaborates on the original algorithm for finding local | |
332 multiple alignments. Instead of using a single seed pattern for match | |
333 filtration, progressiveMauve uses a combination of three seed patterns for | |
334 improved sensitivity. The palindromic seed patterns have been described in | |
335 Darling et al. 2006 "Procrastination leads to efficient filtration for local | |
336 multiple alignment". | |
337 | |
338 Seed matches which represent a unique subsequence shared by two or more input | |
339 genomes are subjected to ungapped extension until the seed pattern no longer | |
340 matches. The result is an ungapped local multiple alignment with at most one | |
341 component from each of the input genome sequences. | |
342 | |
343 Computing a pairwise genome content distance matrix and guide tree | |
344 ------------------------------------------------------------------ | |
345 | |
346 progressiveMauve builds up genome alignments progressively according to a guide | |
347 tree. The guide tree is computed based on an estimate of the shared gene | |
348 content among each pair of input genomes. For a pair of input genomes, gx and | |
349 gy, shared gene content is estimated by counting the number of nucleotides in | |
350 gx and gy aligned to each other in the initial set of local multiple | |
351 alignments. The count is normalized to a similarity value between 0 and 1 by | |
352 dividing by the average size of gx and gy. The similarity value is subtracted | |
353 from 1 to arrive at a distance estimate. Neighbor joining is then applied to | |
354 the matrix of distance estimates to yield a guide tree topology. Note that the | |
355 guide tree is not intended to be a phylogeny indicative of the genealogy of | |
356 input genomes. It is merely a computational crutch for progressive genome | |
357 alignment. Also note that alignments are later refined independently of a | |
358 single guide tree topology to avoid biasing later phylogenetic inference. | |
359 | |
360 Computing a pairwise breakpoint distance matrix | |
361 ----------------------------------------------- | |
362 | |
363 Prior to alignment, progressiveMauve attempts to compute a conservative | |
364 estimate of the number of rearrangement breakpoints among any pair of genomes. | |
365 For each pair of genomes, pairwise alignments are created from the | |
366 local-multiple alignments and the pairwise alignments are subjected to greedy | |
367 breakpoint elimination. The breakpoint penalty used for greedy breakpoint | |
368 elimination is set high for closely related genomes and scaled downward | |
369 according to the estimate of genomic content distance. Because the breakpoint | |
370 penalty is high, the resulting set of locally collinear blocks represent | |
371 robustly supported segmental homology, and a conservative estimate of the | |
372 breakpoint distance can be made on this basis. The conservative estimate of | |
373 breakpoint distance is used later during progressive alignment to scale | |
374 breakpoint penalties. | |
375 | |
376 Progressive genome alignment | |
377 ---------------------------- | |
378 | |
379 A genome alignment is progressively built up according to the guide tree. At | |
380 each step of the progressive genome alignment, alignment anchors are selected | |
381 from the initial set of local multiple alignments. Anchors are selected so that | |
382 they maximize a Sum-of-pairs scoring scheme which applies a penalty for | |
383 predicting breakpoints among any pair of genomes. Because rates of genomic | |
384 rearrangement are highly variable, especially in some bacterial pathogens, some | |
385 genomes may be expected to exhibit greater rearrangement than others. As such, | |
386 a single choice of scoring penalty is unlikely to yield accurate alignments for | |
387 all genomes. To cope with this phenomenon, progressiveMauve scales the | |
388 breakpoint penalty according to the expected level of sequence divergence and | |
389 the number of well-supported genomic rearrangements among the pair of input | |
390 genomes. These scaling values are taken from the distance matrices computed | |
391 earlier in the algorithm. | |
392 | |
393 Anchored alignment | |
394 ------------------ | |
395 | |
396 Once anchors have been computed at a node in the guide tree, a global alignment | |
397 is computed on the basis of the anchors. Given a set of anchors among two | |
398 genomes, a genome and an alignment, or a pair of alignments, a modified MUSCLE | |
399 global alignment algorithm is applied to compute an anchored profile-profile | |
400 alignment. MUSCLE is then used to perform tree-independent iterative refinement | |
401 on the global genome alignment. | |
402 | |
403 Rejecting alignment of unrelated sequence | |
404 ----------------------------------------- | |
405 | |
406 Although we compute a global alignment among sequences, genomes often contain | |
407 lineage-specific sequence and are thus not globally related. The global | |
408 alignment will often contain forced alignment of unrelated sequence. A simple | |
409 hidden Markov model structure is used to detect forced alignment of unrelated | |
410 sequence, which is then removed from the alignment. | |
411 | |
412 Strengths of the progressiveMauve algorithm | |
413 ------------------------------------------- | |
414 | |
415 - It can be applied to a much larger number of genomes than the original Mauve | |
416 algorithm | |
417 - It can align more divergent genomes than the original algorithm. Genomes | |
418 with as little as 50% nucleotide identity can be alignable | |
419 - Manual adjustment of the alignment scoring parameters is usually not | |
420 necessary | |
421 - It aligns the pan-genome, e.g. regions conserved among subsets of the input | |
422 genomes | |
423 - It is more accurate than the previous Mauve algorithm | |
424 | |
425 Notes on Reproducibility | |
426 ------------------------ | |
427 | |
428 The command line programme progressiveMauve seems to behave differently when:: | |
429 | |
430 --max-breakpoint-distance-scale=0.5 --conservation-distance-scale=0.5 | |
431 | |
432 are passed to the tool, compared to when those options are not passed. This | |
433 means that if you wish to precisely replicate the results you see in Galaxy at | |
434 the command line, you'll need to pass these flags with their "default" values. | |
435 | |
436 @ATTRIBUTION@ | |
437 ]]></help> | |
438 <expand macro="citation" /> | |
439 </tool> |