comparison garli.xml @ 2:681e9bb51cc4 draft default tip

Clean help, fix option descriptions, add genthreshfortopoterm, change filetypes to txt to make it more flexible.
author malex
date Thu, 05 Jul 2012 17:18:52 -0400
parents 4025ba8b84d6
children
comparison
equal deleted inserted replaced
1:9ce35d2d9937 2:681e9bb51cc4
3 ## The command is a Cheetah template which allows some Python based syntax. 3 ## The command is a Cheetah template which allows some Python based syntax.
4 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces 4 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
5 ## Arguments to the wrapper beyond the config file are just for Galaxy's benefit - all filenames are hardcoded 5 ## Arguments to the wrapper beyond the config file are just for Galaxy's benefit - all filenames are hardcoded
6 <command interpreter="python">garli_wrapper.py $garli_conf $best_all_tre $best_tre $log00_log $screen_log </command> 6 <command interpreter="python">garli_wrapper.py $garli_conf $best_all_tre $best_tre $log00_log $screen_log </command>
7 <inputs> 7 <inputs>
8 <param name="datafname" format="nexus" type="data" label="Nexus formated sequence file" force_select="true"/> 8 <param name="datafname" format="txt" type="data" label="Nexus formated sequence file" force_select="true"/>
9 <conditional name="choose_search_type"> 9 <conditional name="choose_search_type">
10 <param name="search_type" type="select" label="Analysis Type"> 10 <param name="search_type" type="select" label="Analysis Type">
11 <option value="mlsearch" selected="true">ML Search</option> 11 <option value="mlsearch" selected="true">ML Search</option>
12 <option value="bootstrap">Bootstrap</option> 12 <option value="bootstrap">Bootstrap</option>
13 </param> 13 </param>
14 <when value="mlsearch"> 14 <when value="mlsearch">
15 <param name="searchreps" type="integer" size="4" value="1" label="Number of replicates"> 15 <param name="searchreps" type="integer" size="4" value="1" label="Number of independent search replicates">
16 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/> 16 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/>
17 </param> 17 </param>
18 <param name="bootstrapreps" type="hidden" value="0" /> 18 <param name="bootstrapreps" type="hidden" value="0" />
19 <param name="resampleproportion" type="float" 19 <param name="resampleproportion" type="float"
20 value="1.0" label="Relative size of resample data 20 value="1.0" label="Relative size of resample data
23 </param> 23 </param>
24 </when> 24 </when>
25 <when value="bootstrap"> 25 <when value="bootstrap">
26 <param name="searchreps" type="hidden" value="0" /> 26 <param name="searchreps" type="hidden" value="0" />
27 <param name="bootstrapreps" type="integer" size="4" 27 <param name="bootstrapreps" type="integer" size="4"
28 value="1" label="Number of replicates"> 28 value="1" label="Number of bootstrap replicates">
29 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/> 29 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/>
30 </param> 30 </param>
31 </when> 31 </when>
32 </conditional> 32 </conditional>
33 <param name="constraintfile" type="data" format="text" label="Constraint file" optional="true"/> 33 <param name="constraintfile" type="data" format="text" label="Constraint file" optional="true"/>
42 </when> 42 </when>
43 <when value="random"> 43 <when value="random">
44 <param name="streefname" type="hidden" value="random"/> 44 <param name="streefname" type="hidden" value="random"/>
45 </when> 45 </when>
46 <when value="file"> 46 <when value="file">
47 <param name="streefname" format="nexus" type="data" label="Starting Tree File"/> 47 <param name="streefname" format="txt" type="data" label="Starting Tree File"/>
48 </when> 48 </when>
49 </conditional> 49 </conditional>
50 <param name="attachmentspertaxon" size="4" type="integer" value="50" 50 <param name="attachmentspertaxon" size="4" type="integer" value="50"
51 label="Attachment branches evaluated per taxon (min=1)" > 51 label="Attachment branches evaluated per taxon (min=1)" >
52 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/> 52 <validator type="in_range" message="(1-infinity)" min="1" max="inf"/>
312 <validator type="in_range" message="(0.01-1.0)" min="0.01" max="1.0"/> 312 <validator type="in_range" message="(0.01-1.0)" min="0.01" max="1.0"/>
313 </param> 313 </param>
314 <param name="distanceswapbias" type="float" size="3" value="1.0" 314 <param name="distanceswapbias" type="float" size="3" value="1.0"
315 label="Relative weight assigned to branch swaps based on locality"> 315 label="Relative weight assigned to branch swaps based on locality">
316 <validator type="in_range" message="(0.1-10.0)" min="0.1" max="10.0"/> 316 <validator type="in_range" message="(0.1-10.0)" min="0.1" max="10.0"/>
317 </param>
318 <param name="genthreshfortopoterm" type="integer" size="5" value="20000"
319 label="Number of generations without topology improvement required for termination">
320 <validator type="in_range" message="(1-20000)" min="1" max="20000"/>
317 </param> 321 </param>
318 </inputs> 322 </inputs>
319 <outputs> 323 <outputs>
320 <data format="nexus" name="best_tre" metadata_source="datafname" from_work_dir="garli.best.tre" label="${tool.name} on ${on_string}: garli.best.tre"/> 324 <data format="nexus" name="best_tre" metadata_source="datafname" from_work_dir="garli.best.tre" label="${tool.name} on ${on_string}: garli.best.tre"/>
321 <data format="nexus" name="best_all_tre" metadata_source="datafname" from_work_dir="garli.best.all.tre" label="${tool.name} on ${on_string}: garli.all.best.tre"/> 325 <data format="nexus" name="best_all_tre" metadata_source="datafname" from_work_dir="garli.best.all.tre" label="${tool.name} on ${on_string}: garli.all.best.tre"/>
340 saveevery = 100 344 saveevery = 100
341 refinestart = ${refinestart} 345 refinestart = ${refinestart}
342 outputeachbettertopology = 0 346 outputeachbettertopology = 0
343 outputcurrentbesttopology = 0 347 outputcurrentbesttopology = 0
344 enforcetermconditions = 1 348 enforcetermconditions = 1
345 genthreshfortopoterm = 20000 349 genthreshfortopoterm = ${genthreshfortopoterm}
346 scorethreshforterm = 0.05 350 scorethreshforterm = 0.05
347 significanttopochange = 0.01 351 significanttopochange = 0.01
348 outputphyliptree = 0 352 outputphyliptree = 0
349 outputmostlyuselessfiles = 0 353 outputmostlyuselessfiles = 0
350 restart = 0 354 restart = 0
406 for partitioned models and morphology-like datatypes. 410 for partitioned models and morphology-like datatypes.
407 411
408 Garli is written and maintained by Derrick Zwickl 412 Garli is written and maintained by Derrick Zwickl
409 413
410 Configuration options are adapted from 414 Configuration options are adapted from
411 https://www.nescent.org/wg_garli/GARLI_Configuration_Settings 415 https://www.nescent.org/wg_garli/GARLI_Configuration_Settings. Please see that
412 416 page for more details.
413 -----
414
415 **Detailed description of the configuration options**
416
417
418 **Analysis Type**
419
420 Specify whether to perform a maximum likelihood search for the best tree, or
421 a bootstrap analysis.
422
423
424 **Number of replicates**
425
426 Number of independent search replicates to run.
427
428
429 **Relative size of resample data**
430
431 This setting allows for bootstrap-like resampling, but with the
432 pseudoreplicate datasets having the number of alignment columns different
433 from the real data. Setting values below 1.0 is somewhat similar to
434 jackknifing, but not identical.
435
436
437 **Attachment branches evaluated per taxon (min=1)**
438
439 The number of attachment branches evaluated for each taxon to be added to
440 the tree during the creation of an ML stepwise-addition starting tree.
441 Briefly, stepwise addition is an algorithm used to make a tree, and involves
442 adding taxa in a random order to a growing tree. For each taxon to be added,
443 a number of randomly chosen attachment branches are tried and scored, and
444 then the best scoring one is chosen as the location of that taxon. This
445 setting controls how many attachment points are evaluated for each taxon to
446 be added. A value of one is equivalent to a completely random tree (only one
447 randomly chosen location is evaluated). A value of greater than 2 times the
448 number of taxa in the dataset means that all attachment points will be
449 evaluated for each taxon, and will result in very good starting trees (but
450 may take a while on large datasets). Even fairly small values (less than 10)
451 can result in starting trees that are much, much better than random, but
452 still fairly different from one another.
453
454
455 **Constraint file**
456
457 Select a file containing constraint specifications.
458
459
460 **Random seed**
461
462 Random seed can have a value of -1 or a positive integer. This is the
463 seed used by the random number generator. Specify “-1” to have a seed chosen
464 for you. Specifying the same seed number in multiple runs will give exactly
465 identical results, if all other parameters and settings are also identical.
466
467
468 **Available memory**
469
470 This lets GARLI determine how much system memory it may be able to use to
471 store computations for reuse.
472
473
474 **Perform initial rough optimization**
475
476 Specifies whether some initial rough optimization is performed on the
477 starting branch lengths and rate heterogeneity parameters. This is always
478 recommended.
479
480
481 **Outgroup taxa numbers**
482
483 The outgroup option allows for orienting tree topologies in a consistent way
484 when they are written to a file. Note that this has NO effect whatsoever on
485 the actual inference and the specified outgroup is NOT constrained to be
486 present in the inferred trees. If multiple outgroup taxa are specified and
487 they do not form a monophyletic group, this setting will be ignored. If you
488 specify a single outgroup taxon it will always be present, and the tree will
489 always be consistently oriented. To specify an outgroup consisting of taxa
490 1, 3 and 5 the format is this: outgroup = 1 3 5. Dashes are used for ranges
491 e.g. 1-3 5.
492
493
494 **Collapse branches**
495
496 Before version 1.0, all trees that are returned were fully resolved. This is
497 true even if the maximum-likelihood estimate of some internal branch lengths
498 are effectively zero (or GARLI's minimum, which is 1e-8). In such cases,
499 collapsing the branch into a polytomy would be a better representation. Note
500 that GARLI will never return a tree with an actual branch length of zero,
501 but rather with its minimum value of 1.0e-8. The drawback of always
502 returning fully resolved trees is that what is effectively a polytomy can be
503 resolved in three ways, and different independent searches may randomly
504 return one of those resolutions. Thus, if you compare the trees by topology
505 only, they will look different. If you pay attention to the branch lengths
506 and likelihood scores of the trees it will be apparent that they are
507 effectively the same. I think that collapsing of branches is particularly
508 important when bootstrapping, since no support should be given to a branch
509 that doesn't really exist, i.e., that is a random resolution of a polytomy.
510 Collapsing is also good when calculating tree to tree distances such as the
511 symmetric tree distance, for example when calculating phylogenetic error to
512 a known target tree. Zero-length branches would add to the distances
513 (~error) although they really should not.
514
515
516 **Model type**
517
518 The codon-aminoacid datatype means that the data will be supplied as a
519 nucleotide alignment, but will be internally translated and analyzed using
520 an amino acid model. The codon and codon-aminoacid datatypes require
521 nucleotide sequence that is aligned in the correct reading frame. In other
522 words, all gaps in the alignment should be a multiple of 3 in length, and
523 the alignment should start at the first position of a codon. If the
524 alignment has extra columns at the start, middle or end, they should be
525 removed or excluded with a Nexus exset (see the FAQ for an example of exset
526 usage). The correct Genetic Code must also be set.
527
528
529
530
531 **Datatype - nucleotide**
532
533 **Rate matrix**
534
535 The number of relative substitution rate parameters (note that the number of
536 free parameters is this value minus one). Equivalent to the “nst” setting in
537 PAUP* and MrBayes. 1rate assumes that substitutions between all pairs of
538 nucleotides occur at the same rate (JC model), 2rate allows different rates
539 for transitions and transversions (K2P or HKY models), and 6rate allows a
540 different rate between each nucleotide pair (GTR). These rates are estimated
541 unless the fixed option is chosen. Since version 0.96, parameters for any
542 submodel of the GTR model may be estimated. The format for specifying this
543 is very similar to that used in the “rclass” setting of PAUP*. Within
544 parentheses, six letters are specified, with spaces between them. The six
545 letters represent the rates of substitution between the six pairs of
546 nucleotides, with the order being A-C, A-G, A-T, C-G, C-T and G-T. Letters
547 within the parentheses that are the same mean that a single parameter is
548 shared by multiple nucleotide pairs.
549
550
551 **State frequencies**
552
553 Specifies how the equilibrium state frequencies (A, C, G and T) are treated.
554 The empirical setting fixes the frequencies at their observed proportions,
555 and the other options should be self-explanatory.
556
557
558 **Datatype - nucleotide or amino-acid**
559
560
561 **Treatment of proportion of invariable sites parameter**
562
563 Specifies whether a parameter representing the proportion of sites that are
564 unable to change (i.e. have a substitution rate of zero) will be included.
565 This is typically referred to as 'invariant sites', but would better be
566 termed 'invariable sites'.
567
568
569 **Rate heterogeneity type**
570
571 (none, gamma, gammafixed) – The model of rate heterogeneity assumed.
572 “gammafixed” requires that the alpha shape parameter is provided, and a
573 setting of “gamma” estimates it.
574
575
576 **Number of discrete dN/dS categories**
577
578 The number of categories of variable rates (not including the invariant site
579 class if it is being used). Must be set to 1 if ratehetmodel is set to none.
580 Note that runtimes and memory usage scale linearly with this setting.
581
582
583 **Datatype - amino-acid or codon-aminoacid**
584
585 **Rate matrix**
586
587 (poisson, jones, dayhoff, wag, mtmam, mtrev) – The fixed amino acid rate
588 matrix to use. You should use the matrix that gives the best likelihood, and
589 could use a program like PROTTEST (very much like MODELTEST, but for amino
590 acid models) to determine which fits best for your data. Poisson assumes a
591 single rate of substitution between all amino acid pairs, and is a very poor
592 model.
593
594
595 **Equilibrium Base Frequencies**
596
597 (equal, empirical, estimate, fixed, jones, dayhoff, wag, mtmam, mtrev) –
598 Specifies how the equilibrium state frequencies of the 20 amino acids are
599 treated. The “empirical” option fixes the frequencies at their observed
600 proportions (when describing a model this is often termed '+F').
601
602
603 **Number of discrete dN/dS categories**
604
605 The number of categories of variable rates (not including the invariant site
606 class if it is being used). Must be set to 1 if ratehetmodel is set to none.
607 Note that runtimes and memory usage scale linearly with this setting.
608
609
610 **Treatment of proportion of invariable sites parameter**
611
612 Specifies whether a parameter representing the proportion of sites that are
613 unable to change (i.e. have a substitution rate of zero) will be included.
614 This is typically referred to as 'invariant sites', but would better be
615 termed 'invariable sites'.
616
617
618 **Datatype - codon**
619
620
621 **Rate matrix**
622
623 (1rate, 2rate, 6rate, fixed, custom string) – This determines the relative
624 rates of nucleotide substitution assumed by the codon model. The options are
625 exactly the same as those allowed under a normal nucleotide model. A codon
626 model with ratematrix = 2rate specifies the standard Goldman and Yang (1994)
627 model, with different substitution rates for transitions and transversions.
628
629
630 **State frequencies**
631
632 The options are to use equal codon frequencies (not a good option), the
633 frequencies observed in your dataset (termed “empirical” in GARLI), or the
634 codon frequencies implied by the “F1x4” or “F3x4” methods (using PAML
635 terminology). These last two options calculate the codon frequencies as the
636 product of the frequencies of the three nucleotides that make up each codon.
637 In the “F1x4” case the nucleotide frequencies are those observed in the
638 dataset across all codon positions, while the “F3x4” option uses the
639 nucleotide frequencies observed in the data at each codon position
640 separately.
641
642
643 **Rate Heterogeneity Type**
644
645 For codon models, the default is to infer a single dN/dS parameter.
646 Alternatively, a model can be specified that infers a given number of dN/dS
647 categories, with the dN/dS values and proportions falling in each category
648 estimated (ratehetmodel = nonsynonymous). This is the 'discrete' or 'M3'
649 model of Yang et al., 2000.
650
651
652 **Number of discrete dN/dS categories**
653
654 When ratehetmodel = nonsynonymous, this is the number of dN/dS parameter
655 categories.
656
657
658 **Datatype - codon or codon-aminoacid**
659
660
661 **Genetic code**
662
663 The genetic code to be used in translating codons into amino acids.
664
665
666 **Population Settings**
667
668
669 **Number of individuals in population**
670
671 The number of individuals in the population. This may be increased, but
672 doing so is generally not beneficial. Note that typical genetic algorithms
673 tend to have much, much larger population sizes than GARLI defaults.
674
675
676 **Unmutated copies of best individual**
677
678 The number of times the best individual is copied to the next generation
679 with no chance of mutation. It is best not to mess with this setting.
680
681
682 **Strength of selection**
683
684 Controls the strength of selection, with larger numbers denoting stronger
685 selection. The relative probability of reproduction of two individuals
686 depends on the difference in their log likelihoods (ΔlnL) and is formulated
687 very similarly to the procedure of calculating Akaike weights.
688
689
690 **Fitness handicap for the best individual**
691
692 This can be used to bias the probability of reproduction of the best
693 individual downward. Because the best individual is automatically copied
694 into the next generation, it has a bit of an unfair advantage and can cause
695 all population variation to be lost due to genetic drift, especially with
696 small population sizes. The value specified here is subtracted from the
697 best individual’s lnL score before calculating the probabilities of
698 reproduction. It seems plausible that this might help maintain variation,
699 but I have not seen it cause a measurable effect.
700
701
702 **Maximum number of generations to run**
703
704 Use if automatic termination is desired to prevent a runaway process.
705
706
707 **Maximum time to run**
708
709 The maximum number of seconds for the run to continue. Use if automatic
710 termination is desired to prevent a runaway process.
711
712
713 **Branch-length optimization settings**
714
715
716 **Minimal optimization precision**
717
718 The minimum allowed value of the optimization precision - must not be larger
719 than the Starting optimization precision.
720
721
722 **Number of steps down from Start Precision to Minimum Precision**
723
724 Specify the number of steps that it will take for the optimization precision
725 to decrease (linearly) from startoptprec to minoptprec.
726
727
728 **Tree rejection threshold**
729
730 This setting controls which trees have more extensive branch-length
731 optimization applied to them. All trees created by a branch swap receive
732 optimization on a few branches that directly took part in the rearrangement.
733 If the difference in score between the partially optimized tree and the best
734 known tree is greater than treerejectionthreshold, no further optimization
735 is applied to the branches of that tree. Reducing this value can
736 significantly reduce runtimes, often with little or no effect on results.
737 However, it is possible that a better tree could be missed if this is set
738 too low. In cases in which obtaining the very best tree per search is not
739 critical (e.g., bootstrapping), setting this lower (~20) is probably safe.
740
741
742 **Settings controlling the proportions of the mutation types**
743
744
745 **Weight on topology mutations**
746
747 The prior weight assigned to the class of topology mutations (NNI, SPR and
748 limSPR). Note that setting this to 0.0 turns off topology mutations, meaning
749 that the tree topology is fixed for the run. This used to be a way to have
750 the program estimate only model parameters and branch-lengths, but the
751 optimizeinputonly setting is now a better way to go.
752
753
754 **Weight on model parameter mutations**
755
756 The prior weight assigned to the class of model mutations. Note that setting
757 this at 0.0 fixes the model during the run.
758
759
760 **Weight on branch-length parameter mutations**
761
762 The prior weight assigned to branch-length mutations. The same procedure
763 used above to determine the proportion of Topology:Model:Branch-Length
764 mutations is also used to determine the relative proportions of the three
765 types of topological mutations (NNI:SPR:limSPR), controlled by the following
766 three weights. Note that the proportion of mutations applied to each of the
767 model parameters is not user controlled.
768
769
770 **Weight on NNI topology changes**
771
772 The prior weight assigned to NNI mutations
773
774
775 **Weight on SPR topology changes**
776
777 The prior weight assigned to random SPR mutations. For very large datasets
778 it is often best to set this to 0.0, as random SPR mutations essentially
779 never result in score increases.
780
781
782 **Weight on localized SPR topology changes**
783
784 The prior weight assigned to SPR mutations with the reconnection branch
785 limited to being a maximum of limsprrange branches away from where the
786 branch was detached.
787
788
789 **Interval Length**
790
791 The number of generations in each interval during which the number and
792 benefit of each mutation type are stored.
793
794
795 **Number of intervals to store**
796
797 The number of intervals to be stored. Thus, records of mutations are kept
798 for the last (intervallength x intervalstostore) generations. Every
799 intervallength generations the probabilities of the mutation types are
800 updated by the scheme described above.
801
802
803 **Settings controlling mutation details**
804
805
806 **Max range for localized SPR topology changes**
807
808 The maximum number of branches away from its original location that a branch
809 may be reattached during a limited SPR move. Setting this too high (&gt; 10)
810 can seriously degrade performance, but if you do so in conjunction with a
811 large increase in genthreshfortopoterm you might end up with better results.
812
813
814 **Mean number of branch lengths changed per branch-length mutation**
815
816 The mean of the binomial distribution from which the number of branch
817 lengths mutated is drawn during a branch length mutation.
818
819
820 **Magnitude of branch-length mutations**
821
822 The shape parameter of the gamma distribution (with a mean of 1.0) from
823 which the branch-length multipliers are drawn for branch-length mutations.
824 Larger numbers cause smaller changes in branch lengths. (Note that this has
825 nothing to do with gamma rate heterogeneity.)
826
827
828 **Magnitude of model parameter mutations**
829
830 The shape parameter of the gamma distribution (with a mean of 1.0) from
831 which the model mutation multipliers are drawn for model parameters
832 mutations. Larger numbers cause smaller changes in model parameters. (Note
833 that this has nothing to do with gamma rate heterogeneity.)
834
835
836 **Relative weight assigned to already attempted branch swaps**
837
838 With version 0.95 and later, GARLI keeps track of which branch swaps it has
839 attempted on the current best tree. Because swaps are applied randomly, it
840 is possible that some swaps are tried twice before others are tried at all.
841 This option allows the program to bias the swaps applied toward those that
842 have not yet been attempted. Each swap is assigned a relative weight
843 depending on the number of times that it has been attempted on the current
844 best tree. This weight is equal to (uniqueswapbias) raised to the (# times
845 swap attempted) power. In other words, a value of 0.5 means that swaps that
846 have already been tried once will be half as likely as those not yet
847 attempted, swaps attempted twice will be ¼ as likely, etc. A value of 1.0
848 means no biasing. Use of this option may allow the use of somewhat larger
849 values of limsprrange.
850
851
852 **Relative weight assigned to branch swaps based on locality**
853
854 This option is similar to uniqueswapbias, except that it biases toward
855 certain swaps based on the topological distance between the initial and
856 rearranged trees. The distance is measured as in the limsprrange, and is
857 half the Robinson-Foulds distance between the trees. As with
858 uniqueswapbias, distanceswapbias assigns a relative weight to each potential
859 swap. In this case the weight is (distanceswapbias) raised to the
860 (reconnection distance - 1) power. Thus, given a value of 0.5, the weight of
861 an NNI is 1.0, the weight of an SPR with distance 2 is 0.5, with distance 3
862 is 0.25, etc. Note that values less than 1.0 bias toward more localized
863 swaps, while values greater than 1.0 bias toward more extreme swaps. Also
864 note that this bias is only applied to limSPR rearrangements. Be careful in
865 setting this, as extreme values can have a very large effect.
866 417
867 </help> 418 </help>
868 </tool> 419 </tool>