Mercurial > repos > rnateam > mafft
view mafft.xml @ 16:8e649f27aa0d draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/mafft commit 3d98df472498e1273369d23822d10db14f337443
author | bgruening |
---|---|
date | Thu, 22 Aug 2024 19:20:24 +0000 |
parents | bf28a8cff401 |
children |
line wrap: on
line source
<?xml version="1.0" encoding="UTF-8"?>
<!--
    Galaxy tool wrapper around the MAFFT multiple sequence alignment program.

    Sections of this file:
      * command     : Cheetah template that assembles the MAFFT command line
      * configfiles : helper script that concatenates all inputs into input.fa
      * inputs      : input mapping, sequence type / scoring, MAFFT flavour, output options
      * outputs     : the alignment plus an optional guide tree
      * tests       : functional tests
      * help        : user documentation (reStructuredText)

    The @...@ tokens and the macros expanded below are expected to be provided
    by the imported macros.xml, which is not part of this file.
-->
<tool id="rbc_mafft" name="MAFFT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Multiple alignment program for amino acid or nucleotide sequences</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements" />
    <version_command>mafft --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## Concatenate all input datasets no matter how they were provided
        bash inputs.sh &&

        ## Count total number of sequences across input datasets
        ## Can't do this on the concatenated input data prepared above because it's
        ## just a regular file and we don't have Galaxy-generated metadata for it.
        #set sequence_count = 0
        #if $input.mapping == "implicit"
            #for $batch in $input.batches:
                #set sequence_count += int($batch.inputs.metadata.sequences)
            #end for
        #elif $input.mapping == "merge"
            #for $batch in $input.batches:
                #for $dataset in $batch.inputs:
                    #set sequence_count += int($dataset.metadata.sequences)
                #end for
            #end for
        #end if

        ## For those cases in which MAFFT needs fasta3, set an env variable to make it
        ## find the executable. Necessary because the current version of MAFFT still
        ## expects a fasta34 executable in path, but we bundle a newer version.
        #if $flavour.type == "custom"
            #if $flavour.guidetree.guidetree_generation == "original"
                #if $flavour.guidetree.dist_flavour.distance_method == "--fastapair"
                    export FASTA_4_MAFFT=`which @FASTA3_EXEC@` &&
                #end if
            #elif $flavour.guidetree.guidetree_generation == "parttree"
                #if $flavour.guidetree.parttree_selection.parttree_option == "--fastaparttree"
                    export FASTA_4_MAFFT=`which @FASTA3_EXEC@` &&
                #end if
            #end if
        #end if

        ## groupsize warning
        #if $flavour.type == "custom"
            #if $flavour.guidetree.guidetree_generation == "parttree"
                #if $flavour.guidetree.parttree_selection.groupsize > $sequence_count
                    echo "WARNING = Chosen groupsize number larger than number of input sequences. Not recommended for MAFFT." &&
                #end if
            #end if
        #end if

        ## run MAFFT with predefined MSA flavours or custom settings
        #if $flavour.type == "custom"
            mafft
            #if $flavour.guidetree.guidetree_generation == "original"
                ## Every branch of this chain must reference the parameter as $flavour;
                ## a bare name would be evaluated as a plain Python variable and fail.
                #if $flavour.guidetree.dist_flavour.distance_method == "--6merpair"
                    --6merpair
                    --retree $flavour.guidetree.dist_flavour.retree
                #elif $flavour.guidetree.dist_flavour.distance_method == "--globalpair"
                    --globalpair
                    --weighti $flavour.guidetree.dist_flavour.weighti
                    #if $flavour.guidetree.dist_flavour.treat_unrelated_segments.unalignlevel > 0
                        --allowshift
                        --unalignlevel $flavour.guidetree.dist_flavour.treat_unrelated_segments.unalignlevel
                    #end if
                    $flavour.guidetree.dist_flavour.treat_unrelated_segments.leavegappyregion
                #elif $flavour.guidetree.dist_flavour.distance_method == "--localpair"
                    --localpair
                    --weighti $flavour.guidetree.dist_flavour.weighti
                    --lop $flavour.guidetree.dist_flavour.lop
                    --lep $flavour.guidetree.dist_flavour.lep
                    --lexp $flavour.guidetree.dist_flavour.lexp
                #elif $flavour.guidetree.dist_flavour.distance_method == "--genafpair"
                    --genafpair
                    --weighti $flavour.guidetree.dist_flavour.weighti
                    --lop $flavour.guidetree.dist_flavour.lop
                    --lep $flavour.guidetree.dist_flavour.lep
                    --lexp $flavour.guidetree.dist_flavour.lexp
                    --LOP $flavour.guidetree.dist_flavour.LOP
                    --LEXP $flavour.guidetree.dist_flavour.LEXP
                #elif $flavour.guidetree.dist_flavour.distance_method == "--fastapair"
                    --fastapair
                    --weighti $flavour.guidetree.dist_flavour.weighti
                #end if
            #elif $flavour.guidetree.guidetree_generation == "parttree"
                $flavour.guidetree.parttree_selection.parttree_option
                --retree $flavour.guidetree.parttree_selection.retree
                --partsize $flavour.guidetree.parttree_selection.partsize
                #if $flavour.guidetree.parttree_selection.groupsize != -1
                    --groupsize $flavour.guidetree.parttree_selection.groupsize
                #end if
            #end if
            ## progressive alignment calculation
            --maxiterate $flavour.progressive_alignment_calculation.maxiterate
            $flavour.progressive_alignment_calculation.fft
            $flavour.progressive_alignment_calculation.noscore
        #else
            ## For predefined flavours the select value itself is the executable
            ## (plus any fixed options) to run.
            $flavour.type
            #if $flavour.type == "mafft-ginsi" or "--globalpair" in str($flavour.type)
                #if $flavour.treat_unrelated_segments.unalignlevel > 0
                    --allowshift
                    --unalignlevel $flavour.treat_unrelated_segments.unalignlevel
                #end if
                $flavour.treat_unrelated_segments.leavegappyregion
            #end if
        #end if

        ## handle scoring matrix
        $datatype_selection.datatype
        #if $datatype_selection.datatype != ""
            #if $datatype_selection.scoring_matrix.type == "custom"
                --aamatrix '$datatype_selection.scoring_matrix.aamatrix'
            #else
                $datatype_selection.scoring_matrix.type $datatype_selection.scoring_matrix.coefficient
            #end if
            $datatype_selection.fmodel
            ## gap penalties
            #if $datatype_selection.gap_costs.use_defaults == "no"
                --ep $datatype_selection.gap_costs.ep
                --op $datatype_selection.gap_costs.op
            #end if
        #end if

        ## output options
        $reorder
        $outputFormat
        $treeout
        ## specify threads to use
        ## disable multithreading during iterative refinement step for reproducibility
        ## cmp. https://mafft.cbrc.jp/alignment/software/multithreading.html
        --thread \${GALAXY_SLOTS:-1}
        --threadit 0
        input.fa > '$outputAlignment'
        ## Output alignment tree
        #if $treeout
            && mv input.fa.tree '$outputTree'
        #end if
    ]]></command>
    <configfiles>
        <!-- Script run first by the command: appends every selected dataset to input.fa.
             Dataset paths are quoted so that unusual characters in a path cannot split the argument. -->
        <configfile filename="inputs.sh"><![CDATA[
#if $input.mapping == "implicit"
    #for $batch in $input.batches:
cat '$batch.inputs' >> input.fa
    #end for
#elif $input.mapping == "merge"
    #for $batch in $input.batches:
        #for $dataset in $batch.inputs:
cat '$dataset' >> input.fa
        #end for
    #end for
#end if
        ]]></configfile>
    </configfiles>
    <inputs>
        <!-- How multiple input datasets are mapped onto MAFFT runs -->
        <conditional name="input">
            <param name="mapping" type="select" label="For multiple inputs generate" help="All you have is a single dataset with the sequences to align? You can skip this help text and continue with the default setting. For multiple input datasets, the first mode will launch separate MAFFT jobs for all sequences from the first, second, ..., n-th dataset/element from each input batch, respectively, resulting in n separate MSAs. The second mode will concatenate all input sequences from all inputs for a single run of MAFFT and will generate a single MSA.">
                <option value="implicit">one or several MSAs depending on input structure</option>
                <option value="merge">a single MSA of all sequences from all inputs</option>
            </param>
            <when value="implicit">
                <repeat name="batches" title="Input batch" default="1" min="1">
                    <param name="inputs" type="data" format="fasta" label="Sequences to align" help="Amino acid or nucleotide sequences in FASTA format. Add Dataset for concatenation of every additional dataset with each file of the first upload panel"/>
                </repeat>
            </when>
            <when value="merge">
                <repeat name="batches" title="Input batch" default="1" min="1">
                    <param name="inputs" type="data" format="fasta" label="Sequences to align" help="Amino acid or nucleotide sequences in FASTA format." multiple="true"/>
                </repeat>
            </when>
        </conditional>
        <!-- Sequence type and, for an explicitly declared type, the scoring matrix -->
        <conditional name="datatype_selection">
            <param name="datatype" type="select" label="Type of sequences" help="The tool can try to detect the type of the input sequences, but you likely want to declare it explicitly. Doing so will also give you control over the scoring matrix used for the alignment, while autodetection will result in the Kimura PAM200 and the BLOSUM62 matrix being used for nucleic acids and protein alignments, respectively.">
                <option value="">auto-detect</option>
                <option value="--nuc">Nucleic acids</option>
                <option value="--amino">Amino acids</option>
            </param>
            <when value="" />
            <when value="--nuc">
                <conditional name="scoring_matrix">
                    <param name="type" type="select" label="Type of scoring matrix" help="See the tool help below for details about the available options.">
                        <option value="--kimura">Kimura</option>
                    </param>
                    <when value="--kimura">
                        <param name="coefficient" argument="--kimura" type="integer" min="1" value="200" label="PAM value of the matrix" />
                    </when>
                </conditional>
                <expand macro="misc_scoring_scheme" />
            </when>
            <when value="--amino">
                <conditional name="scoring_matrix">
                    <param name="type" type="select" label="Type of scoring matrix" help="See the tool help below for details about the available options.">
                        <option value="--bl" selected="true">BLOSUM</option>
                        <option value="--jtt">JTT</option>
                        <option value="--tm">transmembrane protein-optimized JTT</option>
                        <option value="custom">custom matrix</option>
                    </param>
                    <when value="--bl">
                        <param name="coefficient" argument="--bl" type="select" label="Coefficient of the BLOSUM matrix" display="radio">
                            <option value="30">30</option>
                            <option value="45">45</option>
                            <option value="62" selected="true">62</option>
                            <option value="80">80</option>
                        </param>
                    </when>
                    <when value="--jtt">
                        <param name="coefficient" argument="--jtt" type="integer" min="1" value="200" label="PAM value of the matrix" />
                    </when>
                    <when value="--tm">
                        <param name="coefficient" argument="--tm" type="integer" min="1" value="200" label="PAM value of the matrix"/>
                    </when>
                    <when value="custom">
                        <param argument="--aamatrix" type="data" format="txt" label="User-defined AA scoring matrix" help="The expected format of the matrix is the same as that used by BLAST."/>
                    </when>
                </conditional>
                <expand macro="misc_scoring_scheme" />
            </when>
        </conditional>
        <!-- Alignment strategy. For every non-custom option the value is inserted verbatim
             into the command line, so option values must remain valid shell words. -->
        <conditional name="flavour">
            <param name="type" type="select" label="MAFFT flavour" help="Run mafft with pre-defined input parameters. Specification of these parameters can be found in the help section. With 'Auto', the tool automatically selects an appropriate strategy from L-INS-i, FFT-NS-i and FFT-NS-2, according to data size from few to many respectively. Default setting: FFT-NS-2.">
                <option value="mafft --auto">Auto</option>
                <option value="mafft-fftns --retree 1">FFT-NS-1 (very fast, progressive method; use for >2,000 sequences)</option>
                <option value="mafft-fftns" selected="true">FFT-NS-2 (fast, progressive method)</option>
                <option value="mafft-nwns">NW-NS-2 (fast, progressive method without FFT approximation)</option>
                <option value="mafft --retree 1 --maxiterate 0 --nofft --parttree">NW-NS-PartTree-1 (very fast, progressive method using the PartTree algorithm; for ~10,000 to ~50,000 sequences)</option>
                <option value="mafft --maxiterate 0 --globalpair">G-INS-1 (slow, progressive method with an accurate guide tree)</option>
                <option value="mafft-fftnsi">FFT-NS-i (slow, iterative refinement method)</option>
                <option value="mafft-nwnsi">NW-NS-i (slow, iterative refinement method without FFT approximation)</option>
                <option value="mafft-einsi">E-INS-i (very slow; use for &lt;200 sequences with multiple conserved domains and long gaps)</option>
                <option value="mafft-linsi">L-INS-i (very slow; use for &lt;200 sequences with one conserved domain and long gaps)</option>
                <option value="mafft-ginsi">G-INS-i (very slow; recommended for &lt;200 sequences with global homology)</option>
                <option value="custom">Custom Parameters</option>
            </param>
            <when value="mafft --auto"/>
            <when value="mafft-fftns --retree 1"/>
            <when value="mafft-fftns"/>
            <when value="mafft-nwns"/>
            <when value="mafft --retree 1 --maxiterate 0 --nofft --parttree"/>
            <when value="mafft --maxiterate 0 --globalpair">
                <expand macro="global_align_options"/>
            </when>
            <when value="mafft-fftnsi"/>
            <when value="mafft-nwnsi"/>
            <when value="mafft-einsi"/>
            <when value="mafft-linsi"/>
            <when value="mafft-ginsi">
                <expand macro="global_align_options"/>
            </when>
            <when value="custom">
                <conditional name="guidetree">
                    <param name="guidetree_generation" type="select" label="GuideTree-Generation" help="Parttree is recommended for a large number (> ~10,000) of sequences as input">
                        <option value="original">Original guidetree building method of MAFFT</option>
                        <option value="parttree">Fast guidetree building method with PartTree-algorithm</option>
                    </param>
                    <when value="original">
                        <conditional name="dist_flavour">
                            <param name="distance_method" type="select" label="Distance method" help="Distance method must be chosen regarding your data">
                                <option value="--6merpair" selected="true">Shared 6mers distance (fastest) (--6merpair)</option>
                                <option value="--globalpair">Global alignment (Needleman-Wunsch) (--globalpair)</option>
                                <option value="--localpair">Local alignment (Smith-Waterman) (--localpair)</option>
                                <option value="--genafpair">Local, affine gap cost (--genafpair)</option>
                                <option value="--fastapair">All pairwise alignments are computed with FASTA (--fastapair)</option>
                            </param>
                            <when value="--6merpair">
                                <param argument="--retree" type="integer" min="1" max="3" value="2" label="Guide tree is build this number of times in the progressive stage."/>
                            </when>
                            <when value="--globalpair">
                                <expand macro="global_align_options"/>
                                <expand macro="weighti_param" />
                            </when>
                            <when value="--localpair">
                                <expand macro="weighti_param" />
                                <param argument="--lop" type="float" value="-2.0" label="Gap opening penalty at local pairwise alignment" help="-2.00 default value"/>
                                <param argument="--lep" type="float" value="0.1" label="Offset value at local pairwise alignment" help="0.1 default value"/>
                                <param argument="--lexp" type="float" value="-0.1" label="Gap extension penalty at local pairwise alignment." help="-0.1 default value" />
                            </when>
                            <when value="--genafpair">
                                <expand macro="weighti_param" />
                                <param argument="--lop" type="float" value="-2.0" label="Gap opening penalty at local pairwise alignment" help="-2.00 default value" />
                                <param argument="--lep" type="float" value="0.1" label="Offset value at local pairwise alignment" help="0.1 default value" />
                                <param argument="--lexp" type="float" value="-0.1" label="Gap extension penalty at local pairwise alignment." help="-0.1 default value" />
                                <param argument="--LOP" type="float" value="-6.00" label="Gap opening penalty to skip the alignment" help="-6.00 default value" />
                                <param argument="--LEXP" type="float" value="0.00" label="Gap extension penalty to skip the alignment" help="0 default value" />
                            </when>
                            <when value="--fastapair">
                                <expand macro="weighti_param" />
                            </when>
                        </conditional>
                    </when>
                    <when value="parttree">
                        <conditional name="parttree_selection">
                            <param name="parttree_option" type="select" label="Which distance for the fast tree-building method?">
                                <option value="--parttree" selected="true">Fast tree-building method with the 6mer distance (--parttree)</option>
                                <option value="--fastaparttree">Distances based on FASTA (--fastaparttree)</option>
                                <option value="--dpparttree">Distances based on DP. (Needleman-Wunsch) (--dpparttree)</option>
                            </param>
                            <when value="--parttree">
                                <expand macro="parttree_parameters" />
                            </when>
                            <when value="--fastaparttree">
                                <expand macro="parttree_parameters" />
                            </when>
                            <when value="--dpparttree">
                                <expand macro="parttree_parameters" />
                            </when>
                        </conditional>
                    </when>
                </conditional>
                <section name="progressive_alignment_calculation" title="Progressive alignment calculation" expanded="true">
                    <param argument="--maxiterate" type="integer" min="0" max="1000" value="0" label="Maximum number of iterations" help="1000 for maximum quality" />
                    <param argument="--fft" type="boolean" truevalue="--fft" falsevalue="--nofft" checked="True" label="Use FFT approximation in group-to-group alignment?" />
                    <!-- Inverted boolean: checking the box emits nothing, unchecking it emits the noscore flag -->
                    <param argument="--noscore" type="boolean" truevalue="" falsevalue="--noscore" checked="True" label="Check alignment score in the iterative refinement stage?" />
                </section>
            </when>
        </conditional>
        <param argument="--reorder" type="boolean" truevalue="--reorder" falsevalue="" checked="False" label="Reorder output?" help="Default order is input order." />
        <param argument="--treeout" type="boolean" truevalue="--treeout" falsevalue="" checked="False" label="Keep alignment tree as output?" />
        <param name="outputFormat" type="select" label="Output format">
            <option value="" selected="true">FASTA</option>
            <option value="--clustalout">ClustalW</option>
            <option value="--phylipout">Phylip</option>
        </param>
    </inputs>
    <outputs>
        <!-- The datatype of the alignment follows the selected output format -->
        <data format="fasta" name="outputAlignment" label="${tool.name} on ${on_string}">
            <change_format>
                <when input="outputFormat" value="--clustalout" format="clustal"/>
                <when input="outputFormat" value="--phylipout" format="phylip"/>
            </change_format>
        </data>
        <!-- Only produced when the treeout option is enabled -->
        <data name="outputTree" format="txt" label="${tool.name} Guide Tree">
            <filter>treeout</filter>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <output name="outputAlignment" ftype="fasta" file="mafft_default.aln"/>
        </test>
        <!-- test autodetection of suitable algorithm from input; expected to choose L-INS-i -->
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="mafft --auto"/>
            </conditional>
            <output name="outputAlignment" ftype="fasta" file="mafft_auto_linsi.aln"/>
        </test>
        <!-- test explicit specification of L-INS-i mode -->
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="mafft-linsi"/>
            </conditional>
            <output name="outputAlignment" ftype="fasta" file="mafft_auto_linsi.aln"/>
        </test>
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <conditional name="datatype_selection">
                <param name="datatype" value="--amino"/>
                <conditional name="scoring_matrix">
                    <param name="type" value="--bl"/>
                    <param name="coefficient" value="80"/>
                </conditional>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="mafft-fftns"/>
            </conditional>
            <param name="outputFormat" value="--clustalout"/>
            <output name="outputAlignment" ftype="clustal" file="mafft_explicit_amino_blosum80.clustal.aln" lines_diff="2" />
        </test>
        <test expect_num_outputs="1" >
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_nuc.fa"/>
                </repeat>
            </conditional>
            <conditional name="datatype_selection">
                <param name="datatype" value="--nuc"/>
                <conditional name="scoring_matrix">
                    <param name="type" value="--kimura"/>
                    <param name="coefficient" value="40"/>
                </conditional>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="mafft-fftns"/>
            </conditional>
            <param name="outputFormat" value="--phylipout"/>
            <output name="outputAlignment" ftype="phylip" file="mafft_kimura40.phylip.aln" />
        </test>
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <conditional name="datatype_selection">
                <param name="datatype" value="--amino"/>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="custom"/>
                <conditional name="guidetree">
                    <param name="guidetree_generation" value="original"/>
                    <conditional name="dist_flavour">
                        <param name="distance_method" value="--globalpair"/>
                        <param name="weighti" value="3"/>
                    </conditional>
                </conditional>
                <section name="progressive_alignment_calculation">
                    <param name="maxiterate" value="1000"/>
                </section>
            </conditional>
            <param name="outputFormat" value="--clustalout"/>
            <output name="outputAlignment" ftype="clustal" file="mafft_custom_original.clustal.aln" lines_diff="2">
            </output>
        </test>
        <test expect_num_outputs="1">
            <conditional name="input">
                <param name="mapping" value="implicit"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
            </conditional>
            <conditional name="datatype_selection">
                <param name="datatype" value="--amino"/>
            </conditional>
            <conditional name="flavour">
                <param name="type" value="custom"/>
                <conditional name="guidetree">
                    <param name="guidetree_generation" value="parttree"/>
                    <conditional name="parttree_selection">
                        <param name="parttree_option" value="--parttree"/>
                        <param name="retree" value="2"/>
                    </conditional>
                </conditional>
            </conditional>
            <output name="outputAlignment" ftype="fasta" file="mafft_custom_parttree.aln" />
        </test>
        <!-- test concatenation of multiple inputs -->
        <test expect_num_outputs="2">
            <conditional name="input">
                <param name="mapping" value="merge"/>
                <repeat name="batches">
                    <param name="inputs" value="sample_amino.fa"/>
                </repeat>
                <repeat name="batches">
                    <param name="inputs" value="sample_nuc.fa"/>
                </repeat>
            </conditional>
            <param name="treeout" value="true"/>
            <output name="outputAlignment" ftype="fasta">
                <metadata name="sequences" value="39"/>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

MAFFT is a multiple sequence alignment (MSA) program, which offers a range of multiple alignment methods.

Input types and alignment scoring matrices
------------------------------------------

For the alignment of *protein* sequences, you can choose between:

- different flavors of BLOSUM matrices (`Henikoff S and Henikoff JG, 1992 <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC50453/>`__)
- JTT matrices with any point accepted mutation (PAM) rate (`Jones, Taylor and Thornton, 1992 <https://pubmed.ncbi.nlm.nih.gov/1633570/>`__)
- PAM-based matrices optimized for transmembrane proteins (`Jones, Taylor and Thornton, 1994 <https://pubmed.ncbi.nlm.nih.gov/8112466/>`__)

For nucleic acid sequence alignment, MAFFT uses Kimura's two parameter model (`Kimura 1980 <https://pubmed.ncbi.nlm.nih.gov/7463489/>`__) with a transitions to transversions ratio of 2 (kappa 2), but lets you configure the PAM value.

The tool can also try to autodetect the sequence type from the input(s). In this mode, it will use the BLOSUM 62 matrix if it detects amino acids input, and the Kimura kappa 2 PAM200 matrix for nucleic acids.

Pre-configured MSA methods
--------------------------

From the `MAFFT man page <https://mafft.cbrc.jp/alignment/software/manual/manual.html>`__, an overview of the different predefined flavours of the tool.

**Accuracy-oriented methods:**

- *L-INS-i* (probably most accurate; recommended for <200 sequences; iterative refinement method incorporating local pairwise alignment information):

  - mafft --localpair --maxiterate 1000 input [> output]

- *G-INS-i* (suitable for sequences of similar lengths; recommended for <200 sequences; iterative refinement method incorporating global pairwise alignment information):

  - mafft --globalpair --maxiterate 1000 input [> output]

- *E-INS-i* (suitable for sequences containing large unalignable regions; recommended for <200 sequences):

  - mafft --ep 0 --genafpair --maxiterate 1000 input [> output]. For E-INS-i, the --ep 0 option is recommended to allow large gaps.

**Speed-oriented methods:**

- *FFT-NS-i* (iterative refinement method; two cycles only):

  - mafft --retree 2 --maxiterate 2 input [> output]

- *FFT-NS-2* (fast; progressive method):

  - mafft --retree 2 --maxiterate 0 input [> output]

- *NW-NS-i* (iterative refinement method without FFT approximation; two cycles only):

  - mafft --retree 2 --maxiterate 2 --nofft input [> output]

- *NW-NS-2* (fast; progressive method without the FFT approximation):

  - mafft --retree 2 --maxiterate 0 --nofft input [> output]

- *NW-NS-PartTree-1* (recommended for ~10,000 to ~50,000 sequences; progressive method with the PartTree algorithm):

  - mafft --retree 1 --maxiterate 0 --nofft --parttree input [> output]

- *FFT-NS-1* (very fast; recommended for >2000 sequences; progressive method with a rough guide tree):

  - mafft --retree 1 --maxiterate 0 input [> output]
    ]]></help>
    <expand macro="citations" />
</tool>