Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering
comparison mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7
| author | iuc |
|---|---|
| date | Thu, 27 Mar 2025 14:37:56 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9f6869226de1 |
|---|---|
| 1 <tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | |
| 2 <description> | |
| 3 of very large datasets | |
| 4 </description> | |
| 5 <macros> | |
| 6 <import>macro.xml</import> | |
| 7 </macros> | |
| 8 <expand macro="biotools"/> | |
| 9 <expand macro="requirements"/> | |
| 10 <expand macro="version_command"/> | |
| 11 <command detect_errors="exit_code"><![CDATA[ | |
| 12 mmseqs easy-linclust | |
| 13 '$input_fasta' | |
| 14 'result' | |
| 15 'tmp' | |
| 16 ##Alphabet-specific options: these parameters sit directly under the alph_type conditional | |
| 17 #if str($alph_type.dbtype) == "1" | |
| 18 --comp-bias-corr-scale $alph_type.comp_bias_corr_scale | |
| 19 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale | |
| 20 #elif str($alph_type.dbtype) == "2" | |
| 21 --zdrop $alph_type.zdrop | |
| 22 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale | |
| 23 --adjust-kmer-len $alph_type.adjust_kmer_len | |
| 24 #end if | |
| 25 ##Pre-filter options | |
| 26 --add-self-matches $prefilter.add_self_matches | |
| 27 -k $prefilter.kmer_length | |
| 28 ##--split-memory-limit BYTE Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0] | |
| 29 --mask $prefilter.mask | |
| 30 --mask-prob $prefilter.mask_prob | |
| 31 --mask-lower-case $prefilter.mask_lower_case | |
| 32 --spaced-kmer-mode $prefilter.spaced_kmer_mode | |
| 33 ##--spaced-kmer-pattern STR User-specified spaced k-mer pattern [] | |
| 34 ##--disk-space-limit BYTE Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0] | |
| 35 | |
| 36 ##Align options | |
| 37 -a $align.convertalis | |
| 38 ##The next 2 parameters seem to be the same | |
| 39 --alignment-mode $align.alignment_mode | |
| 40 --alignment-output-mode $align.alignment_output_mode | |
| 41 --wrapped-scoring $align.wrapped_scoring | |
| 42 -e $align.evalue | |
| 43 --min-seq-id $min_seq_id | |
| 44 --min-aln-len $align.min_aln_len | |
| 45 --seq-id-mode $align.seq_id_mode | |
| 46 --alt-ali $align.alt_ali | |
| 47 -c $cov | |
| 48 --cov-mode $cov_mode | |
| 49 --max-rejected $align.max_rejected | |
| 50 --max-accept $align.max_accept | |
| 51 --score-bias $align.score_bias | |
| 52 --realign $align.realign | |
| 53 --realign-score-bias $align.realign_score_bias | |
| 54 --realign-max-seqs $align.realign_max_seqs | |
| 55 --corr-score-weight $align.corr_score_weight | |
| 56 | |
| 57 ##Clustering options | |
| 58 --cluster-mode $cluster.cluster_mode | |
| 59 --max-iterations $cluster.max_iterations | |
| 60 --similarity-type $cluster.similarity_type | |
| 61 | |
| 62 ##kmermatcher options | |
| 63 ##--weights STR Weights used for cluster prioritization [] | |
| 64 --cluster-weight-threshold $kmermatcher.cluster_weight_threshold | |
| 65 --kmer-per-seq $kmermatcher.kmer_per_seq | |
| 66 --hash-shift $kmermatcher.hash_shift | |
| 67 --include-only-extendable $kmermatcher.include_only_extendable | |
| 68 --ignore-multi-kmer $kmermatcher.ignore_multi_kmer | |
| 69 | |
| 70 ##Profile options | |
| 71 ##--pca Pseudo count admixture strength [] | |
| 72 ##--pcb Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) [] | |
| 73 ##(no profile option is exposed by this wrapper) | |
| 74 ##Misc options | |
| 75 --rescore-mode $misc.rescore_mode | |
| 76 --dbtype $alph_type.dbtype | |
| 77 --shuffle $misc.shuffle | |
| 78 --id-offset $misc.id_offset | |
| 79 | |
| 80 ##Common options | |
| 81 ##--compressed INT Write compressed output [0] | |
| 82 --threads "\${GALAXY_SLOTS:-1}" | |
| 83 ##-v INT Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3] | |
| 84 --max-seq-len $common.max_seq_len | |
| 85 ##--db-load-mode INT Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0] | |
| 86 ##--mpi-runner STR Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") [] | |
| 87 ##--force-reuse BOOL Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0] | |
| 88 ##--remove-tmp-files BOOL Delete temporary files [0] | |
| 89 ##Outputs are written with the 'result' prefix into the job working directory | |
| 90 ##Expert options | |
| 91 --filter-hits $expert.filter_hits | |
| 92 --sort-results $expert.sort_results | |
| 93 ##--create-lookup INT Create database lookup file (can be very large) [0] | |
| 94 ]]></command> | |
| 95 <inputs> | |
| 96 <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" /> | |
| 97 <conditional name="alph_type"> | |
| 98 <param argument="--dbtype" type="select" label="Input data type" help="" > | |
| 99 <option value="0" selected="true">Automatic</option> | |
| 100 <option value="1">Amino acid</option> | |
| 101 <option value="2">Nucleotides</option> | |
| 102 </param> | |
| 103 <when value="0"/> | |
| 104 <when value="1"> | |
| 105 <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/> | |
| 106 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/> | |
| 107 </when> | |
| 108 <when value="2"> | |
| 109 <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/> | |
| 110 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/> | |
| 111 <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/> | |
| 112 </when> | |
| 113 </conditional> | |
| 114 <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/> | |
| 115 <param argument="--cov-mode" type="select" label="Coverage mode" help="" > | |
| 116 <option value="0" selected="true">Coverage of query and target</option> | |
| 117 <option value="1">Coverage of target</option> | |
| 118 <option value="2">Coverage of query</option> | |
| 119 <option value="3">Target seq. length has to be at least x% of query length</option> | |
| 120 <option value="4">Query seq. length has to be at least x% of target length</option> | |
| 121 <option value="5">Short seq. needs to be at least x% of the other seq. length</option> | |
| 122 </param> | |
| 123 <param argument="-c" name="cov" type="float" min="0" max="1" value="0.800" label="List matches above this fraction of aligned (covered) residues" help="Fraction between 0 and 1"/> | |
| 124 <section name="prefilter" title="Pre-filter"> | |
| 125 <expand macro="prefilter_common_parameters" /> | |
| 126 <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help=""> | |
| 127 <option value="0" selected="true">Use consecutive positions in k-mers</option> | |
| 128 <option value="1">Use spaced k-mers</option> | |
| 129 </param> | |
| 130 </section> | |
| 131 <section name="align" title="Align"> | |
| 132 <expand macro="align_common_parameters" /> | |
| 133 <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" > | |
| 134 <option value="0" selected="true">Automatic</option> | |
| 135 <option value="1">Only score and end_pos</option> | |
| 136 <option value="2">Also start_pos and cov</option> | |
| 137 <option value="3">Also seq.id</option> | |
| 138 <option value="4">Only ungapped alignment</option> | |
| 139 </param> | |
| 140 <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/> | |
| 141 <param argument="--max-rejected" type="integer" min="0" value="2147483647" optional="true" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/> | |
| 142 <param argument="--max-accept" type="integer" min="0" value="2147483647" optional="true" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/> | |
| 143 </section> | |
| 144 <section name="cluster" title="Clustering"> | |
| 145 <param argument="--cluster-mode" type="select" label="Cluster mode" help="" > | |
| 146 <option value="0" selected="true">Set-Cover (greedy)</option> | |
| 147 <option value="1">Connected component (BLASTclust)</option> | |
| 148 <option value="2">Greedy clustering by sequence length (CDHIT)</option> | |
| 149 </param> | |
| 150 <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/> | |
| 151 <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" > | |
| 152 <option value="1">Alignment score</option> | |
| 153 <option value="2" selected="true">Sequence identity</option> | |
| 154 </param> | |
| 155 </section> | |
| 156 <section name="kmermatcher" title="K-mer matcher"> | |
| 157 <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster prioritization" help=""/> | |
| 158 <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/> | |
| 159 <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/> | |
| 160 <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/> | |
| 161 <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/> | |
| 162 </section> | |
| 163 <section name="misc" title="Misc"> | |
| 164 <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" > | |
| 165 <option value="0" selected="true">Hamming distance</option> | |
| 166 <option value="1">Local alignment (score only)</option> | |
| 167 <option value="2">Local alignment</option> | |
| 168 <option value="3">Global alignment</option> | |
| 169 <option value="4">Longest alignment fulfilling window quality criterion</option> | |
| 170 </param> | |
| 171 <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/> | |
| 172 <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/> | |
| 173 </section> | |
| 174 <expand macro="common_section"/> | |
| 175 <section name="expert" title="Expert"> | |
| 176 <expand macro="expert_common_parameters" /> | |
| 177 </section> | |
| 178 <section name="output_files" title="Selection of the output files"> | |
| 179 <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection"> | |
| 180 <option value="file_rep_seq" selected="true">Representative sequences in fasta</option> | |
| 181 <option value="file_all_seq" selected="true">FASTA-like per cluster</option> | |
| 182 <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option> | |
| 183 </param> | |
| 184 </section> | |
| 185 </inputs> | |
| 186 <outputs> | |
| 187 <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" > | |
| 188 <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter> | |
| 189 </data> | |
| 190 <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" > | |
| 191 <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter> | |
| 192 </data> | |
| 193 <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list"> | |
| 194 <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter> | |
| 195 </data> | |
| 196 </outputs> | |
| 197 <tests> | |
| 198 <test expect_num_outputs="3"> | |
| 199 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/> | |
| 200 <conditional name="alph_type"> | |
| 201 <param name="dbtype" value="2"/> | |
| 202 </conditional> | |
| 203 <output name="output_rep_seq" ftype="fasta"> | |
| 204 <assert_contents> | |
| 205 <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/> | |
| 206 <has_size value="551000" delta="50000"/> | |
| 207 </assert_contents> | |
| 208 </output> | |
| 209 <output name="output_all_seq" ftype="fasta"> | |
| 210 <assert_contents> | |
| 211 <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/> | |
| 212 <has_size value="627000" delta="20000"/> | |
| 213 <has_n_lines n="14806" delta="500"/> | |
| 214 </assert_contents> | |
| 215 </output> | |
| 216 <output name="output_cluster" ftype="tabular"> | |
| 217 <assert_contents> | |
| 218 <has_line line="MYSTERY.13&#009;MYSTERY.13"/> | |
| 219 <has_n_columns n="2"/> | |
| 220 <has_size value="113000" delta="50000"/> | |
| 221 </assert_contents> | |
| 222 </output> | |
| 223 </test> | |
| 224 </tests> | |
| 225 <help><![CDATA[ | |
| 226 **MMseqs2: ultra fast and sensitive sequence search and clustering suite** | |
| 227 | |
| 228 MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. | |
| 229 MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. | |
| 230 The software is designed to run on multiple cores and servers and exhibits very good scalability. | |
| 231 MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. | |
| 232 It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed. | |
| 233 | |
| 234 **Usage** | |
| 235 MMseqs easy-linclust is useful for clustering entries from a FASTA/FASTQ file using the cascaded clustering algorithm. | |
| 236 It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently. | |
| 237 | |
| 238 https://github.com/soedinglab/MMseqs2 | |
| 239 | |
| 240 ]]></help> | |
| 241 <expand macro="citations"/> | |
| 242 </tool> |
