Mercurial > repos > iuc > lexicmap

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lexicmap-index.xml	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,81 @@
+<tool id="lexicmap_index" name="LexicMap Index" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE_VERSION@">
+    <description>Builds LexicMap index</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="bio_tools"/>
+    <expand macro="requirements"/>
+
+    <command detect_errors="exit_code"><![CDATA[
+## The multi-dataset input renders as a comma-separated path list; write one path per line for -X.
+echo '$input_genomes' | sed 's/,/\n/g' > fasta_list.tsv &&
+mkdir '$lexicmap_index.extra_files_path' &&
+lexicmap index
+
+    --threads "\${GALAXY_SLOTS:-1}"
+## Options declared in the "advanced_settings" section are referenced by their full path.
+    -X ./fasta_list.tsv -O '$lexicmap_index.extra_files_path'
+    --big-genomes '$lexicmap_big_genomes'
+    --batch-size '$advanced_settings.batch_size'
+    --contig-interval '$advanced_settings.contig_interval'
+    --kmer '$advanced_settings.kmer'
+    --masks '$advanced_settings.masks'
+    --max-genome '$advanced_settings.max_genome'
+    --min-seq-len '$advanced_settings.min_seq_len'
+    --rand-seed '$advanced_settings.rand_seed'
+    --seed-in-desert-dist '$advanced_settings.seed_in_desert_dist'
+    --seed-max-desert '$advanced_settings.seed_max_desert'
+    #if $advanced_settings.mask_file
+        --mask-file '$advanced_settings.mask_file'
+    #end if
+
+    ]]></command>
+    <inputs>
+        <param argument="--input-genomes" format="@FASTA_TYPES@" type="data" optional="false" multiple="true" label="FASTA files" help="Should be of datatype &quot;fasta.gz&quot; or &quot;fasta&quot;" />
+        <section name="advanced_settings" title="Advanced settings" expanded="false"> <!-- optional tuning flags of "lexicmap index"; defaults come from the value attributes -->
+            <param argument="--batch-size" value="5000" max="131072" type="integer" label="Batch Size" help="Maximum number of genomes in each batch (maximum value: 131072)" />
+            <param argument="--contig-interval" min="1000" value="1000" type="integer" label="Contig interval" help="Length of interval (N's) between contigs in a genome. It can't be too small (&lt;1000) or some alignments might be fragmented." />
+            <param argument="--kmer" value="31" type="integer" min="2" max="32" label="Max k-mer size" help="Maximum k-mer size. K needs to be &lt;= 32." />
+            <param argument="--mask-file" format="txt" type="data" optional="true" label="Mask file" help="File of custom masks. This flag overrides -k/--kmer, -m/--masks, -s/--rand-seed etc." />
+            <param argument="--masks" min="1" value="20000" type="integer" label="LexicHash masks" help="Number of LexicHash masks." />
+            <param argument="--max-genome" value="15000000" max="268435456" type="integer" label="Max genome size" help="Maximum genome size. Genomes with any single contig larger than the threshold will be skipped, while fragmented (with many contigs) genomes larger than the threshold will be split into chunks and alignments from these chunks will be merged in &quot;lexicmap search&quot;. The value needs to be smaller than the maximum supported genome size: 268435456." />
+            <param argument="--min-seq-len" value="-1" type="integer" label="Min sequence length" help="Minimum sequence length to index. For values &lt;= 0 the k-mer size (--kmer) is used instead." />
+            <param argument="--rand-seed" value="1" type="integer" label="Rand seed" help="Rand seed for generating random masks." />
+            <param argument="--seed-in-desert-dist" value="50" type="integer" label="Seed in desert dist" help="Distance of k-mers to fill deserts." />
+            <param argument="--seed-max-desert" value="100" type="integer" label="Seed max desert" help="Maximum length of sketching deserts, or maximum seed distance. Deserts with seed distance larger than this value will be filled by choosing k-mers roughly every --seed-in-desert-dist bases." />
+        </section>
+    </inputs>
+    <outputs> <!-- Both datasets are filled by the "lexicmap index" call in the command section. -->
+        <data name="lexicmap_index" format="lexicmap_index" label="index data" /> <!-- the command creates this dataset's extra_files_path and passes it to -O as the index directory -->
+        <data name="lexicmap_big_genomes" format="tsv" label="out file with metrics about skipped genomes" /> <!-- file handed to the big-genomes option of the command -->
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2"> <!-- Index two viral genomes with a lowered max_genome threshold and check both outputs. -->
+            <param name="input_genomes" value="genomes/GCF_001502155.1_ViralProj307776_genomic.fna.gz,genomes/GCF_001502175.1_ViralProj307780_genomic.fna.gz" />
+            <section name="advanced_settings">
+                <param name="max_genome" value="10000" /> <!-- presumably small enough that one input is skipped: the expected info.toml records input-genomes = 1 -->
+            </section>
+            <output name="lexicmap_index" ftype="lexicmap_index"> <!-- files below are looked up inside the index dataset's extra files -->
+                <extra_files name="genomes.chunks.bin" value="db.lmi/genomes.chunks.bin" />
+                <extra_files name="info.toml" value="db.lmi/info.toml" lines_diff="2"/> <!-- up to two differing lines are tolerated -->
+                <extra_files name="masks.bin" value="db.lmi/masks.bin" />
+                <extra_files name="genomes.map.bin"> <!-- only the size is asserted, not the content -->
+                    <assert_contents>
+                        <has_size value="54" />
+                    </assert_contents>
+                </extra_files>
+                <expand macro="genomes_batch" /> <!-- size assertions for genomes/batch_0000, defined in macros.xml -->
+                <expand macro="seeds" /> <!-- size assertions for seeds/chunk_000, defined in macros.xml -->
+            </output>
+            <output name="lexicmap_big_genomes" ftype="tsv" file="lexicmap_big_genomes.tsv" compare="re_match" /> <!-- expected file holds a regular expression because the dataset path varies -->
+        </test>
+    </tests>
+    <help><![CDATA[
+
+    Build a LexicMap index from one or more genome FASTA files. For more information about settings
+    please visit: https://bioinf.shenwei.me/LexicMap/usage/index/
+
+    @info@
+        ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lexicmap.xml	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,160 @@
+<tool id="lexicmap_search" name="LexicMap Search" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE_VERSION@">
+    <description>nucleotide sequence tool for querying genomes</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="bio_tools"/>
+    <expand macro="requirements"/>
+
+    <command detect_errors="exit_code"><![CDATA[
+## Options declared in the "advanced_settings" section are referenced by their full path.
+lexicmap search
+
+    --threads "\${GALAXY_SLOTS:-1}"
+
+    ${advanced_settings.load_whole_seeds}
+    ${advanced_settings.all}
+
+    #if $db_opts.db_opts_selector == "histdb"
+        --index '${db_opts.histdb.extra_files_path}'
+    #else
+        --index '${db_opts.lexicmap_index.fields.path}'
+    #end if
+## "query" accepts several datasets: emit each one as its own quoted positional argument.
+    #for $q in $query
+        '$q'
+    #end for
+    --out-file '$out_file'
+    --top-n-genomes '$top_n_genomes'
+    --align-band '$advanced_settings.align_band'
+    --align-ext-len '$advanced_settings.align_ext_len'
+    --align-max-gap '$advanced_settings.align_max_gap'
+    --align-min-match-len '$advanced_settings.align_min_match_len'
+    --align-min-match-pident '$advanced_settings.align_min_match_pident'
+    --max-evalue '$advanced_settings.max_evalue'
+    --max-query-conc '$advanced_settings.max_query_conc'
+    --seed-max-dist '$advanced_settings.seed_max_dist'
+    --seed-max-gap '$advanced_settings.seed_max_gap'
+    --seed-min-prefix '$advanced_settings.seed_min_prefix'
+    --seed-min-single-prefix '$advanced_settings.seed_min_single_prefix'
+    #if $advanced_settings.min_qcov_per_genome
+        --min-qcov-per-genome '$advanced_settings.min_qcov_per_genome'
+    #end if
+
+    #if $advanced_settings.min_qcov_per_hsp
+        --min-qcov-per-hsp '$advanced_settings.min_qcov_per_hsp'
+    #end if
+
+    ]]></command>
+    <inputs>
+        <param name="query" type="data" format="@FASTA_TYPES@" label="LexicMap query file" multiple="true" help="One or more FASTA files (optionally gzip-compressed) with the query sequences."/>
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="LexicMap index source">
+              <option value="histdb" selected="true">From your history</option>
+              <option value="db">Locally installed LexicMap indexes</option>
+            </param>
+            <when value="histdb">
+                <param name="histdb" type="data" format="lexicmap_index" optional="false" label="LexicMap index" />
+            </when>
+            <when value="db">
+                <param name="lexicmap_index" type="select" optional="false" label="LexicMap index file">
+                    <options from_data_table="lexicmap_index"/>
+                </param>
+            </when>
+        </conditional>
+        <param argument="--top-n-genomes" type="integer" value="0" label="Keep top N genome matches for a query (0 for all)" />
+        <section name="advanced_settings" title="Advanced settings" expanded="false"> <!-- optional tuning flags of "lexicmap search"; defaults come from the value attributes -->
+            <param argument="--align-band" value="100" type="integer" label="Align band" help="Band size in backtracking the score matrix (pseudo alignment)." />
+            <param argument="--align-ext-len" min="0" value="1000" type="integer" label="Align extend length" help="Extend length of upstream and downstream of seed regions, for extracting query and target sequences for alignment. It should be &lt;= contig interval length in database." />
+            <param argument="--align-max-gap" value="20" type="integer" label="Align max gap" help="Maximum gap in a HSP segment." />
+            <param argument="--align-min-match-len" value="50" type="integer" label="Align min match length" help="Minimum aligned length in a HSP segment." />
+            <param argument="--align-min-match-pident" value="70" type="float" label="Align min match pident" help="Minimum base identity (percentage) in a HSP segment." />
+            <param argument="--all" type="boolean" truevalue="--all" falsevalue="" checked="false" label="Output all columns" help="Output more columns, e.g., matched sequences. Use this if you want to output blast-style format with 'lexicmap utils 2blast'." />
+            <param argument="--load-whole-seeds" type="boolean" truevalue="--load-whole-seeds" falsevalue="" checked="false" label="Load whole seeds" help="Load the whole seed data into memory for faster search" />
+            <param argument="--max-evalue" value="10" type="float" label="Max evalue" help="Maximum evalue of a HSP segment." />
+            <param argument="--max-query-conc" value="12" type="integer" label="Max query conc" help="Maximum number of concurrent queries. Bigger values do not improve the batch searching speed and consume much memory." />
+            <param argument="--min-qcov-per-genome" type="float" optional="true" label="Min query coverage per genome" help="Minimum query coverage (percentage) per genome." />
+            <param argument="--min-qcov-per-hsp" type="float" optional="true" label="Min query coverage per HSP" help="Minimum query coverage (percentage) per HSP." />
+            <param argument="--seed-max-dist" value="1000" type="integer" label="Seed max dist" help="Maximum distance between seeds in seed chaining. It should be &lt;= contig interval length in database." />
+            <param argument="--seed-max-gap" value="1000" type="integer" label="Seed max gap" help="Maximum gap in seed chaining." />
+            <param argument="--seed-min-prefix" value="15" type="integer" label="Seed min prefix" help="Minimum (prefix/suffix) length of matched seeds (anchors)." />
+            <param argument="--seed-min-single-prefix" value="17" type="integer" label="Seed min single prefix" help="Minimum (prefix/suffix) length of matched seeds (anchors) if there's only one pair of seeds matched." />
+        </section>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="tabular"> <!-- result table written by the out-file option of "lexicmap search" -->
+            <actions> <!-- attach column names; the list is longer when the "all" option is switched on -->
+                <conditional name="advanced_settings.all">
+                    <when value="true">
+                        <action name="column_names" type="metadata" default="query,qlen,hits,sgenome,sseqid,qcovGnm,cls,hsp,qcovHSP,alenHSP,pident,gaps,qstart,qend,sstart,send,sstr,slen,evalue,bitscore,cigar,qseq,sseq,align" />
+                    </when>
+                    <when value="false">
+                        <action name="column_names" type="metadata" default="query,qlen,hits,sgenome,sseqid,qcovGnm,cls,hsp,qcovHSP,alenHSP,pident,gaps,qstart,qend,sstart,send,sstr,slen,evalue,bitscore" />
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1"> <!-- Search against an index selected from the lexicmap_index data table. -->
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="db"/>
+                <param name="lexicmap_index" value="LexicMapIndex1" /> <!-- value column of the entry in test-data/lexicmap_index.loc -->
+            </conditional>
+            <param name="query" value="lexicmap_query.fasta.gz" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result.tsv" />
+        </test>
+        <test expect_num_outputs="1"> <!-- Same search, but with the index supplied as a history dataset. -->
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="histdb"/>
+                <param name="histdb" ftype="lexicmap_index" class="Directory" value="db.lmi" /> <!-- the test-data directory db.lmi is supplied as the index dataset -->
+            </conditional>
+            <param name="top_n_genomes" value="0" />
+            <param name="query" value="lexicmap_query.fasta.gz" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result.tsv" /> <!-- both tests expect the identical result table -->
+        </test>
+    </tests>
+    <help><![CDATA[
+
+    Search sequences against a LexicMap index database. For more information about settings
+    please visit: https://bioinf.shenwei.me/LexicMap/usage/search
+
+    Output format:
+    Tab-delimited format with 20+ columns, with 1-based positions.
+
+    1.  query,    Query sequence ID.
+    2.  qlen,     Query sequence length.
+    3.  hits,     Number of subject genomes.
+    4.  sgenome,  Subject genome ID.
+    5.  sseqid,   Subject sequence ID.
+    6.  qcovGnm,  Query coverage (percentage) per genome: $(aligned bases in the genome)/$qlen.
+    7.  cls,      Nth HSP cluster in the genome. (just for improving readability)
+                  It's useful to show if multiple adjacent HSPs are collinear.
+    8.  hsp,      Nth HSP in the genome.         (just for improving readability)
+    9.  qcovHSP,  Query coverage (percentage) per HSP: $(aligned bases in a HSP)/$qlen.
+    10. alenHSP,  Aligned length in the current HSP.
+    11. pident,   Percentage of identical matches in the current HSP.
+    12. gaps,     Gaps in the current HSP.
+    13. qstart,   Start of alignment in query sequence.
+    14. qend,     End of alignment in query sequence.
+    15. sstart,   Start of alignment in subject sequence.
+    16. send,     End of alignment in subject sequence.
+    17. sstr,     Subject strand.
+    18. slen,     Subject sequence length.
+    19. evalue,   Expect value.
+    20. bitscore, Bit score.
+    21. cigar,    CIGAR string of the alignment.                      (optional with --all)
+    22. qseq,     Aligned part of query sequence.                     (optional with --all)
+    23. sseq,     Aligned part of subject sequence.                   (optional with --all)
+    24. align,    Alignment text ("|" and " ") between qseq and sseq. (optional with --all)
+
+    @info@
+        ]]></help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,45 @@
+<macros> <!-- Shared tokens and XML snippets imported by lexicmap-index.xml and lexicmap.xml. -->
+    <token name="@TOOL_VERSION@">0.7.0</token> <!-- first part of the tool version; also the version of the lexicmap package requirement -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- appended after "+galaxy" in the version attribute of both tools -->
+    <token name="@PROFILE_VERSION@">25.0</token> <!-- value of the profile attribute of both tools -->
+    <token name="@FASTA_TYPES@">fasta.gz,fasta</token> <!-- accepted formats of the index tool's genome input -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">lexicmap</requirement>
+        </requirements>
+    </xml>
+    <xml name="bio_tools">
+        <xrefs>
+            <xref type="bio.tools">lexicmap</xref>
+        </xrefs>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/2024.08.30.610459</citation>
+        </citations>
+    </xml>
+    <xml name="genomes_batch" type="list"> <!-- size checks for genomes/batch_0000 in the index test; NOTE(review): type="list" looks unused, confirm it can be dropped -->
+        <extra_files name="genomes/batch_0000/genomes.bin.idx">
+            <assert_contents>
+                <has_size value="36"/>
+            </assert_contents>
+        </extra_files>
+        <extra_files name="genomes/batch_0000/genomes.bin">
+            <assert_contents>
+                <has_size value="1954"/>
+            </assert_contents>
+        </extra_files>
+    </xml>
+    <xml name="seeds" type="list"> <!-- size checks for seeds/chunk_000 in the index test; sizes may deviate by the given delta -->
+        <extra_files name="seeds/chunk_000.bin.idx">
+            <assert_contents>
+                <has_size value="177744" delta="5000"/>
+            </assert_contents>
+        </extra_files>
+        <extra_files name="seeds/chunk_000.bin" >
+            <assert_contents>
+                <has_size value="133749" delta="5000"/>
+            </assert_contents>
+        </extra_files>
+    </xml>
+</macros> <!-- NOTE(review): both tool help sections use an "info" token that is not defined in this file; confirm where it should come from -->
Binary file test-data/db.lmi/genomes.map.bin has changed
Binary file test-data/db.lmi/genomes/batch_0000/genomes.bin has changed
Binary file test-data/db.lmi/genomes/batch_0000/genomes.bin.idx has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/db.lmi/info.toml	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,24 @@
+# Index format
+main-version = 3
+minor-version = 4
+# LexicHash
+max-K = 31
+masks = 20000
+rand-seed = 1
+# Seed distance
+max-seed-dist = 100
+seed-dist-in-desert = 50
+# Seeds (k-mer-value data) files
+chunks = 2
+index-partitions = 4096
+# Input genomes
+input-genomes = 1
+# Input bases
+input-bases = 7417
+# Genome data.
+# 'genomes' might be larger than 'input-genomes', as some big fragmented genomes are split into multiple chunks.
+# In this case, 'genome-batch-size' is not accurate, being variable in different batches.
+genomes = 1
+genome-batch-size = 1
+genome-batches = 1
+contig-interval = 1000
Binary file test-data/db.lmi/masks.bin has changed
Binary file test-data/db.lmi/seeds/chunk_000.bin has changed
Binary file test-data/db.lmi/seeds/chunk_000.bin.idx has changed
Binary file test-data/db.lmi/seeds/chunk_001.bin has changed
Binary file test-data/db.lmi/seeds/chunk_001.bin.idx has changed
Binary file test-data/genomes/GCF_001502155.1_ViralProj307776_genomic.fna.gz has changed
Binary file test-data/genomes/GCF_001502175.1_ViralProj307780_genomic.fna.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_big_genomes.tsv	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,1 @@
+/[A-Za-z0-9_/]*/[A-Za-z0-9/_-]*\.dat	too_large_genome
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_index.loc	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,4 @@
+# This file is just a placeholder since Galaxy does
+# not yet support uploading a lexicmap index, which
+# is required for functional tests.
+LexicMapIndex1	LexicMapIndex1	${__HERE__}/db.lmi
\ No newline at end of file
Binary file test-data/lexicmap_query.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query_result.tsv	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,2 @@
+query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
+query1	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/lexicmap_index.loc.xml	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,6 @@
+<tables> <!-- Declares the lexicmap_index data table that the search tool reads its installed indexes from. -->
+    <table name="lexicmap_index" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns> <!-- the search command reads the path column as fields.path for its index option -->
+        <file path="tool-data/lexicmap_index.loc"/>
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,6 @@
+<tables> <!-- Sample registration of the lexicmap_index data table. -->
+    <table name="lexicmap_index" comment_char="#" allow_duplicate_entries="False"> <!-- same name as from_data_table in the search tool's index select -->
+        <columns>value, name, path</columns>
+        <file path="tool-data/lexicmap_index.loc"/>
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue Sep 16 13:52:03 2025 +0000
@@ -0,0 +1,6 @@
+<tables> <!-- Test-time registration of the lexicmap_index data table. -->
+    <table name="lexicmap_index" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/lexicmap_index.loc"/> <!-- points at the placeholder loc file under test-data; presumably __HERE__ resolves to this file's directory, TODO confirm -->
+    </table>
+</tables>