Mercurial > repos > iuc > lexicmap

--- a/lexicmap.xml	Thu Sep 18 11:27:52 2025 +0000
+++ b/lexicmap.xml	Fri Sep 26 20:47:13 2025 +0000
@@ -7,48 +7,82 @@
     <expand macro="requirements"/>

     <command detect_errors="exit_code"><![CDATA[
-
-lexicmap search
-
-    --threads "\${GALAXY_SLOTS:-1}"
-
-    ${load_whole_seeds}
-    ${all}
+#if $db_opts.db_opts_selector == "histdb"
+    #set INDICES = [db.extra_files_path for db in $db_opts.histdb]
+#else:
+    #set INDICES = $db_opts.lexicmap_index.fields.path.split(",")
+#end if

-    #if $db_opts.db_opts_selector == "histdb"
-        --index '${db_opts.histdb.extra_files_path}'
-    #else:
-        --index '${db_opts.lexicmap_index.fields.path}'
-    #end if
+extract_query_ids() {
+    local input_files="\$1";
+    local query_ids="";
+    IFS=',' read -ra files <<< "\$input_files";
+    query_ids="";
+    for query_file in "\${files[@]}"; do
+        if file --mime-type "\$query_file" | grep -q "gzip"; then
+            query_ids+=\$(zcat "\$query_file" | grep '^>' | while IFS= read -r line; do clean="\${line#>}"; echo "\${clean%% *}>"; done);
+        else
+            query_ids+=\$(cat "\$query_file" | grep '^>' | while IFS= read -r line; do clean="\${line#>}"; echo "\${clean%% *}>"; done);
+        fi
+    done;
+    declare -g -a query_array=();
+    IFS='>' read -r -a query_array <<< "\$query_ids";
+}
+&&
+#for $counter, $index in enumerate($INDICES):
+    lexicmap search

-    #for $q in $query
-        '$q'
-    #end for
+        --threads "\${GALAXY_SLOTS:-1}"
+
+        ${load_whole_seeds}
+        ${all}

-    --out-file '$out_file'
+        --index '${index}'
+
+        #for $q in $query
+            '$q'
+        #end for
+
+        --out-file 'lexicmap_search_result__index${counter}.tsv'

-    --top-n-genomes '$top_n_genomes'
+        --top-n-genomes '$top_n_genomes'
+
+        --align-band '$align_band'
+        --align-ext-len '$align_ext_len'
+        --align-max-gap '$align_max_gap'
+        --align-min-match-len '$align_min_match_len'
+        --align-min-match-pident '$align_min_match_pident'
+        --max-evalue '$max_evalue'
+        --max-query-conc '$max_query_conc'
+        --seed-max-dist '$seed_max_dist'
+        --seed-max-gap '$seed_max_gap'
+        --seed-min-prefix '$seed_min_prefix'
+        --seed-min-single-prefix '$seed_min_single_prefix'
+
+        #if $min_qcov_per_genome
+            --min-qcov-per-genome '$min_qcov_per_genome'
+        #end if

-    --align-band '$align_band'
-    --align-ext-len '$align_ext_len'
-    --align-max-gap '$align_max_gap'
-    --align-min-match-len '$align_min_match_len'
-    --align-min-match-pident '$align_min_match_pident'
-    --max-evalue '$max_evalue'
-    --max-query-conc '$max_query_conc'
-    --seed-max-dist '$seed_max_dist'
-    --seed-max-gap '$seed_max_gap'
-    --seed-min-prefix '$seed_min_prefix'
-    --seed-min-single-prefix '$seed_min_single_prefix'
+        #if $min_qcov_per_hsp
+            --min-qcov-per-hsp '$min_qcov_per_hsp'
+        #end if
+        &&
+#end for

-    #if $min_qcov_per_genome
-        --min-qcov-per-genome '$min_qcov_per_genome'
-    #end if
-
-    #if $min_qcov_per_hsp
-        --min-qcov-per-hsp '$min_qcov_per_hsp'
-    #end if
-
+## More than one index was searched: merge the per-index result files once per
+## query id (ids come from extract_query_ids, which fills query_array), writing
+## combined_result.N.tsv for the N-th id. The per-query tables are then
+## concatenated into the output; awk keeps the first line and drops every later
+## line that starts with the query/qlen/hits header.
+#if len($INDICES) > 1
+    counter=0 &&
+    extract_query_ids '$query' &&
+    for ((i=0; i<\${#query_array[@]}; i++)); do
+        counter=\$((counter + 1));
+        lexicmap utils merge-search-results
+            --out-file "combined_result.\${counter}.tsv"
+            -q "\${query_array[\$i]}" lexicmap_search_result__index*.tsv
+            -j "\${GALAXY_SLOTS:-1}";
+    done &&
+    cat combined_result.*.tsv | awk 'NR==1 || $0 !~ /^query\tqlen\thits/' > '$out_file'
+#else
+    ## Only one index was searched: its result file becomes the output as is.
+    mv lexicmap_search_result__index0.tsv '$out_file'
+#end if
     ]]></command>
     <inputs>
         <param name="query" type="data" format="fasta.gz" label="LexicMap query file" multiple="true"  help=""/>
@@ -58,10 +92,10 @@
               <option value="db">Locally installed LexicMap indexes</option>
             </param>
             <when value="histdb">
-                <param name="histdb" type="data" format="lexicmap_index" optional="false" label="LexicMap index" />
+                <param name="histdb" type="data" format="lexicmap_index" optional="false" multiple="true" label="LexicMap index" />
             </when>
             <when value="db">
-                <param name="lexicmap_index" type="select" optional="false" label="LexicMap index file">
+                <param name="lexicmap_index" type="select" optional="false" multiple="true" label="LexicMap index file">
                     <options from_data_table="lexicmap_index"/>
                 </param>
             </when>
@@ -100,7 +134,7 @@
         </data>
     </outputs>
     <tests>
-        <!-- Test 1 - query a local index with one query -->
+        <!-- Test 1 - query one local index with one query -->
         <test expect_num_outputs="1">
             <conditional name="db_opts">
                 <param name="db_opts_selector" value="db"/>
@@ -112,7 +146,7 @@
             </section>
             <output name="out_file" value="lexicmap_query_result.tsv" />
         </test>
-        <!-- Test 2 - query a local index with multiple query files -->
+        <!-- Test 2 - query one local index with multiple query files -->
         <test expect_num_outputs="1">
             <conditional name="db_opts">
                 <param name="db_opts_selector" value="db"/>
@@ -124,7 +158,56 @@
             </section>
             <output name="out_file" value="lexicmap_query_result2.tsv" />
         </test>
-        <!-- Test 3 - query a  index found in the history with one query -->
+        <!-- Test 3 - query two local indexes with one query file -->
+        <test expect_num_outputs="1">
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="db"/>
+                <param name="lexicmap_index" value="LexicMapIndexCombined" />
+            </conditional>
+            <param name="query" value="lexicmap_query.fasta.gz" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result.tsv" />
+        </test>
+        <!-- Test 4 - query two local indexes with multiple query files -->
+        <test expect_num_outputs="1">
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="db"/>
+                <param name="lexicmap_index" value="LexicMapIndexCombined" />
+            </conditional>
+            <param name="query" value="lexicmap_query.fasta.gz,lexicmap_query2.fasta.gz,lexicmap_query3.fasta" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result4.tsv" />
+        </test>
+        <!-- Test 5 - query one local index with multiple query files, where only one query will get hits -->
+        <test expect_num_outputs="1">
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="db"/>
+                <param name="lexicmap_index" value="LexicMapIndex2" />
+            </conditional>
+            <param name="query" value="lexicmap_query.fasta.gz,lexicmap_query2.fasta.gz,lexicmap_query3.fasta" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result3.tsv" />
+        </test>
+        <!-- Test 6 - query multiple local indexes with multiple query files -->
+        <test expect_num_outputs="1">
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="db"/>
+                <param name="lexicmap_index" value="LexicMapIndex1,LexicMapIndex2,LexicMapIndexCombined" />
+            </conditional>
+
+            <param name="query" value="lexicmap_query.fasta.gz,lexicmap_query2.fasta.gz,lexicmap_query3.fasta" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result6.tsv" />
+        </test>
+        <!-- Test 7 - query one index found in the history with one query -->
         <test expect_num_outputs="1">
             <conditional name="db_opts">
                 <param name="db_opts_selector" value="histdb"/>
@@ -137,6 +220,19 @@
             </section>
             <output name="out_file" value="lexicmap_query_result.tsv" />
         </test>
+        <!-- Test 8 - query two indexes found in the history with two query files -->
+        <test expect_num_outputs="1">
+            <conditional name="db_opts">
+                <param name="db_opts_selector" value="histdb"/>
+                <param name="histdb" ftype="lexicmap_index" class="Directory" value="db.lmi,db2.lmi" />
+            </conditional>
+            <param name="top_n_genomes" value="0" />
+            <param name="query" value="lexicmap_query.fasta.gz,lexicmap_query3.fasta" />
+            <section name="advanced_settings">
+                <param name="load_whole_seeds" value="true" />
+            </section>
+            <output name="out_file" value="lexicmap_query_result5.tsv" />
+        </test>
     </tests>
     <help><![CDATA[

@@ -172,6 +268,11 @@
     23. sseq,     Aligned part of subject sequence.                   (optional with --all)
     24. align,    Alignment text ("|" and " ") between qseq and sseq. (optional with --all)

+    When searching against multiple indexes, lexicmap utils merge-search-results is used to
+    merge the search results. For more information, please visit:
+    https://bioinf.shenwei.me/LexicMap/usage/utils/merge-search-results/
+
+    Note: if a query ID contains spaces, only the part before the first space is kept as the query ID.
     @info@
         ]]></help>
     <expand macro="citations" />
--- a/macros.xml	Thu Sep 18 11:27:52 2025 +0000
+++ b/macros.xml	Fri Sep 26 20:47:13 2025 +0000
@@ -1,11 +1,12 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.7.0</token>
-    <token name="@VERSION_SUFFIX@">1</token>
+    <token name="@TOOL_VERSION@">0.8.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE_VERSION@">25.0</token>
     <token name="@FASTA_TYPES@">fasta.gz,fasta</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">lexicmap</requirement>
+            <requirement type="package" version="5.46">file</requirement>
         </requirements>
     </xml>
     <xml name="bio_tools">
Binary file test-data/db2.lmi/genomes.map.bin has changed
Binary file test-data/db2.lmi/genomes/batch_0000/genomes.bin has changed
Binary file test-data/db2.lmi/genomes/batch_0000/genomes.bin.idx has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/db2.lmi/info.toml	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,24 @@
+# Index format
+main-version = 3
+minor-version = 4
+# LexicHash
+max-K = 31
+masks = 20000
+rand-seed = 1
+# Seed distance
+max-seed-dist = 100
+seed-dist-in-desert = 50
+# Seeds (k-mer-value data) files
+chunks = 2
+index-partitions = 4096
+# Input genomes
+input-genomes = 1
+# Input bases
+input-bases = 14243
+# Genome data.
+# 'genomes' might be larger than 'input-genomes', as some big fragmented genomes are split into multiple chunks.
+# In this case, 'genome-batch-size' is not accurate, being variable in different batches.
+genomes = 1
+genome-batch-size = 1
+genome-batches = 1
+contig-interval = 1000
Binary file test-data/db2.lmi/masks.bin has changed
Binary file test-data/db2.lmi/seeds/chunk_000.bin has changed
Binary file test-data/db2.lmi/seeds/chunk_000.bin.idx has changed
Binary file test-data/db2.lmi/seeds/chunk_001.bin has changed
Binary file test-data/db2.lmi/seeds/chunk_001.bin.idx has changed
--- a/test-data/lexicmap_index.loc	Thu Sep 18 11:27:52 2025 +0000
+++ b/test-data/lexicmap_index.loc	Fri Sep 26 20:47:13 2025 +0000
@@ -1,4 +1,6 @@
 # This file is just a placeholder since Galaxy does
 # not yet support uploading a lexicmap index, which
 # is required for functional tests.
-LexicMapIndex1	LexicMapIndex1	${__HERE__}/db.lmi
\ No newline at end of file
+LexicMapIndex1	LexicMapIndex1	${__HERE__}/db.lmi
+LexicMapIndex2	LexicMapIndex2	${__HERE__}/db2.lmi
+LexicMapIndexCombined	LexicMapIndexCombined	${__HERE__}/db.lmi,${__HERE__}/db2.lmi
\ No newline at end of file
Binary file test-data/lexicmap_query.fasta.gz has changed
Binary file test-data/lexicmap_query2.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query3.fasta	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,4 @@
+>Third Query
+CCATAGCTTATGCATACAAACCCTAAAAGTGTGCGGAAACCACATTCT
+GTGTAGCCACAATATGCAACGATTACAAAGCACAGTTTTTCTCACTAAATAAACCCGTTATAATGAGCTCATCTTCCAGT
+GTATGCATGCCAATACCTGTATGGAAAAATCCAATCAGTCACGTTGT
--- a/test-data/lexicmap_query_result.tsv	Thu Sep 18 11:27:52 2025 +0000
+++ b/test-data/lexicmap_query_result.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -1,2 +1,2 @@
 query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
-query1	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+FirstQuery	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
--- a/test-data/lexicmap_query_result2.tsv	Thu Sep 18 11:27:52 2025 +0000
+++ b/test-data/lexicmap_query_result2.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -1,3 +1,3 @@
 query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
-query1	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
-query2	320	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	320	100.000	0	1	320	6161	6480	+	7417	1.86e-168	578
+FirstQuery	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+SecondQuery	320	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	320	100.000	0	1	320	6161	6480	+	7417	1.86e-168	578
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query_result3.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,2 @@
+query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
+Third	175	1	dataset_e743a62d-cda6-41a8-b3f7-0e517bd8b59e	NC_028949.1	100.000	1	1	100.000	177	98.870	2	1	175	1631	1807	+	14243	6.86e-92	324
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query_result4.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,4 @@
+query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
+FirstQuery	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+SecondQuery	320	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	320	100.000	0	1	320	6161	6480	+	7417	1.86e-168	578
+Third	175	1	dataset_e743a62d-cda6-41a8-b3f7-0e517bd8b59e	NC_028949.1	100.000	1	1	100.000	177	98.870	2	1	175	1631	1807	+	14243	6.86e-92	324
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query_result5.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,3 @@
+query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
+FirstQuery	240	1	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+Third	175	1	dataset_e743a62d-cda6-41a8-b3f7-0e517bd8b59e	NC_028949.1	100.000	1	1	100.000	177	98.870	2	1	175	1631	1807	+	14243	6.86e-92	324
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/lexicmap_query_result6.tsv	Fri Sep 26 20:47:13 2025 +0000
@@ -0,0 +1,7 @@
+query	qlen	hits	sgenome	sseqid	qcovGnm	cls	hsp	qcovHSP	alenHSP	pident	gaps	qstart	qend	sstart	send	sstr	slen	evalue	bitscore
+FirstQuery	240	2	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+FirstQuery	240	2	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	240	100.000	0	1	240	2001	2240	+	7417	3.76e-125	434
+SecondQuery	320	2	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	320	100.000	0	1	320	6161	6480	+	7417	1.86e-168	578
+SecondQuery	320	2	dataset_963f49fd-cb75-4b60-909c-e63a9651ba65	NC_028981.1	100.000	1	1	100.000	320	100.000	0	1	320	6161	6480	+	7417	1.86e-168	578
+Third	175	2	dataset_e743a62d-cda6-41a8-b3f7-0e517bd8b59e	NC_028949.1	100.000	1	1	100.000	177	98.870	2	1	175	1631	1807	+	14243	6.86e-92	324
+Third	175	2	dataset_e743a62d-cda6-41a8-b3f7-0e517bd8b59e	NC_028949.1	100.000	1	1	100.000	177	98.870	2	1	175	1631	1807	+	14243	6.86e-92	324