diff eukcc_single.xml @ 0:65d952c59d8b draft default tip

planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/eukcc commit ea26eabce05391af21e0919ac5309d23396960e3
author ufz
date Fri, 25 Jul 2025 10:54:22 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eukcc_single.xml	Fri Jul 25 10:54:22 2025 +0000
@@ -0,0 +1,156 @@
+<tool id="eukcc_single" name="EukCC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.0" license="MIT">
+    <description>estimate completeness and contamination of a novel eukaryotic MAG</description>
+    <!-- NOTE(review): macros.xml is presumably where the @TOOL_VERSION@ and @VERSION_SUFFIX@ tokens of the tool version are defined; confirm there. -->
+    <!-- The "requirements" and "version_command" macros expanded further down are expected to come from the same import. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">eukcc</xref>
+    </xrefs>
+    <expand macro="requirements"/>
+    <expand macro="version_command"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Symlink the input under a sanitized copy of its element identifier:
+        ## every character other than word characters, hyphen and dot becomes an underscore.
+        #import re
+        #set $identifier= re.sub(r'[^\w\-.]', '_', $fasta.element_identifier)
+        ln -s '$fasta' '$identifier'  &&
+        mkdir output/ &&
+        eukcc single
+            --out output/
+            --db '$db.fields.path'
+            --threads "\${GALAXY_SLOTS:-1}"
+            ## --threads_epa THREADS_EPA
+            ##     Number of threads to use for epa-ng, recommended: 1 (Default: 1)
+            '$identifier'
+            $sequence_type
+            ## Strip before testing: a whitespace-only value passes the taxids validator
+            ## but would otherwise emit a bare --taxids flag with no IDs after it.
+            #if str($advanced.taxids).strip() != ""
+                --taxids $advanced.taxids
+            #end if
+            #if $advanced.genomes
+                --genomes
+                #for $genome in $advanced.genomes
+                    '$genome'
+                #end for
+            #end if
+            --set_size $advanced.set_size
+            #if $advanced.use_placement
+                --use_placement '$advanced.use_placement'
+            #end if
+            --set_number_species $advanced.set_number_species
+            --marker_prevalence $advanced.marker_prevalence
+            --max_set_size $advanced.max_set_size
+            ## The following select/boolean parameters expand to the literal flag text or to nothing.
+            $advanced.marker_gene_selection
+            $advanced.use_ncbi_tree
+            ## --gmes                Use GeneMark-ES instead of metaeuk (much slower) (default: False)
+            ## --ignore_tree         Advanced option, mainly for debugging. Can ignore the tree if genomes are known via taxids for example
+            $advanced.simple
+            --clade $advanced.clade
+            ## --rerun, -r           Rerun and remove any previously computed data in the target folder
+            $advanced.no_dynamic_root
+            $advanced.extra
+        ## remove header and path to job working dir from output
+        && tail -n +2 output/eukcc.csv | sed "s|\$(pwd)/\?||"  > '$eukcc'
+        ## Collect the gzipped marker table (header line dropped) only when extra outputs were requested.
+        #if $advanced.extra
+            && gzip -d -c output/scmg_marker_table.csv.gz | tail -n +2 > '$scmg_marker_table'
+        #end if
+    ]]></command>
+    <inputs>
+        <param name="fasta" type="data" format="fasta" label="A single bin" help="Estimate quality of this bin"/>
+        <!-- Choices come from the "eukcc" data table; the command uses the path field of the selected entry. -->
+        <param argument="--db" type="select" label="Reference data">
+            <options from_data_table="eukcc">
+                <validator type="no_options" message="Built-in reference is not available. Contact the Galaxy Admin" />
+            </options>
+        </param>
+        <!-- Option values are inserted verbatim into the command line; "Auto" inserts nothing. -->
+        <param name="sequence_type" type="select" label="Sequence type">
+            <option value="">Auto</option>
+            <option value="--DNA">DNA</option>
+            <option value="--AA">AA</option>
+        </param>
+        <section name="advanced" title="Advanced options" expanded="false">
+            <!-- Digits and spaces only; the option is skipped by the command when the field is empty. -->
+            <param argument="--taxids" type="text" label="Taxids to use as set starting point">
+                <validator type="regex" message="Must be a space separated list of tax IDs">^[0-9 ]*$</validator>
+            </param>
+            <param argument="--genomes" type="data" format="fasta" optional="true" multiple="true" label="Genome files to base a SCMG set upon"/>
+            <param argument="--set_size" type="integer" min="0" value="20" label="Minimal number of marker genes to use" help="" />
+            <param argument="--use_placement" type="data" format="csv" optional="true" label="Previous result" help="to use exact same marker gene set" />
+            <param argument="--set_number_species" type="integer" min="1" value="3" label="Minimal number of species to define a set" help="" />
+            <param argument="--marker_prevalence" type="float" min="0" max="100" value="95" label="Percentage of species in which markers should be found" help="" />
+            <param argument="--max_set_size" type="integer" min="0" value="500" label="Maximal number of marker genes used" help="set to 0 to include all possible marker genes" />
+            <!-- The selected option value is the flag itself and is placed on the command line unchanged. -->
+            <param name="marker_gene_selection" type="select" label="Marker gene selection method" help="">
+                <option value="--select_best_guess">Use best guess to select marker gene set</option>
+                <option value="--select_species">Use species count to select best marker gene set</option>
+            </param>
+            <param argument="--use_ncbi_tree" type="boolean" truevalue="--use_ncbi_tree" falsevalue="" checked="false" label="Use NCBI tree" help="Instead of using the EukCC phylogenetic tree, rely on NCBI taxids" />
+            <param argument="--simple" type="boolean" truevalue="--simple" falsevalue="" checked="false" label="Use global DB instead of clade specific DBs" help="faster, not suitable for protozoa" />
+            <param argument="--clade" type="select" label="Define clade as base">
+                <option value="base">Root</option>
+                <option value="fungi">Fungi</option>
+                <option value="protozoa">Protozoa</option>
+                <option value="plants">Plants</option>
+            </param>
+            <!-- Inverted switch: the no_dynamic_root flag is emitted when the box is NOT checked. -->
+            <!-- NOTE(review): with checked="false" the flag is therefore passed by default; confirm this default is intended. -->
+            <param argument="--no_dynamic_root" type="boolean" truevalue="" falsevalue="--no_dynamic_root" checked="false" label="re-root tree dynamically" help="Disable for best set detection" />
+            <!-- Besides adding the flag, this switch gates the SCMG marker table output (see the filter in outputs). -->
+            <param argument="--extra" type="boolean" truevalue="--extra" falsevalue="" checked="false" label="Produce extra outputs" />
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Main result table. The command drops the header line, so column names are attached as metadata instead. -->
+        <data name="eukcc" format="tabular">
+            <actions>
+                <action type="metadata" name="column_names" default="fasta,completeness,contamination,ncbi_lng"/>
+            </actions>
+        </data>
+        <!-- Created only when the "extra" option of the advanced section is enabled; header is likewise replaced by metadata. -->
+        <data name="scmg_marker_table" format="tabular" label="${tool.name} on ${on_string}: SCMG marker table">
+            <filter>advanced['extra']</filter>
+            <actions>
+                <action type="metadata" name="column_names" default="target,query,bitscore,evalue,expected_GA"/>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Reference data is too large for tests in CI. Download it locally with test-data.sh to run the tests.
+        <test expect_num_outputs="1">
+            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
+            <param name="db" value="1.2"/>
+            <output name="eukcc">
+                <assert_contents>
+                    <has_text text="GCA_903798045.1"/>
+                    <has_text text="41874"/> <!\-\- 41874 = Bathycoccus \-\->
+                    <has_n_lines n="1"/>
+                    <has_n_columns n="4"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
+            <param name="db" value="1.2"/>
+            <section name="advanced">
+                <param name="extra" value="true"/>
+            </section>
+            <output name="eukcc">
+                <assert_contents>
+                    <has_text text="GCA_903798045.1"/>
+                    <has_n_lines n="1"/>
+                    <has_n_columns n="4"/>
+                </assert_contents>
+            </output>
+            <output name="scmg_marker_table">
+                <assert_contents>
+                    <has_n_lines n="314"/>
+                    <has_n_columns n="5"/>
+                </assert_contents>
+            </output>
+        </test> -->
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+It consumes bins in FASTA format and outputs a table with estimated completeness, contamination and taxonomic lineage (given as a dash-separated list of TaxIDs).
+
+You should not use EukCC on already published genomes if they have been used during training of the marker gene sets.
+If you want to make sure, you can see all used accessions in the database file db_base/backbone/base_taxinfo.csv.
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1186/s13059-020-02155-4</citation>
+    </citations>
+</tool>
\ No newline at end of file