Mercurial > repos > ufz > eukcc_single
diff eukcc_single.xml @ 0:65d952c59d8b draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/eukcc commit ea26eabce05391af21e0919ac5309d23396960e3
| author | ufz |
|---|---|
| date | Fri, 25 Jul 2025 10:54:22 +0000 |
| parents | |
| children |
line wrap: on
line diff
<tool id="eukcc_single" name="EukCC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.0" license="MIT">
    <!--
        Galaxy wrapper around "eukcc single".

        Takes one FASTA bin plus a reference database chosen from the "eukcc"
        data table and writes a one-row tabular result (columns: fasta,
        completeness, contamination, ncbi_lng). When "Produce extra outputs"
        is enabled, the SCMG marker table is collected as a second dataset.

        The tokens TOOL_VERSION / VERSION_SUFFIX and the "requirements" and
        "version_command" macros are not defined in this file; they are
        expected to come from the imported macros.xml.
    -->
    <description>estimate completeness and contamination of a novel eukaryotic MAG</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <xrefs>
        <xref type="bio.tools">eukcc</xref>
    </xrefs>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
        #import re
        ## Derive a file-system safe name from the dataset identifier: every
        ## character that is not a word character, '-' or '.' is replaced by '_'.
        #set $identifier = re.sub(r'[^\w\-.]', '_', $fasta.element_identifier)
        ## Expose the input under that name in the working directory; the
        ## post-processing at the end strips the working-directory prefix again.
        ln -s '$fasta' '$identifier' &&
        mkdir output/ &&
        eukcc single
            --out output/
            --db '$db.fields.path'
            --threads "\${GALAXY_SLOTS:-1}"
            ## --threads_epa THREADS_EPA
            ##      Number of threads to use for epa-ng, recommended: 1 (Default: 1)
            '$identifier'
            $sequence_type
            ## The validator only admits digits and spaces, so the value is passed
            ## unquoted on purpose: the shell splits it into one argument per taxid.
            ## A whitespace-only value also passes the validator, hence the strip():
            ## otherwise a bare --taxids without any value would be emitted.
            #if str($advanced.taxids).strip() != ""
                --taxids $advanced.taxids
            #end if
            #if $advanced.genomes
                --genomes
                #for $genome in $advanced.genomes
                    '$genome'
                #end for
            #end if
            --set_size $advanced.set_size
            #if $advanced.use_placement
                --use_placement '$advanced.use_placement'
            #end if
            --set_number_species $advanced.set_number_species
            --marker_prevalence $advanced.marker_prevalence
            --max_set_size $advanced.max_set_size
            $advanced.marker_gene_selection
            $advanced.use_ncbi_tree
            ## --gmes Use GeneMark-ES instead of metaeuk (much slower) (default: False)
            ## --ignore_tree Advanced option, mainly for debugging. Can ignore the tree if genomes are known via taxids for example
            $advanced.simple
            --clade $advanced.clade
            ## --rerun, -r Rerun and remove any previously computed data in the target folder
            $advanced.no_dynamic_root
            $advanced.extra
        ## remove header and path to job working dir from output
        && tail -n +2 output/eukcc.csv | sed "s|\$(pwd)/\?||" > '$eukcc'
        ## the marker table only exists when --extra was requested; it is
        ## decompressed and its header line is dropped as well
        #if $advanced.extra
            && gzip -d -c output/scmg_marker_table.csv.gz | tail -n +2 > '$scmg_marker_table'
        #end if
    ]]></command>
    <inputs>
        <param name="fasta" type="data" format="fasta" label="A single bin" help="Estimate quality of this bin"/>
        <param argument="--db" type="select" label="Reference data">
            <options from_data_table="eukcc">
                <validator type="no_options" message="Built-in reference is not available. Contact the Galaxy Admin" />
            </options>
        </param>
        <!-- The option values are the literal command line flags; "Auto" passes no flag. -->
        <param name="sequence_type" type="select" label="Sequence type">
            <option value="">Auto</option>
            <option value="--DNA">DNA</option>
            <option value="--AA">AA</option>
        </param>
        <section name="advanced" title="Advanced options" expanded="false">
            <!-- Restricting the value to digits and spaces is what makes the unquoted use in the command safe. -->
            <param argument="--taxids" type="text" label="Taxids to use as set starting point">
                <validator type="regex" message="Must be a space separated list of tax IDs">^[0-9 ]*$</validator>
            </param>
            <param argument="--genomes" type="data" format="fasta" optional="true" multiple="true" label="Genome files to base a SCMG set upon"/>
            <param argument="--set_size" type="integer" min="0" value="20" label="Minimal number of marker genes to use" help="" />
            <param argument="--use_placement" type="data" format="csv" optional="true" label="Previous result" help="to use exact same marker gene set" />
            <param argument="--set_number_species" type="integer" min="1" value="3" label="Minimal number of species to define a set" help="" />
            <param argument="--marker_prevalence" type="float" min="0" max="100" value="95" label="Percentage of species in which markers should be found" help="" />
            <param argument="--max_set_size" type="integer" min="0" value="500" label="Maximal number of marker genes used" help="set to 0 to include all possible marker genes" />
            <!-- The option values are the literal command line flags. -->
            <param name="marker_gene_selection" type="select" label="Marker gene selection method" help="">
                <option value="--select_best_guess">Use best guess to select marker gene set</option>
                <option value="--select_species">Use species count to select best marker gene set</option>
            </param>
            <param argument="--use_ncbi_tree" type="boolean" truevalue="--use_ncbi_tree" falsevalue="" checked="false" label="Use NCBI tree" help="Instead of using the EukCC phylogenetic tree, rely on NCBI taxids" />
            <param argument="--simple" type="boolean" truevalue="--simple" falsevalue="" checked="false" label="Use global DB instead of clade specific DBs" help="faster, not suitable for protozoa" />
            <param argument="--clade" type="select" label="Define clade as base">
                <option value="base">Root</option>
                <option value="fungi">Fungi</option>
                <option value="protozoa">Protozoa</option>
                <option value="plants">Plants</option>
            </param>
            <!--
                Inverted boolean: ticking the box emits nothing, leaving it unticked
                emits the no_dynamic_root flag.
                NOTE(review): with checked="false" the flag is therefore passed by
                default, i.e. dynamic re-rooting is off unless the user enables it.
                Confirm that this default is intended.
            -->
            <param argument="--no_dynamic_root" type="boolean" truevalue="" falsevalue="--no_dynamic_root" checked="false" label="re-root tree dynamically" help="Disable for best set detection" />
            <param argument="--extra" type="boolean" truevalue="--extra" falsevalue="" checked="false" label="Produce extra outputs" />
        </section>
    </inputs>
    <outputs>
        <!-- The header line is stripped in the command, so the column names are supplied as metadata. -->
        <data name="eukcc" format="tabular">
            <actions>
                <action type="metadata" name="column_names" default="fasta,completeness,contamination,ncbi_lng"/>
            </actions>
        </data>
        <!-- Only created when "Produce extra outputs" is enabled. -->
        <data name="scmg_marker_table" format="tabular" label="${tool.name} on ${on_string}: SCMG marker table">
            <filter>advanced['extra']</filter>
            <actions>
                <action type="metadata" name="column_names" default="target,query,bitscore,evalue,expected_GA"/>
            </actions>
        </data>
    </outputs>
    <tests>
        <!-- reference data too large for test in CI. Download locally with test-data.sh to run tests.
        <test expect_num_outputs="1">
            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
            <param name="db" value="1.2"/>
            <output name="eukcc">
                <assert_contents>
                    <has_text text="GCA_903798045.1"/>
                    <has_text text="41874"/> <!\-\- 41874 = Bathycoccus \-\->
                    <has_n_lines n="1"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
            <param name="db" value="1.2"/>
            <section name="advanced">
                <param name="extra" value="true"/>
            </section>
            <output name="eukcc">
                <assert_contents>
                    <has_text text="GCA_903798045.1"/>
                    <has_n_lines n="1"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output name="scmg_marker_table">
                <assert_contents>
                    <has_n_lines n="314"/>
                    <has_n_columns n="5"/>
                </assert_contents>
            </output>
        </test> -->
    </tests>
    <help><![CDATA[

.. class:: infomark

**What it does**

It consumes bins in FASTA format and outputs a table with estimated completeness, contamination and taxonomy lineage (given as dash separated list of TaxIDs).

You should not use EukCC on already published genomes, if they have been used during training of the marker gene sets.
If you want to make sure, you can see all used accessions in the database file db_base/backbone/base_taxinfo.csv.

    ]]></help>
    <citations>
        <citation type="doi">10.1186/s13059-020-02155-4</citation>
    </citations>
</tool>
