<!-- Mercurial > repos > ufz > eukcc_single -->

<tool id="eukcc_single" name="EukCC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.0" license="MIT">
    <description>estimate completeness and contamination of a novel eukaryotic MAG</description>
    <!-- Shared definitions are imported from macros.xml. Presumably that file defines the version
         tokens used in the tool tag and the "requirements" / "version_command" macros expanded
         below (verify against macros.xml). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Cross-reference to the bio.tools entry "eukcc". -->
    <xrefs>
        <xref type="bio.tools">eukcc</xref>
    </xrefs>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Link the input under a file name derived from its element identifier:
        ## every character other than word characters, '-' and '.' is replaced by '_'.
        #import re
        #set $identifier= re.sub(r'[^\w\-.]', '_', $fasta.element_identifier)
        ## A leading '-' could be parsed as an option by ln and eukcc, so replace it as well.
        #set $identifier = re.sub(r'^-', '_', $identifier)
        ln -s '$fasta' '$identifier'  &&
        mkdir output/ &&
        eukcc single
            --out output/
            --db '$db.fields.path'
            --threads "\${GALAXY_SLOTS:-1}"
            ## --threads_epa THREADS_EPA
            ##     Number of threads to use for epa-ng, recommended: 1 (Default: 1)
            '$identifier'
            $sequence_type
            ## Skip the option when the field is empty or holds only spaces (the validator
            ## accepts both); otherwise a bare --taxids without any ID would be emitted.
            #if str($advanced.taxids).strip() != ""
                --taxids $advanced.taxids
            #end if
            #if $advanced.genomes
                --genomes
                #for $genome in $advanced.genomes
                    '$genome'
                #end for
            #end if
            --set_size $advanced.set_size
            #if $advanced.use_placement
                --use_placement '$advanced.use_placement'
            #end if
            --set_number_species $advanced.set_number_species
            --marker_prevalence $advanced.marker_prevalence
            --max_set_size $advanced.max_set_size
            $advanced.marker_gene_selection
            $advanced.use_ncbi_tree
            ## --gmes                Use GeneMark-ES instead of metaeuk (much slower) (default: False)
            ## --ignore_tree         Advanced option, mainly for debugging. Can ignore the tree if genomes are known via taxids for example
            $advanced.simple
            --clade $advanced.clade
            ## --rerun, -r           Rerun and remove any previously computed data in the target folder
            $advanced.no_dynamic_root
            $advanced.extra
        ## remove header and path to job working dir from output
        && tail -n +2 output/eukcc.csv | sed "s|\$(pwd)/\?||"  > '$eukcc'
        ## The marker table is only collected when extra outputs were requested;
        ## it is decompressed and its header line dropped.
        #if $advanced.extra
            && gzip -d -c output/scmg_marker_table.csv.gz | tail -n +2 > '$scmg_marker_table'
        #end if
    ]]></command>
    <inputs>
        <!-- Genome bin to assess. -->
        <param name="fasta"
               type="data"
               format="fasta"
               label="A single bin"
               help="Estimate quality of this bin"/>
        <!-- Reference database, selected from the "eukcc" data table. -->
        <param argument="--db"
               type="select"
               label="Reference data">
            <options from_data_table="eukcc">
                <validator type="no_options"
                           message="Built-in reference is not available. Contact the Galaxy Admin"/>
            </options>
        </param>
        <!-- The selected value is inserted verbatim into the command line; "Auto" adds nothing. -->
        <param name="sequence_type"
               type="select"
               label="Sequence type">
            <option value="">Auto</option>
            <option value="--DNA">DNA</option>
            <option value="--AA">AA</option>
        </param>
        <section name="advanced" title="Advanced options" expanded="false">
            <!-- Free text restricted to digits and spaces by the regex validator. -->
            <param argument="--taxids"
                   type="text"
                   label="Taxids to use as set starting point">
                <validator type="regex" message="Must be a space separated list of tax IDs">^[0-9 ]*$</validator>
            </param>
            <param argument="--genomes"
                   type="data"
                   format="fasta"
                   optional="true"
                   multiple="true"
                   label="Genome files to base a SCMG set upon"/>
            <param argument="--set_size"
                   type="integer"
                   min="0"
                   value="20"
                   label="Minimal number of marker genes to use"
                   help=""/>
            <param argument="--use_placement"
                   type="data"
                   format="csv"
                   optional="true"
                   label="Previous result"
                   help="to use exact same marker gene set"/>
            <param argument="--set_number_species"
                   type="integer"
                   min="1"
                   value="3"
                   label="Minimal number of species to define a set"
                   help=""/>
            <param argument="--marker_prevalence"
                   type="float"
                   min="0"
                   max="100"
                   value="95"
                   label="Percentage of species in which markers should be found"
                   help=""/>
            <param argument="--max_set_size"
                   type="integer"
                   min="0"
                   value="500"
                   label="Maximal number of marker genes used"
                   help="set to 0 to include all possible marker genes"/>
            <!-- The option value is the command line flag itself. -->
            <param name="marker_gene_selection"
                   type="select"
                   label="Marker gene selection method"
                   help="">
                <option value="--select_best_guess">Use best guess to select marker gene set</option>
                <option value="--select_species">Use species count to select best marker gene set</option>
            </param>
            <param argument="--use_ncbi_tree"
                   type="boolean"
                   truevalue="--use_ncbi_tree"
                   falsevalue=""
                   checked="false"
                   label="Use NCBI tree"
                   help="Instead of using the EukCC phylogenetic tree, rely on NCBI taxids"/>
            <param argument="--simple"
                   type="boolean"
                   truevalue="--simple"
                   falsevalue=""
                   checked="false"
                   label="Use global DB instead of clade specific DBs"
                   help="faster, not suitable for protozoa"/>
            <param argument="--clade"
                   type="select"
                   label="Define clade as base">
                <option value="base">Root</option>
                <option value="fungi">Fungi</option>
                <option value="protozoa">Protozoa</option>
                <option value="plants">Plants</option>
            </param>
            <!-- Inverted boolean: the checked state emits nothing, the unchecked state emits the
                 no_dynamic_root flag.
                 NOTE(review): the box is unchecked by default, so the flag is passed unless the
                 user ticks it; confirm that this default is intended. -->
            <param argument="--no_dynamic_root"
                   type="boolean"
                   truevalue=""
                   falsevalue="--no_dynamic_root"
                   checked="false"
                   label="re-root tree dynamically"
                   help="Disable for best set detection"/>
            <!-- Enabling this also activates the second output (SCMG marker table). -->
            <param argument="--extra"
                   type="boolean"
                   truevalue="--extra"
                   falsevalue=""
                   checked="false"
                   label="Produce extra outputs"/>
        </section>
    </inputs>
    <outputs>
        <!-- Main result table. The command strips the header row, so the column names
             are attached as metadata instead. -->
        <data name="eukcc" format="tabular">
            <actions>
                <action type="metadata"
                        name="column_names"
                        default="fasta,completeness,contamination,ncbi_lng"/>
            </actions>
        </data>
        <!-- Marker table; only present when the "extra" option of the advanced section is set. -->
        <data name="scmg_marker_table"
              format="tabular"
              label="${tool.name} on ${on_string}: SCMG marker table">
            <filter>advanced['extra']</filter>
            <actions>
                <action type="metadata"
                        name="column_names"
                        default="target,query,bitscore,evalue,expected_GA"/>
            </actions>
        </data>
    </outputs>
    <tests>
        <!-- reference data too large for test in CI. Download locally with test-data.sh to run tests.
             NOTE: the nested comment markers further down are backslash-escaped because an XML
             comment must not contain a double hyphen; unescape them when re-enabling the tests.
        <test expect_num_outputs="1">
            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
            <param name="db" value="1.2"/>
            <output name="eukcc">
                <assert_contents>
                    <has_text text="GCA_903798045.1"/>
                    <has_text text="41874"/> <!\-\- 41874 = Bathycoccus \-\->
                    <has_n_lines n="1"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <param name="fasta" value="10000_lines_GCA_903798045.1_TARA_EukCC_1_genomic.fna"/>
            <param name="db" value="1.2"/>
            <section name="advanced">
                <param name="extra" value="true"/>
            </section>
            <output name="eukcc">
                <assert_contents>
                    <has_text text="GCA_903798045.1"/>
                    <has_n_lines n="1"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output name="scmg_marker_table">
                <assert_contents>
                    <has_n_lines n="314"/>
                    <has_n_columns n="5"/>
                </assert_contents>
            </output>
        </test> -->
    </tests>
    <help><![CDATA[

.. class:: infomark

**What it does**

It consumes bins in FASTA format and outputs a table with estimated completeness, contamination and taxonomy lineage (given as dash separated list of TaxIDs).

You should not use EukCC on already published genomes if they have been used during training of the marker gene sets.
If you want to make sure, you can see all used accessions in the database file db_base/backbone/base_taxinfo.csv.

    ]]></help>
    <!-- DOI of the publication to cite when using this tool. -->
    <citations>
        <citation type="doi">10.1186/s13059-020-02155-4</citation>
    </citations>
</tool>