view busco.xml @ 23:4e70d88adf2f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/busco/ commit 68696449a909c43d0e44bc9cfc8f8945e8a9dfce
author iuc
date Fri, 04 Apr 2025 11:18:42 +0000
parents e5c372c91e46
children
line wrap: on
line source

<tool id="busco" name="Busco" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>Assess genome assembly and annotation completeness</description>
    <!-- The @TOOL_VERSION@ / @VERSION_SUFFIX@ tokens and the augustus_species and citations macros used in this file are not defined here; presumably they come from macros.xml (verify there). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <xrefs>
        <xref type="bio.tools">busco</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">busco</requirement>
        <!-- When the augustus pin below is changed, update the augustus_species macro as well. -->
        <requirement type="package" version="3.5.0">augustus</requirement>
        <!-- tar unpacks a custom Augustus species model supplied from the history (see the command section). -->
        <requirement type="package" version="1.34">tar</requirement>
        <requirement type="package" version="1">fonts-conda-ecosystem</requirement>
        <!-- TODO: this pin might be removed in the future. For some reason conda installs an outdated sepp 4.4.0 together with the latest dendropy 5.x, and the two are incompatible. -->
        <requirement type="package" version="4.5.5">sepp</requirement>
    </requirements>
    <version_command>busco --version</version_command>
    <command><![CDATA[
## Tool tests cannot run with --offline (otherwise a lot of reference data would
## have to be stored at IUC), so in test mode the cached DB is copied into a mock
## dir inside the working dir, where the tool can write to.
##
## A more thorough test can be executed as follows:
## - set the `test` parameters to `""`
## - download the complete reference DB (~200G, final 105G) to tools/busco/test-data/test-db/busco_downloads
##   ```
##   busco --download_path tools/busco/test-data/test-db/busco_downloads/ --download all
##   find tools/busco/test-data/test-db/busco_downloads/lineages/ -mindepth 1 -maxdepth 1 ! -name '*_odb10*' -exec rm -rf {} \;
##   find tools/busco/test-data/test-db/busco_downloads/placement_files -mindepth 1 -maxdepth 1 ! -name '*_odb10*' -delete
##   find tools/busco/test-data/test-db/busco_downloads/lineages/ -name "*.faa.gz" -exec gunzip {} \;
##   ```
## - run the tests containerized (note: test-data is mounted ro in containerized tests)
##
## Alternatively (a bit weaker):
## - set the `test` parameters to `""` and comment out --offline
## - run the tests NOT containerized (this will download the data needed for the tests)
## - uncomment --offline and run the tests containerized
#if $test == 'true'
    mkdir mock_db_path/ &&
    cp -r '$cached_db.fields.path'/* mock_db_path/ &&
#end if

## Augustus runs on a private copy of its config inside the working dir:
## if AUGUSTUS_CONFIG_PATH is unset it is derived from the location of the busco
## executable, then the config is copied to augustus_dir/ and the variable is
## pointed at that copy.
#if $busco_mode.mode == 'geno' and $busco_mode.use_augustus.use_augustus_selector == 'augustus':

if [ -z "\$AUGUSTUS_CONFIG_PATH" ] ; then BUSCO_PATH=\$(dirname \$(which busco)) ; export AUGUSTUS_CONFIG_PATH=\$(realpath \${BUSCO_PATH}/../config) ; fi &&
cp -r "\$AUGUSTUS_CONFIG_PATH/" augustus_dir/ &&
export AUGUSTUS_CONFIG_PATH=`pwd`/augustus_dir/ &&

#if $busco_mode.use_augustus.aug_prediction.augustus_mode == 'history':
    ## An Augustus model from the history is a tar.gz archive: unpack it into the
    ## species dir of the copied config so that Augustus can find it
    mkdir -p 'augustus_dir/species/' &&
    tar -C 'augustus_dir/species/' -xzf '${busco_mode.use_augustus.aug_prediction.augustus_model}' &&
#end if
#end if

## busco is always given input.fa: gzipped input is decompressed, anything else is symlinked
#if $input.is_of_type("fasta.gz")
    gunzip -c '$input' > input.fa &&
#else
    ln -s '$input' input.fa &&
#end if

busco
--in 'input.fa'
--mode '${busco_mode.mode}'
--out busco_galaxy
--cpu \${GALAXY_SLOTS:-4}
--evalue ${adv.evalue}
--limit ${adv.limit}
--contig_break ${adv.contig_break}
## test mode: use the mock copy created above (without --offline);
## otherwise run offline against the cached database
#if $test == 'true'
--download_path mock_db_path
#else
--offline
--download_path '$cached_db.fields.path'
#end if

## the values of the auto_lineage select are the literal command line flags
#if $lineage.lineage_mode == "auto_detect":
    $lineage.auto_lineage
#else if $lineage.lineage_mode == "select_lineage":
    --lineage_dataset '${lineage.lineage_dataset}'
#end if

## the gene predictor can only be chosen in genome mode
#if $busco_mode.mode == 'geno':
    #if $busco_mode.use_augustus.use_augustus_selector == 'miniprot':
        --miniprot
    #elif $busco_mode.use_augustus.use_augustus_selector == 'augustus':
        ${busco_mode.use_augustus.long}
        --augustus
        #if $busco_mode.use_augustus.aug_prediction.augustus_mode == 'builtin':
            --augustus_species '${busco_mode.use_augustus.aug_prediction.augustus_species}'
        #else if $busco_mode.use_augustus.aug_prediction.augustus_mode == 'history':
            ## the model unpacked from the history is addressed by the species name "local"
            --augustus_species local
        #end if
    #else:
        --metaeuk
    #end if
#end if
    
## optional post-processing; every step is chained onto the busco call with &&
#if $outputs and 'image' in $outputs:
    && mkdir BUSCO_summaries
    && cp busco_galaxy/short_summary.*.txt BUSCO_summaries/
    && generate_plot.py -wd BUSCO_summaries -rt specific
#end if

#if $outputs and 'gff' in $outputs:
    && echo "\##gff-version 3" > busco_output.gff
    ## gff files can be absent: errors of cat are discarded and do not fail the job
    && (cat busco_galaxy/run_*/busco_sequences/*busco_sequences/*.gff >> busco_output.gff 2> /dev/null || true)
#end if
#if $outputs and 'faa' in $outputs:
    ## as for gff: absent files are tolerated
    && (cat busco_galaxy/run_*/busco_sequences/*busco_sequences/*.faa >> busco_output.faa 2> /dev/null || true)
#end if
#if $outputs and 'fna' in $outputs:
    ## as for gff: absent files are tolerated
    && (cat busco_galaxy/run_*/busco_sequences/*busco_sequences/*.fna >> busco_output.fna 2> /dev/null || true)
#end if
]]></command>
    <inputs>
        <!-- Hidden switch set by the tool tests: the command then uses a mock download dir instead of offline mode. -->
        <param name="test" type="hidden"/>
        <param type="data" name="input" format="fasta,fasta.gz" label="Sequences to analyse" help="Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set."/>
        <param name="cached_db" label="Cached database with lineage" type="select">
            <options from_data_table="busco_database">
                <validator message="No BUSCO database is available - please contact your Galaxy admin." type="no_options"/>
            </options>
        </param>
        <!-- Analysis mode; only genome mode offers a choice of gene predictor. -->
        <conditional name="busco_mode">
            <param argument="--mode" type="select" label="Mode">
                <option value="geno">Genome assemblies (DNA)</option>
                <option value="tran">Transcriptome assemblies (DNA)</option>
                <option value="prot">Annotated gene sets (protein)</option>
            </param>
            <when value="geno">
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" type="select" label="Select a gene predictor" help="In the case of a prokaryotic genome, Prodigal is the default gene predictor">
                        <option value="augustus">Augustus</option>
                        <option value="metaeuk">Metaeuk</option>
                        <option value="miniprot" selected="true">Miniprot</option>
                    </param>
                    <when value="metaeuk"/>
                    <when value="miniprot"/>
                    <when value="augustus">
                        <conditional name="aug_prediction">
                            <param name="augustus_mode" type="select" label="Augustus species model">
                                <option value="no" selected="true">Use the default species for selected lineage</option>
                                <option value="builtin">Use another predefined species model</option>
                                <option value="history">Use a custom species model</option>
                            </param>
                            <when value="no"/>
                            <when value="history">
                                <param name="augustus_model" type="data" format="augustus" label="Augustus model"/>
                            </when>
                            <when value="builtin">
                                <param name="augustus_species" type="select" label="Augustus species model" help="If model name is different than species name it is shown in parentheses.">
                                    <expand macro="augustus_species"/>
                                </param>
                            </when>
                        </conditional>
                        <param argument="--long" type="boolean" checked="false" truevalue="--long" falsevalue="" label="Optimization mode Augustus self-training" help="Adds considerably to run time, but can improve results for some non-model organisms"/>
                    </when>
                </conditional>
            </when>
            <when value="tran"/>
            <when value="prot"/>
        </conditional>
        <!-- Lineage: either one of the auto-lineage flags (the option values are passed to busco verbatim) or a dataset from the selected cached database. -->
        <conditional name="lineage">
            <param name="lineage_mode" type="select" label="Auto-detect or select lineage?" help="Let BUSCO decide the best lineage automatically, or select from known lineage">
                <option value="auto_detect">Auto-detect</option>
                <option value="select_lineage">Select lineage</option>
            </param>
            <when value="auto_detect">
                <param name="auto_lineage" type="select" label="auto-lineage group" help="Taxonomic group to run with auto-lineage.">
                    <option value="--auto-lineage">All taxonomic groups (--auto-lineage)</option>
                    <option value="--auto-lineage-prok">Prokaryotes (--auto-lineage-prok)</option>
                    <option value="--auto-lineage-euk">Eukaryotes (--auto-lineage-euk)</option>
                </param>
            </when>
            <when value="select_lineage">
                <param argument="--lineage_dataset" type="select" label="Lineage">
                    <options from_data_table="busco_database_options">
                        <filter type="param_value" column="2" ref="cached_db"/>
                    </options>
                </param>
            </when>
        </conditional>

        <!-- Optional outputs; the full table is always produced and is therefore not listed here. -->
        <param name="outputs" type="select" optional="true" multiple="true" label="Which outputs should be generated">
            <option value="short_summary" selected="true">short summary text</option>
            <option value="missing">list with missing IDs</option>
            <option value="image">summary image</option>
            <option value="gff">gff</option>
            <option value="faa">Protein sequences</option>
            <option value="fna">Nucleotide sequences</option>
        </param>

        <section name="adv" title="Advanced Options" expanded="False">
            <param argument="--evalue" type="float" value="0.001" min="0" max="1" label="E-value cutoff for BLAST searches."/>
            <!-- BUSCO only accepts a limit between 1 and 20, so out-of-range values are rejected in the form instead of failing the job. -->
            <param argument="--limit" type="integer" value="3" min="1" max="20" label="How many candidate regions to consider"/>
            <!-- A negative number of Ns is meaningless; reject it in the form. -->
            <param argument="--contig_break" type="integer" value="10" min="0" label="Number of contiguous Ns to signify a break between contigs"/>
        </section>
    </inputs>

    <outputs>
        <!-- Files taken straight from the BUSCO run directory. Only the full table is unconditional; the others follow the "outputs" select. -->
        <data name="busco_sum" format="txt" from_work_dir="busco_galaxy/run_*/short_summary.txt" label="${tool.name} on ${on_string}: short summary">
            <filter>outputs and 'short_summary' in outputs</filter>
        </data>
        <data name="busco_table" format="tabular" from_work_dir="busco_galaxy/run_*/full_table.tsv" label="${tool.name} on ${on_string}: full table"/>
        <data name="busco_missing" format="tabular" from_work_dir="busco_galaxy/run_*/missing_busco_list.tsv" label="${tool.name} on ${on_string}: missing buscos">
            <filter>outputs and 'missing' in outputs</filter>
        </data>
        <!-- Files assembled by the post-processing steps of the command. -->
        <data name="summary_image" format="png" from_work_dir="BUSCO_summaries/busco_figure.png" label="${tool.name} on ${on_string}: summary image">
            <filter>outputs and 'image' in outputs</filter>
        </data>
        <data name="busco_gff" format="gff3" from_work_dir="busco_output.gff" label="${tool.name} on ${on_string}: GFF">
            <filter>outputs and 'gff' in outputs</filter>
        </data>
        <data name="busco_faa" format="fasta" from_work_dir="busco_output.faa" label="${tool.name} on ${on_string}: Protein sequences">
            <filter>outputs and 'faa' in outputs</filter>
        </data>
        <data name="busco_fna" format="fasta" from_work_dir="busco_output.fna" label="${tool.name} on ${on_string}: Nucleotide sequences">
            <filter>outputs and 'fna' in outputs</filter>
        </data>
    </outputs>
    
    <tests>
        <!-- <test expect_num_outputs="6">
            <param name="test" value="true"/>
	        <param name="input" value="genome.fa.gz" ftype="fasta.gz"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="geno"/>
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" value="augustus"/>
                </conditional>
            </conditional>
            <param name="outputs" value="short_summary,missing,gff,faa,fna"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="Gene predictor used: augustus"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="genome_results/full_table" compare="diff" lines_diff="10"/>
            <output name="busco_gff">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_text text="##gff-version 3"/>
                </assert_contents>
            </output>
            <output name="busco_fna">
                <assert_contents>
                    <has_text text=">"/>
                </assert_contents>
            </output>
            <output name="busco_faa">
                <assert_contents>
                    <has_text text=">"/>
                </assert_contents>
            </output>
            <output name="busco_missing" file="genome_results/missing_buscos_list" compare="diff" lines_diff="10"/>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="5">
            <param name="test" value="true"/>
            <param name="input" value="proteome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="prot"/>
            </conditional>
            <param name="outputs" value="short_summary,missing,image,gff"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="BUSCO was run in mode: proteins"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="proteome_results/full_table" compare="diff" lines_diff="10"/>
            <output name="busco_missing" file="proteome_results/missing_buscos_list" compare="diff" lines_diff="10"/>
            <output name="summary_image" file="proteome_results/summary.png" compare="sim_size"/>
            <output name="busco_gff">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_text text="##gff-version 3"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="5">
            <param name="test" value="true"/>
            <param name="input" value="transcriptome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="tran"/>
            </conditional>
            <param name="outputs" value="short_summary,missing,image,gff"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="BUSCO was run in mode: euk_tran"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="transcriptome_results/full_table" compare="diff" lines_diff="6"/>
            <output name="busco_missing" file="transcriptome_results/missing_buscos_list" compare="diff" lines_diff="6"/>
            <output name="summary_image" file="transcriptome_results/summary.png" compare="sim_size"/>
            <output name="busco_gff" file="transcriptome_results/out.gff" compare="diff"/>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="3">
            <param name="test" value="true"/>
            <param name="input" value="genome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="geno"/>
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" value="augustus"/>
                    <conditional name="aug_prediction">
                        <param name="augustus_mode" value="builtin"/>
                        <param name="augustus_species" value="human"/>
                    </conditional>
                </conditional>
            </conditional>
            <param name="outputs" value="short_summary,gff"/>
            <output name="busco_sum" file="genome_results/short_summary" compare="diff" lines_diff="10">
                <assert_contents>
                    <has_text text="# Gene predictor used: augustus"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="genome_results/full_table" compare="diff" lines_diff="10"/>
            <output name="busco_gff">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_text text="##gff-version 3"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="3">
            <param name="test" value="true"/>
            <param name="input" value="genome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="geno"/>
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" value="augustus"/>
                    <conditional name="aug_prediction">
                        <param name="augustus_mode" value="history"/>
                        <param name="augustus_model" value="local.tar.gz" ftype="augustus"/>
                    </conditional>
                </conditional>
            </conditional>
            <param name="outputs" value="short_summary,missing"/>
            <output name="busco_sum" file="genome_results/short_summary" compare="diff" lines_diff="10">
                <assert_contents>
                    <has_text text="# Gene predictor used: augustus"/>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="genome_results/full_table" compare="diff" lines_diff="10"/>
            <output name="busco_missing" file="genome_results/missing_buscos_list" compare="diff" lines_diff="10"/>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="5">
            <param name="test" value="true"/>
            <param name="input" value="genome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="geno"/>
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" value="metaeuk"/>
                </conditional>
            </conditional>
            <param name="outputs" value="short_summary,missing,image,gff"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="# Gene predictor used: metaeuk"/>
                </assert_contents>
            </output>
            <output name="busco_table" file="genome_results_metaeuk/full_table">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="9647at6656"/>
                </assert_contents>
            </output>
            <output name="busco_missing" file="genome_results_metaeuk/missing_buscos_list">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="9647at6656"/>
                </assert_contents>
            </output>
            <output name="summary_image" file="genome_results_metaeuk/summary.png" compare="sim_size"/>
            <output name="busco_gff" file="genome_results_metaeuk/out.gff3" compare="diff" lines_diff="6"/>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test> -->
        <!-- Genome mode with the MetaEuk predictor and automatic lineage detection over all taxonomic groups.
             The asserts expect eukaryota_odb10 to be reported as the lineage dataset.
             The hidden "test" parameter makes the command use the mock download dir instead of offline mode. -->
        <test expect_num_outputs="5">
            <param name="test" value="true"/>
            <param name="input" value="genome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="auto_detect"/>
                <param name="auto_lineage" value="--auto-lineage"/>
            </conditional>
            <conditional name="busco_mode">
                <param name="mode" value="geno"/>
                <conditional name="use_augustus">
                    <param name="use_augustus_selector" value="metaeuk"/>
                </conditional>
            </conditional>
            <param name="outputs" value="short_summary,missing,image,gff"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="Gene predictor used: metaeuk"/>
                </assert_contents>
            </output>
            <output name="busco_table">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="The lineage dataset is: eukaryota_odb10"/>
                </assert_contents>
            </output>
            <output name="busco_missing">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="The lineage dataset is: eukaryota_odb10"/>
                </assert_contents>
            </output>
            <output name="summary_image" file="genome_results_metaeuk_auto/summary.png" compare="sim_size"/>
            <output name="busco_gff" file="genome_results_metaeuk_auto/out.gff" compare="diff"/>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test>
        <!-- <test expect_num_outputs="3">
            <param name="test" value="true"/>
            <param name="input" value="genome.fa"/>
            <conditional name="lineage">
                <param name="lineage_mode" value="select_lineage"/>
                <param name="lineage_dataset" value="arthropoda_odb10"/>
            </conditional>
            <param name="outputs" value="short_summary,gff"/>
            <output name="busco_sum">
                <assert_contents>
                    <has_text text="# Gene predictor used: miniprot"/>
                </assert_contents>
            </output>
            <output name="busco_gff" file="genome_results_miniprot/out.gff" compare="diff"/>
            <output name="busco_table" file="genome_results_miniprot/full_table">
                <assert_contents>
                    <has_text text="# BUSCO version is: @TOOL_VERSION@"/>
                    <has_text text="9647at6656"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text text="BUSCO analysis done"/>
            </assert_stdout>
        </test> -->
    </tests>
    <help><![CDATA[


BUSCO: Assessing genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs
--------------------------------------------------------------------------------------------------------------

Interpreting the results
^^^^^^^^^^^^^^^^^^^^^^^^

BUSCO_ attempts to provide a quantitative assessment of the completeness in terms of the expected gene content of a
genome assembly, transcriptome, or annotated gene set. The results are simplified into categories of Complete
and single-copy, Complete and duplicated, Fragmented, or Missing BUSCOs.

BUSCO completeness results make sense only in the context of the biology of your organism.
You have to understand whether missing or duplicated genes are of biological or technical origin.
For instance, a high level of duplication may be explained by a recent whole-genome duplication event
(biological) or a chimeric assembly of haplotypes (technical).
Transcriptomes and protein sets that are not filtered for isoforms will lead to a high proportion of duplicates.
Therefore you should filter them before a BUSCO analysis.
Finally, focusing on specific tissues or specific life stages and conditions in a transcriptomic experiment
is unlikely to produce a BUSCO-complete transcriptome. In this case, consistency across your samples
is what you will be aiming for.

For more information please refer to the BUSCO_ `user guide <https://busco.ezlab.org/busco_userguide.html#interpreting-the-results>`_.

.. _BUSCO: http://busco.ezlab.org/

    ]]></help>
    <expand macro="citations"/>
</tool>