view gecco.xml @ 4:88dc16b4f583 draft

"Fix number of allowed different lines in `galaxy/gecco.xml` tests"
author althonos
date Sat, 11 Dec 2021 23:31:48 +0000
parents 359232b58f6a
children d64fe390f3c9
line wrap: on
line source

<?xml version='1.0' encoding='utf-8'?>
<tool id="gecco" name="GECCO" version="0.8.5" python_template_version="3.5">
    <!-- Short summary; it is phrased as a continuation of the tool name ("GECCO is a fast ..."). -->
    <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
    <!-- Package dependency, pinned to the same version number as the tool's own `version` attribute. -->
    <requirements>
        <requirement type="package" version="0.8.5">gecco</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying gecco executable. -->
    <version_command>gecco --version</version_command>
    <!--
        Shell command, rendered through Cheetah before execution.
        Steps: link the input dataset under a predictable name, call `gecco run`,
        then move the tables written under the `input_tempfile` prefix onto the
        declared outputs. The per-cluster *.gbk files are left in the working
        directory, where the `records` collection discovers them.
    -->
    <command detect_errors="aggressive"><![CDATA[

        ## Link the dataset as input_tempfile.EXT: GenBank datasets get the 'gbk'
        ## suffix, every other datatype keeps its own extension.
        #if str($input.ext) == 'genbank':
            #set $file_extension = 'gbk'
        #else:
            #set $file_extension = $input.ext
        #end if
        ln -s '$input' input_tempfile.$file_extension &&

        gecco -vv run
        --format '$input.ext'
        --genome input_tempfile.$file_extension
        --postproc '$postproc'
        --force-clusters-tsv
        ## Optional numeric parameters are compared as strings so that an explicit 0
        ## is still forwarded to gecco; a plain truthiness test would drop it.
        #if str($cds) not in ('', 'None'):
            --cds $cds
        #end if
        #if str($threshold) not in ('', 'None'):
            --threshold $threshold
        #end if
        #if $antismash_sideload:
            --antismash-sideload
        #end if

        ## Remove the input link once gecco is done: for GenBank input it is named
        ## input_tempfile.gbk and would otherwise match the .gbk discovery pattern
        ## of the 'records' collection.
        && rm -f input_tempfile.$file_extension
        && mv input_tempfile.features.tsv '$features'
        && mv input_tempfile.clusters.tsv '$clusters'
        #if $antismash_sideload:
        && mv input_tempfile.sideload.json '$sideload'
        #end if

    ]]></command>
    <!--
        Tool parameters. Only `input` declares an explicit name; the others declare
        `argument` and are referenced in the command template as $cds, $threshold,
        $postproc and $antismash_sideload.
    -->
    <inputs>
        <!-- Sequence dataset to scan. -->
        <param name="input"
               type="data"
               format="genbank,fasta,embl"
               label="Sequence file in GenBank, EMBL or FASTA format"/>
        <!-- Optional; when left empty the flag is not passed to gecco. -->
        <param argument="--cds"
               type="integer"
               min="0"
               value=""
               optional="true"
               label="Minimum number of genes required for a cluster"/>
        <!-- Optional probability in [0, 1]; when left empty the flag is not passed to gecco. -->
        <param argument="--threshold"
               type="float"
               min="0"
               max="1"
               value=""
               optional="true"
               label="Probability threshold for cluster detection"/>
        <!-- Always passed to gecco; GECCO is the preselected choice. -->
        <param argument="--postproc"
               type="select"
               label="Post-processing method for gene cluster validation">
            <option value="antismash">antiSMASH</option>
            <option value="gecco" selected="true">GECCO</option>
        </param>
        <!-- Off by default; also controls whether the `sideload` output exists. -->
        <param argument="--antismash-sideload"
               type="boolean"
               checked="false"
               label="Generate an antiSMASH v6 sideload JSON file"/>
    </inputs>
    <!-- Datasets produced by a run: one collection of GenBank records plus two or three tables. -->
    <outputs>
        <!-- One element per discovered *.gbk file; the part before ".gbk" becomes the element name. -->
        <collection name="records"
                    type="list"
                    label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gbk"
                               ext="genbank"
                               visible="false" />
        </collection>
        <!-- Feature table, moved into place by the command from input_tempfile.features.tsv. -->
        <data name="features"
              format="tabular"
              label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
        <!-- Cluster table, moved into place by the command from input_tempfile.clusters.tsv. -->
        <data name="clusters"
              format="tabular"
              label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
        <!-- Only created when the antismash_sideload parameter is enabled. -->
        <data name="sideload"
              format="json"
              label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)">
            <filter>antismash_sideload</filter>
        </data>
    </outputs>
    <!--
        Functional tests. Both run on the BGC0001866.fna test file and compare the
        tables exactly; the GenBank record may differ from the reference by up to
        4 lines (presumably volatile header lines; confirm against the test data).
    -->
    <tests>
        <!-- Default parameters: no sideload output expected. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <output name="features" file="features.tsv"/>
            <output name="clusters" file="clusters.tsv"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>
        <!-- Same input with the antiSMASH sideload enabled: the JSON output is checked as well. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <param name="antismash_sideload" value="True"/>
            <output name="features" file="features.tsv"/>
            <output name="clusters" file="clusters.tsv"/>
            <output name="sideload" file="sideload.json"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[

Overview
--------

GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.

Input
-----

GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.

Output
------

GECCO will create the following files once done (using the same prefix as the input file):

- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences.
- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
- ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.

Contact
-------

If you have any questions about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the
`GitHub repository <https://github.com/zellerlab/gecco>`_. You can also directly contact `Martin Larralde via email <mailto:martin.larralde@embl.de>`_.
If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to open a pull request on the GitHub repository.

    ]]></help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1101/2021.05.03.442509</citation>
    </citations>
</tool>