view gecco.xml @ 22:a41a2bcb0b91 draft default tip

Release v0.9.8
author althonos
date Fri, 09 Jun 2023 11:26:07 +0000
parents 64b724dd8d04
children
line wrap: on
line source

<?xml version='1.0' encoding='utf-8'?>
<tool id="gecco" name="GECCO" version="0.9.6" python_template_version="3.5">
    <!-- Short description displayed next to the tool name. -->
    <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
    <!-- Group credited as the creator of this tool. -->
    <creator>
        <organization name="Zeller Team" url="https://www.embl.org/groups/zeller/"/>
    </creator>
    <!-- EDAM ontology annotations (topic and operation identifiers). -->
    <edam_topics>
        <edam_topic>topic_0080</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0415</edam_operation>
    </edam_operations>
    <!-- Package providing the gecco executable; the pinned version is the same
         as the version attribute on the enclosing tool element.
         NOTE(review): the page header mentions "Release v0.9.8" while this pin
         is 0.9.6; confirm which version is intended. -->
    <requirements>
        <requirement type="package" version="0.9.6">gecco</requirement>
    </requirements>
    <!-- Command whose output is recorded as the version of the underlying program. -->
    <version_command>gecco --version</version_command>
    <command detect_errors="aggressive"><![CDATA[

        ## Stage the input under a fixed base name so that the result files get
        ## the predictable "input_tempfile." prefix used by the mv calls below.
        ## The Galaxy "genbank" extension is mapped to "gbk" for the staged name.
        #if str($input.ext) == 'genbank':
            #set $file_extension = 'gbk'
        #else:
            #set $file_extension = $input.ext
        #end if
        ln -s '$input' input_tempfile.$file_extension &&

        gecco -vv run
        --format $input.ext
        --genome input_tempfile.$file_extension
        --postproc $postproc
        --force-tsv
        --jobs "\${GALAXY_SLOTS:-4}"
        ## Optional numeric parameters are compared against the empty string
        ## rather than tested for truthiness: an explicit 0 is a valid value
        ## (min is 0 for all three) and must still be forwarded to gecco.
        #if str($edge_distance) != '':
            --edge-distance $edge_distance
        #end if
        #if $mask:
            --mask
        #end if
        #if str($cds) != '':
            --cds $cds
        #end if
        #if str($threshold) != '':
            --threshold $threshold
        #end if
        #if $antismash_sideload:
            --antismash-sideload
        #end if
        ## Padding is on unless the user unchecks it.
        #unless $pad:
            --no-pad
        #end unless

        ## Move the fixed-name result tables onto the Galaxy output datasets.
        && mv input_tempfile.genes.tsv '$genes'
        && mv input_tempfile.features.tsv '$features'
        && mv input_tempfile.clusters.tsv '$clusters'
        ## The sideload file is only moved when it was requested above.
        #if $antismash_sideload:
        && mv input_tempfile.sideload.json '$sideload'
        #end if

    ]]></command>
    <inputs>
        <!-- Sequence file to analyse; the command template reads it as $input. -->
        <param name="input"
               type="data"
               format="genbank,fasta,embl"
               label="Sequence file in GenBank, EMBL or FASTA format"/>

        <!-- Boolean switches. -->
        <param argument="--mask"
               type="boolean"
               checked="false"
               label="Enable masking of regions with unknown nucleotides when finding ORFs"/>
        <param argument="--pad"
               type="boolean"
               checked="true"
               label="Enable padding of gene sequences smaller than the CRF window length"/>

        <!-- Optional numeric settings; they have no preset value. -->
        <param argument="--cds"
               type="integer"
               optional="true"
               value=""
               min="0"
               label="Minimum number of genes required for a cluster"/>
        <param argument="--threshold"
               type="float"
               optional="true"
               value=""
               min="0"
               max="1"
               label="Probability threshold for cluster detection"/>

        <!-- Post-processing method; GECCO is preselected. -->
        <param argument="--postproc"
               type="select"
               label="Post-processing method for gene cluster validation">
            <option value="antismash">antiSMASH</option>
            <option value="gecco" selected="true">GECCO</option>
        </param>

        <param argument="--edge-distance"
               type="integer"
               optional="true"
               value=""
               min="0"
               label="Number of genes from the contig edges to filter out"/>

        <!-- When checked, the additional sideload JSON output is produced. -->
        <param argument="--antismash-sideload"
               type="boolean"
               checked="false"
               label="Generate an antiSMASH v6 sideload JSON file"/>
    </inputs>
    <outputs>
        <!-- One GenBank record per predicted cluster, discovered by file name.
             The pattern only accepts names ending in "_cluster_N.gbk": when the
             input is GenBank, the command stages it as "input_tempfile.gbk" in the
             same directory, and a bare "any .gbk" pattern would also collect that
             staged input as if it were a detected cluster. -->
        <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+_cluster_\d+)\.gbk" ext="genbank" visible="false" />
        </collection>
        <!-- Tabular summaries; the command moves the result tables onto these datasets. -->
        <data name="genes" format="tabular" label="${tool.name} summary of detected genes on ${on_string} (TSV)"/>
        <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
        <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
        <!-- Only created when the antismash_sideload input is checked. -->
        <data name="sideload" format="json" label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)">
            <filter>antismash_sideload</filter>
        </data>
    </outputs>
    <tests>
        <!-- FASTA input with an explicit edge distance; checks the three tables. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <param name="edge_distance" value="10"/>
            <output name="features" file="features.tsv"/>
            <output name="genes" file="genes.tsv"/>
            <output name="clusters" file="clusters.tsv"/>
        </test>

        <!-- FASTA input with default settings; also checks the cluster record collection. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <output name="features" file="features.tsv"/>
            <output name="genes" file="genes.tsv"/>
            <output name="clusters" file="clusters.tsv"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>

        <!-- Same as above with the sideload option enabled; checks the JSON output too. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <param name="antismash_sideload" value="True"/>
            <output name="features" file="features.tsv"/>
            <output name="genes" file="genes.tsv"/>
            <output name="clusters" file="clusters.tsv"/>
            <output name="sideload" file="sideload.json"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[

Overview
--------

GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.

Input
-----

GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.

Output
------

GECCO will create the following files once done (using the same prefix as the input file):

- ``genes.tsv``: The genes file, containing the genes identified in the input sequences.
- ``features.tsv``: The features file, containing the protein domains identified in the input sequences.
- ``clusters.tsv``: A clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
- ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.

Contact
-------

If you have any questions about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the
`GitHub repository <https://github.com/zellerlab/gecco>`_. You can also directly contact `Martin Larralde via email <mailto:martin.larralde@embl.de>`_.
If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to open a pull request on the GitHub repository.

    ]]></help>
    <!-- Reference to cite when using this tool, given as a DOI. -->
    <citations>
        <citation type="doi">10.1101/2021.05.03.442509</citation>
    </citations>
</tool>