view gecco.xml @ 12:e6597056a493 draft

"Release v0.9.1-alpha3"
author althonos
date Mon, 28 Mar 2022 15:41:47 +0000
parents d64fe390f3c9
children 56b924f62165
line wrap: on
line source

<?xml version='1.0' encoding='utf-8'?>
<tool id="gecco" name="GECCO" version="0.8.10" python_template_version="3.5">
    <!-- One-line summary of the tool; keep the text free of markup. -->
    <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
    <!-- Software dependency of the wrapper. The pinned package version is the
         same as the version attribute of the enclosing tool element; keep the
         two in sync when upgrading. -->
    <requirements>
        <requirement type="package" version="0.8.10">gecco</requirement>
    </requirements>
    <!-- Command whose output reports the version of the installed gecco executable. -->
    <version_command>gecco --version</version_command>
    <command detect_errors="aggressive"><![CDATA[

        ## The dataset is linked into the working directory under a fixed
        ## name, so that the result files moved at the end of this script
        ## can be found under the input_tempfile prefix. GenBank datasets
        ## get the gbk suffix, other datatypes keep their own extension.
        #if str($input.ext) == 'genbank':
            #set $file_extension = 'gbk'
        #else:
            #set $file_extension = $input.ext
        #end if
        ln -s '$input' input_tempfile.$file_extension &&

        gecco -vv run
        --format $input.ext
        --genome input_tempfile.$file_extension
        --postproc $postproc
        --edge-distance $edge_distance
        --force-clusters-tsv
        #if $mask:
            --mask
        #end if
        ## Optional numeric parameters are compared as strings: a plain
        ## truth test would silently drop an explicit value of 0, which
        ## both parameters allow, and fall back to the gecco defaults.
        #if str($cds) != '':
            --cds $cds
        #end if
        #if str($threshold) != '':
            --threshold $threshold
        #end if
        #if $antismash_sideload:
            --antismash-sideload
        #end if

        ## Hand the tables over to the declared Galaxy outputs. The sideload
        ## file is only moved when it was requested.
        && mv input_tempfile.features.tsv '$features'
        && mv input_tempfile.clusters.tsv '$clusters'
        #if $antismash_sideload:
        && mv input_tempfile.sideload.json '$sideload'
        #end if

    ]]></command>
    <inputs>
        <!-- Sequence dataset to scan; its datatype extension is forwarded to
             the format option of the command. -->
        <param name="input"
               type="data"
               format="genbank,fasta,embl"
               label="Sequence file in GenBank, EMBL or FASTA format"/>

        <!-- Flag added to the command line only when checked. -->
        <param argument="--mask"
               type="boolean"
               checked="false"
               label="Enable masking of regions with unknown nucleotides when finding ORFs"/>

        <!-- Optional numeric settings: left empty, they are not passed to the command. -->
        <param argument="--cds"
               type="integer"
               min="0"
               value=""
               optional="true"
               label="Minimum number of genes required for a cluster"/>
        <param argument="--threshold"
               type="float"
               min="0"
               max="1"
               value=""
               optional="true"
               label="Probability threshold for cluster detection"/>

        <!-- Always passed to the command; GECCO is the preselected choice. -->
        <param argument="--postproc"
               type="select"
               label="Post-processing method for gene cluster validation">
            <option value="antismash">antiSMASH</option>
            <option value="gecco" selected="true">GECCO</option>
        </param>

        <!-- Always passed to the command, with a default of 10. -->
        <param argument="--edge-distance"
               type="integer"
               min="0"
               value="10"
               label="Number of genes from the contig edges to filter out"/>

        <!-- When checked, the command also produces the sideload JSON output. -->
        <param argument="--antismash-sideload"
               type="boolean"
               checked="false"
               label="Generate an antiSMASH v6 sideload JSON file"/>
    </inputs>
    <outputs>
        <!-- One GenBank record per detected cluster. The pattern only accepts
             files named like {sequence}_cluster_{N}.gbk: a catch-all on the
             gbk suffix would also pick up the input_tempfile.gbk link that
             the command creates for GenBank inputs and report the input
             itself as a detected cluster. -->
        <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+_cluster_\d+)\.gbk" ext="genbank" visible="false" />
        </collection>
        <!-- Tables moved into place by the command once gecco has finished. -->
        <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
        <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
        <!-- Only created when the antismash_sideload parameter is enabled. -->
        <data name="sideload" format="json" label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)">
            <filter>antismash_sideload</filter>
        </data>
    </outputs>
    <tests>
        <!-- Default parameters: only the two summary tables are compared. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <output file="features.tsv" name="features"/>
            <output file="clusters.tsv" name="clusters"/>
        </test>

        <!-- No edge filtering: the tables and the GenBank record of the
             cluster are compared, the latter tolerating up to 4 differing lines. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <param name="edge_distance" value="0"/>
            <output file="features.tsv" name="features"/>
            <output file="clusters.tsv" name="clusters"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>

        <!-- Same as the previous case with the sideload option enabled, which
             adds the JSON output to the comparison. -->
        <test>
            <param name="input" value="BGC0001866.fna"/>
            <param name="edge_distance" value="0"/>
            <param name="antismash_sideload" value="True"/>
            <output file="features.tsv" name="features"/>
            <output file="clusters.tsv" name="clusters"/>
            <output file="sideload.json" name="sideload"/>
            <output_collection name="records" type="list">
                <element name="BGC0001866.1_cluster_1"
                         file="BGC0001866.1_cluster_1.gbk"
                         ftype="genbank"
                         compare="diff"
                         lines_diff="4"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[

Overview
--------

GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.

Input
-----

GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.

Output
------

GECCO will create the following files once done (using the same prefix as the input file):

- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences.
- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
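- ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
- ``sideload.json``: If the antiSMASH sideload option is enabled, a JSON file with the detected BGCs that can be loaded into antiSMASH v6.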

Contact
-------

If you have any questions about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the
`GitHub repository <https://github.com/zellerlab/gecco>`_. You can also directly contact `Martin Larralde via email <mailto:martin.larralde@embl.de>`_.
If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to open a pull request on the GitHub repository.

    ]]></help>
    <citations>
        <!-- DOI of the publication to cite when using this tool. -->
        <citation type="doi">10.1101/2021.05.03.442509</citation>
    </citations>
</tool>