view bakta.xml @ 6:92eee5f31117 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit fe9aabe924f278694d20b82169edacdb3e91eb49
author iuc
date Sun, 11 Feb 2024 00:56:12 +0000
parents 728dacaf08a9
children ba6990f72184
line wrap: on
line source

<tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short summary of what the tool does. -->
    <description>Genome annotation via alignment-free sequence identification</description>
    <!-- The macros expanded below are imported from macro.xml (its contents are not part of this file). -->
    <macros>
        <import>macro.xml</import>
    </macros>
    <!-- NOTE(review): judging by their names these macros supply EDAM terms, cross references,
         requirements and the version command; confirm against macro.xml. -->
    <expand macro="edam"/>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>

    <command detect_errors="aggressive"><![CDATA[

        #*======================================
            The bakta call below ends in a pipe
            into tee. Without pipefail the job
            exit status would be the one of tee
            and a failing bakta run would pass.
        ======================================*#
        set -o pipefail &&

        #*======================================
            Stage the selected databases below
            a local directory using symlinks;
            the AMRFinderPlus database is linked
            as amrfinderplus-db/latest
        ======================================*#
        mkdir -p ./database_path/amrfinderplus-db &&
        ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
        ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
        bakta --verbose

        #*======================================
                    CPU option
        ======================================*#
        --threads \${GALAXY_SLOTS:-1}
        #*======================================
                    Bakta database
        ======================================*#
        --db './database_path'
        --output 'bakta_output'
        #*======================================
            Minimum contig length: user value
            when given, otherwise 200 in
            compliant mode and 1 by default
        ======================================*#
        #if $input_option.min_contig_length
            --min-contig-length $input_option.min_contig_length
        #else if $annotation.compliant
            --min-contig-length 200
        #else
            --min-contig-length 1
        #end if
        --prefix bakta_output
        #*======================================
                  Organism options
              genus/species/strain/plasmid
        ======================================*#
        #if $organism.genus
            --genus '$organism.genus'
        #end if
        #if $organism.species
            --species '$organism.species'
        #end if
        #if $organism.strain
            --strain '$organism.strain'
        #end if
        #if $organism.plasmid
            --plasmid '$organism.plasmid'
        #end if
        #*======================================
                    Annotation options
            gram type, prodigal/protein file
        ======================================*#
        $annotation.complete
        #if $annotation.prodigal
            --prodigal-tf '$annotation.prodigal'
        #end if
        #if $annotation.translation_table
            --translation-table '$annotation.translation_table'
        #end if
        #*======================================
            The gram type is not exposed as a
            tool parameter; it is always passed
            as unknown
        ======================================*#
        --gram '?'
        $annotation.keep_contig_headers
        #if $annotation.replicons
            --replicons '$annotation.replicons'
        #end if
        $annotation.compliant
        #if $annotation.proteins
            --proteins '$annotation.proteins'
        #end if
        #*======================================
            Metagenome mode: the boolean input
            was declared but never emitted
            before, so the option had no effect.
            NOTE(review): expected files of the
            metagenome test may need to be
            regenerated with the flag applied.
        ======================================*#
        $annotation.meta
        #if $annotation.regions
            --regions '$annotation.regions'
        #end if
        #*======================================
                    Workflow OPTIONS
         skip some step of the bakta analysis
        ======================================*#

        #echo " ".join($workflow.skip_analysis)

        #*======================================
                    Genome file
        ======================================*#
        '$input_option.input_file'
        #*======================================
                    LOG file
        ======================================*#
        | tee '$logfile'
        ]]></command>
    <inputs>
        <!-- Databases and input genome -->
        <section name="input_option" title="Input/Output options" expanded="true">
            <!-- Bakta database entries come from the bakta_database data table, filtered on the bakta_version column. -->
            <param name="bakta_db_select" type="select" label="The bakta database">
                <options from_data_table="bakta_database">
                    <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/>
                    <validator message="No bakta database is available" type="no_options"/>
                </options>
            </param>
            <!-- AMRFinderPlus database entries come from the amrfinderplus_database data table. -->
            <param name="amrfinder_db_select" type="select" label="The amrfinderplus database">
                <options from_data_table="amrfinderplus_database">
                    <validator message="No amrfinderplus database is available" type="no_options"/>
                </options>
            </param>

            <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
            <!-- When left empty the command template falls back to 1, or 200 in compliant mode. -->
            <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/>
        </section>
        <!-- Organism information; every field is optional free text restricted by a regex validator -->
        <section name="organism" title="Optional organism options" expanded="false">
            <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia">
                <validator type="regex">^[a-zA-Z]+$</validator>
            </param>
            <param argument="--species" type="text" optional="true" label="Specify species name" help="ex. 'coli O157:H7'">
                <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator>
            </param>
            <!-- Strain names commonly contain digits and hyphens (e.g. K-12, PAO1), so the same
                 character set as for species and plasmid is accepted instead of letters only. -->
            <param argument="--strain" type="text" optional="true" label="Specify strain name" help="ex. Sakai">
                <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator>
            </param>
            <param argument="--plasmid" type="text" optional="true" label="Specify plasmid name" help="ex. pOSAK1">
                <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator>
            </param>
        </section>
        <!-- Annotation options -->
        <section name="annotation" title="Optional annotation">
            <param argument="--complete" type="boolean" truevalue="--complete" falsevalue="" label="Complete replicons" help="All sequences are complete replicons (chromosome/plasmid[s])"/>
            <!-- The parameter keeps the name "prodigal" used by the command template and the tests;
                 the argument attribute now shows the flag that is really passed to bakta. -->
            <param name="prodigal" argument="--prodigal-tf" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/>
            <param name="translation_table" type="select" optional="true" label="Translation table" help="Default is the bacterial table 11">
                <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
                <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
            </param>
            <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/>
            <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/>
            <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDJB compliance"/>
            <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
            <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" label="Metagenome mode" help="Run in metagenome mode. This only affects CDS prediction"/>
            <param argument="--regions" type="data" format="gff,genbank" optional="true" label="Pre-annotated regions" help="Regions only, no functional annotations."/>
        </section>
        <!-- Analysis steps that can be skipped; each option value is the literal bakta flag joined into the command line -->
        <section name="workflow" title="Workflow option to skip steps">
            <param name="skip_analysis" type="select" display="checkboxes" multiple="true" label="Select steps to skip">
                <option value="--skip-trna">Skip tRNA detection and annotation</option>
                <option value="--skip-tmrna">Skip tmRNA detection and annotation</option>
                <option value="--skip-rrna">Skip rRNA detection and annotation</option>
                <option value="--skip-ncrna">Skip ncRNA detection and annotation</option>
                <option value="--skip-ncrna-region">Skip ncRNA region detection and annotation</option>
                <option value="--skip-crispr">Skip CRISPR array detection and annotation</option>
                <option value="--skip-cds">Skip CDS detection and annotation</option>
                <option value="--skip-pseudo">Skip pseudogene detection and annotation</option>
                <option value="--skip-sorf">Skip sORF detection and annotation</option>
                <option value="--skip-gap">Skip gap detection and annotation</option>
                <option value="--skip-ori">Skip oriC/oriT detection and annotation</option>
                <option value="--skip-plot">Skip generation of circular genome plots</option>
            </param>
        </section>
        <!-- Which result files become history datasets; the values are matched by the output filters -->
        <section name="output_files" title="Selection of the output files">
            <param name="output_selection" type="select" display="checkboxes" multiple="true" label="Output files selection">
                <option value="file_tsv" selected="true">Annotation file in TSV</option>
                <option value="file_gff3" selected="true">Annotation and sequence in GFF3</option>
                <option value="file_gbff" selected="false">Annotations and sequences in GenBank format</option>
                <option value="file_embl" selected="false">Annotations and sequences in EMBL format</option>
                <option value="file_fna" selected="false">Replicon/contig DNA sequences as FASTA</option>
                <option value="file_ffn" selected="true">Feature nucleotide sequences as FASTA</option>
                <option value="file_faa" selected="false">CDS/sORF amino acid sequences as FASTA</option>
                <option value="hypo_tsv" selected="false">Hypothetical protein CDS in TSV</option>
                <option value="hypo_fa" selected="false">Hypothetical protein CDS amino sequences as FASTA</option>
                <option value="sum_txt" selected="false">Summary as TXT</option>
                <option value="file_json" selected="false">Information on each annotated feature as JSON</option>
                <option value="file_plot" selected="true">Plot of the annotation result as SVG</option>
                <option value="log_txt" selected="false">Log file as TXT</option>
            </param>
        </section>

    </inputs>
    <!-- Each dataset is collected from the bakta_output working directory and is only created
         when its key is ticked in output_files/output_selection. -->
    <outputs>
        <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary">
            <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences">
            <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter>
        </data>
        <!-- GenBank flat file: declared as genbank rather than tabular so that the dataset carries the matching datatype. -->
        <data name="annotation_gbff" format="genbank" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff">
            <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter>
        </data>
        <!-- EMBL flat file: declared as embl rather than tabular for the same reason. -->
        <data name="annotation_embl" format="embl" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl">
            <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences">
            <filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences">
            <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences">
            <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter>
        </data>
        <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary">
            <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter>
        </data>
        <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences">
            <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter>
        </data>
        <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary">
            <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable">
            <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter>
        </data>
        <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
            <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter>
        </data>
        <!-- The log has no from_work_dir: the command template writes it directly through tee. -->
        <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file">
            <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter>
        </data>
    </outputs>
    <tests>
        <!-- TEST_1: databases plus input genome, every output file selected -->
        <test expect_num_outputs="13">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                <param name="min_contig_length" value="250"/>
            </section>
            <section name="output_files">
                <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
            </section>
            <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/>
            <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/>
            <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/>
            <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/>
            <output name="annotation_fna" value="TEST_1/TEST_1.fna"/>
            <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/>
            <output name="annotation_faa" value="TEST_1/TEST_1.faa"/>
            <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/>
            <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/>
            <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/>
            <output name="annotation_plot" value="TEST_1/TEST_1_plot.svg" ftype="svg" compare="sim_size"/>
            <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/>
            <output name="logfile" ftype="txt">
                <expand macro="assert_content_test"/>
            </output>
        </test>
        <!-- TEST_2: organism information, original contig headers kept, two steps skipped -->
        <test expect_num_outputs="4">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                <param name="min_contig_length" value="250"/>
            </section>
            <section name="organism">
                <param name="genus" value="Escherichia"/>
                <param name="species" value="coli O157:H7"/>
                <param name="strain" value="Sakai"/>
                <param name="plasmid" value="pOSAK1"/>
            </section>
            <section name="annotation">
                <param name="keep_contig_headers" value="true"/>
            </section>
            <section name="workflow">
                <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
            </section>
            <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4">
                <assert_contents>
                    <has_text_matching expression="IHHALP_00005"/>
                </assert_contents>
            </output>
            <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4">
                <assert_contents>
                    <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
                </assert_contents>
            </output>
            <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/>
            <output name="annotation_plot" value="TEST_2/TEST_2_plot.svg" ftype="svg" compare="sim_size"/>
        </test>
        <!-- TEST_3: skip steps with the default output selection -->
        <test expect_num_outputs="4">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                <param name="min_contig_length" value="350"/>
            </section>
            <section name="workflow">
                <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
            </section>
            <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/>
            <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/>
            <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/>
        </test>
        <!-- TEST_4: annotation options (complete, prodigal file, table 4, replicons, compliant, proteins) -->
        <test expect_num_outputs="4">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
            </section>
            <section name="annotation">
                <param name="complete" value="true"/>
                <param name="prodigal" value="prodigal.tf"/>
                <param name="translation_table" value="4"/>
                <param name="replicons" value="replicons.tsv" ftype="tabular"/>
                <param name="compliant" value="true"/>
                <param name="proteins" value="user-proteins.faa" ftype="fasta"/>
            </section>
            <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/>
            <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/>
            <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/>
            <output name="annotation_plot" value="TEST_4/TEST_4_plot.svg" ftype="svg" compare="sim_size"/>
        </test>
        <!-- TEST_5: skip steps and keep only the log file and the summary -->
        <test expect_num_outputs="2">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
            </section>
            <section name="annotation">
                <param name="complete" value="true"/>
                <param name="translation_table" value="4"/>
            </section>
            <section name="workflow">
                <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
            </section>
            <section name="output_files">
                <param name="output_selection" value="log_txt,sum_txt"/>
            </section>
            <output name="logfile" ftype="txt">
                <expand macro="assert_content_test"/>
            </output>
            <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/>
        </test>
        <!-- TEST_6: metagenome option, every output file selected -->
        <test expect_num_outputs="13">
            <section name="input_option">
                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
            </section>
            <section name="annotation">
                <param name="meta" value="true"/>
            </section>
            <section name="output_files">
                <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
            </section>
            <output name="annotation_tsv" value="TEST_6/TEST_6.tsv" lines_diff="2"/>
            <output name="annotation_gff3" value="TEST_6/TEST_6.gff3" lines_diff="2"/>
            <output name="annotation_gbff" value="TEST_6/TEST_6.gbff" lines_diff="8"/>
            <output name="annotation_embl" value="TEST_6/TEST_6.embl" lines_diff="6"/>
            <output name="annotation_fna" value="TEST_6/TEST_6.fna"/>
            <output name="annotation_ffn" value="TEST_6/TEST_6.ffn"/>
            <output name="annotation_faa" value="TEST_6/TEST_6.faa"/>
            <output name="hypotheticals_tsv" value="TEST_6/TEST_6.hypotheticals.tsv" lines_diff="4"/>
            <output name="hypotheticals_faa" value="TEST_6/TEST_6.hypotheticals.faa"/>
            <output name="summary_txt" value="TEST_6/TEST_6.txt" lines_diff="4"/>
            <output name="annotation_plot" value="TEST_6/TEST_6_plot.svg" ftype="svg" compare="sim_size"/>
            <output name="annotation_json" value="TEST_6/TEST_6.json" lines_diff="6"/>
            <output name="logfile" ftype="txt">
                <expand macro="assert_content_test"/>
            </output>
        </test>
    </tests>
    <help><![CDATA[**What it does**
          Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.

        *Comprehensive & taxonomy-independent database*
          Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.

        *Protein sequence identification*
          Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
          allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
          This is achieved via an alignment-free sequence identification (AFSI) approach
          using full-length MD5 protein sequence hash digests.
        *Small proteins/short open reading frames*
          Bakta detects and annotates small proteins/short open reading frames (sORF).

        *Expert annotation systems*
          To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
          Bakta includes & merges different expert annotation systems.
          Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
          as well as a generalized protein sequence expert system with distinct
          coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules.

        *Comprehensive workflow*
          Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
          and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.

        *GFF3 & INSDC conform annotations*
          Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
          (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
          respectively for representative genomes of all ESKAPE species).

        *Bacteria & plasmids*
          Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.

        **Input options**
          1. Choose a genome or assembly in fasta format to annotate with Bakta
          2. Choose a version of the Bakta database
          3. Choose a version of the AMRFinderPlus database

        **Organism options**
        You can provide information about the analysed genome as text input for:
        - genus
        - species
        - strain
        - plasmid

        **Annotation options**
        1. You can specify if all sequences (chromosome or plasmids) are complete or not
        2. You can add your own prodigal training file for CDS prediction
        3. The translation table could be modified, default is the 11th for bacteria
        4. The Gram type (-/+ or unknown) cannot be selected in this wrapper; it is always passed to Bakta as unknown ('?')
        5. You can keep the name of contig present in the input file
        6. You can specify your own replicon table as a TSV/CSV file
        7. The compliance option produces annotation files that are ready for submission to public databases
        such as GenBank, ENA or DDBJ
        8. You can specify a protein sequence file for annotation in GenBank or fasta formats
        Using the Fasta format, each reference sequence can be provided in a short or long format:

        # short:
        >id gene~~~product~~~dbxrefs
        MAQ...

        # long:
        >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
        MAQ...

        **Skip steps**
        Some steps can be skipped:
        - skip-trna           Skip tRNA detection & annotation
        - skip-tmrna          Skip tmRNA detection & annotation
        - skip-rrna           Skip rRNA detection & annotation
        - skip-ncrna          Skip ncRNA detection & annotation
        - skip-ncrna-region   Skip ncRNA region detection & annotation
        - skip-crispr         Skip CRISPR array detection & annotation
        - skip-cds            Skip CDS detection & annotation
        - skip-pseudo         Skip pseudogene detection & annotation
        - skip-sorf           Skip sORF detection & annotation
        - skip-gap            Skip gap detection & annotation
        - skip-ori            Skip oriC/oriT detection & annotation
        - skip-plot           Skip generation of circular genome plots

        **Output options**
        Bakta produces a number of output files; you can select which types of file you want:
        - Summary of the annotation
        - Annotated files
        - Sequence files for nucleotide and/or amino acid
    ]]></help>
    <!-- Citation entries are supplied by the "citations" macro; NOTE(review): presumably defined in the imported macro.xml, confirm there. -->
    <expand macro="citations"/>
</tool>