bakta: bakta.xml comparison

comparison bakta.xml @ 7:ba6990f72184 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit e277883fca66013904bae930f04e7f3be5fcb1a2

author	iuc
date	Wed, 05 Jun 2024 14:22:02 +0000
parents	92eee5f31117
children

comparison

equal deleted inserted replaced

-:92eee5f31117
+:ba6990f72184
 <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
 <description>
-Genome annotation via alignment-free sequence identification
+Rapid and standardized annotation of bacterial genomes, MAGs and plasmids
 </description>
 <macros>
 <import>macro.xml</import>
 </macros>
-<expand macro='edam'/>
+<expand macro="xrefs"/>
-<expand macro='xrefs'/>
 <expand macro="requirements"/>
 <expand macro="version_command"/>
 <command detect_errors="aggressive"><![CDATA[
+mkdir -p ./database_path/amrfinderplus-db &&
-mkdir -p ./database_path/amrfinderplus-db &&
+ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
-ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
+ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
-ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
-bakta --verbose
+bakta
+--verbose
 #*======================================
 CPU option
 ======================================*#
 --threads \${GALAXY_SLOTS:-1}
 #*======================================
 Bakta database
 ======================================*#
 --db './database_path'
 --output 'bakta_output'
 #if $input_option.min_contig_length
 --min-contig-length $input_option.min_contig_length
 #else if $annotation.compliant
 --min-contig-length 200
 #else
 --min-contig-length 1
 #end if
 --prefix bakta_output
 #*======================================
 Organism options
 genus/species/strain/plasmid
 ======================================*#
 #if $organism.genus
 --genus '$organism.genus'
 #end if
 #if $organism.species
 --species '$organism.species'
 #end if
 #if $organism.strain
 --strain '$organism.strain'
 #end if
 #if $organism.plasmid
 --plasmid '$organism.plasmid'
 #end if
 #*======================================
 Annotation options
 gram type, prodigal/protein file
 ======================================*#
 $annotation.complete
 #if $annotation.prodigal
 --prodigal-tf '$annotation.prodigal'
 #end if
 #if $annotation.translation_table
 --translation-table '$annotation.translation_table'
 #end if
 --gram '?'
 $annotation.keep_contig_headers
 #if $annotation.replicons
 --replicons '$annotation.replicons'
 #end if
 $annotation.compliant
 #if $annotation.proteins
 --proteins '$annotation.proteins'
 #end if
 #if $annotation.regions
 --regions '$annotation.regions'
 #end if
 #*======================================
 Workflow OPTIONS
 skip some step of the bakta analysis
 ======================================*#
 #echo " ".join($workflow.skip_analysis)
 #*======================================
 Genome file
 ======================================*#
 '$input_option.input_file'
 #*======================================
 LOG file
 ======================================*#
 | tee '$logfile'
 ]]></command>
 <inputs>
 <!-- DB and file INPUT -->
 <section name="input_option" title="Input/Output options" expanded="true">
-<param name="bakta_db_select" type="select" label="The bakta database">
+<param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
+<param argument="--min-contig-length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode)"/>
+<param name="bakta_db_select" type="select" label="Bakta database">
 <options from_data_table="bakta_database">
 <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/>
 <validator message="No bakta database is available" type="no_options"/>
 </options>
 </param>
-<param name="amrfinder_db_select" type="select" label="The amrfinderplus database">
+<param name="amrfinder_db_select" type="select" optional="true" label="AMRFinderPlus database" help="The selection of this database is not needed if Bakta database version is higher 5.0">
-<options from_data_table="amrfinderplus_database">
+<options from_data_table="amrfinderplus_versioned_database">
-<validator message="No amrfinderplus database is available" type="no_options"/>
+<filter type="static_value" value="3.12" column="db_version"/>
+<validator message="No AMRFinderPlus database is available" type="no_options"/>
 </options>
 </param>
-<param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
-<param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/>
 </section>
 <!-- Organism INFORMATION OPTIONS -->
 <section name="organism" title="Optional organism options" expanded="false">
 <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia">
 <validator type="regex">^[a-zA-Z]+$</validator>
 <param argument="--prodigal" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/>
 <param name="translation_table"  type="select" optional="true" label="Translation table" help="Default is the bacterial table 11">
 <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
 <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
 </param>
-<param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/>
+<param argument="--keep-contig-headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header"/>
 <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/>
 <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDJB compliance"/>
 <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" label="Metagenome mode" help="Run in metagenome mode. This only affects CDS prediction"/>
 <param argument="--regions" type="data" format="gff,genbank" optional="true" label="Pre-annotated regions" help="Regions only, no functional annotations."/>
 <option value="file_json" selected="false">Information on each annotated feature as JSON</option>
 <option value="file_plot" selected="true">Plot of the annotation result as SVG</option>
 <option value="log_txt" selected="false">Log file as TXT</option>
 </param>
 </section>
 </inputs>
 <outputs>
-<data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary">
+<data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: Summary">
 <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences">
+<data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation and sequences (GFF3)">
 <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff">
+<data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: Annotations and sequences (GenBank format)">
 <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl">
+<data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: Annotations and sequences (EMBL format)">
 <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences">
+<data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Replicon/contig DNA sequences">
 <filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences">
+<data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Feature nucleotide sequences">
 <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences">
+<data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: CDS/sORF amino acid sequences">
 <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter>
 </data>
-<data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary">
+<data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: Hypothetical protein CDS summary">
 <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter>
 </data>
-<data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences">
+<data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: Hypothetical protein CDS amino sequences">
 <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter>
 </data>
-<data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary">
+<data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Summary (TXT)">
 <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter>
 </data>
-<data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable">
+<data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: Information on each annotated feature (JSON)">
 <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter>
 </data>
 <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
 <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter>
 </data>
-<data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file">
+<data name="logfile" format="txt" label="${tool.name} on ${on_string}: Log file">
 <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter>
 </data>
 </outputs>
 <tests>
 <test expect_num_outputs="13"> <!-- TEST_1 database + input -->
 <section name="input_option" >
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
 <param name="min_contig_length" value="250"/>
+<param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
 </section>
 <section name="output_files">
 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
 </section>
-<output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/>
+<output name="annotation_tsv" ftype="tabular">
-<output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/>
+<assert_contents>
-<output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/>
+<has_n_lines n="8"/>
-<output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/>
+<has_text text="IHHALP_00005"/>
-<output name="annotation_fna" value="TEST_1/TEST_1.fna"/>
+<has_text text="hypothetical protein"/>
-<output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/>
+</assert_contents>
-<output name="annotation_faa" value="TEST_1/TEST_1.faa"/>
+</output>
-<output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/>
+<output name="annotation_gff3" ftype="gff3">
-<output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/>
+<assert_contents>
-<output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/>
+<has_n_lines n="36"/>
-<output name="annotation_plot" value="TEST_1/TEST_1_plot.svg" ftype="svg" compare="sim_size"/>
+<has_text text="Prodigal"/>
-<output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/>
+<has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
+</assert_contents>
+</output>
+<output name="annotation_gbff" ftype="tabular">
+<assert_contents>
+<has_n_lines n="81"/>
+<has_text text="DEFINITION"/>
+<has_text text="/inference"/>
+</assert_contents>
+</output>
+<output name="annotation_embl" ftype="tabular">
+<assert_contents>
+<has_text text="##Genome Annotation Summary:##"/>
+<has_text text="/product="/>
+</assert_contents>
+</output>
+<output name="annotation_fna" ftype="fasta">
+<assert_contents>
+<has_text text=">contig_1"/>
+<has_text text="TCTTCTGCGAG"/>
+</assert_contents>
+</output>
+<output name="annotation_ffn" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
+<has_text text="ATGACAAAACGAAGTG"/>
+</assert_contents>
+</output>
+<output name="annotation_faa" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
+<has_text text="MTKRSGSNTR"/>
+</assert_contents>
+</output>
+<output name="hypotheticals_tsv" ftype="tabular">
+<assert_contents>
+<has_n_lines n="5"/>
+<has_text text="IHHALP_00010"/>
+<has_text text="Sequence Id"/>
+</assert_contents>
+</output>
+<output name="hypotheticals_faa" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00010 hypothetical protein"/>
+<has_text text="MNKQQQTALNM"/>
+</assert_contents>
+</output>
+<output name="summary_txt" ftype="txt">
+<assert_contents>
+<has_text text="coding density: 62.0"/>
+<has_text text="CDSs: 2"/>
+</assert_contents>
+</output>
+<output name="annotation_plot" ftype="svg">
+<assert_contents>
+<has_size value="418990" delta="1000"/>
+</assert_contents>
+</output>
+<output name="annotation_json" ftype="json">
+<assert_contents>
+<has_text text="coding_ratio"/>
+<has_text text="n50"/>
+<has_text text="hypothetical protein"/>
+</assert_contents>
+</output>
 <output name="logfile" ftype="txt">
 <expand macro="assert_content_test"/>
 </output>
 </test>
 <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps  -->
 <section name="input_option" >
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
 <param name="min_contig_length" value="250"/>
+<param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
 </section>
 <section name="organism">
 <param name="genus" value="Escherichia"/>
 <param name="species" value="coli O157:H7"/>
 <param name="strain" value="Sakai"/>
 <param name="keep_contig_headers" value="true"/>
 </section>
 <section name="workflow">
 <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
 </section>
-<output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4">
+<output name="annotation_tsv">
 <assert_contents>
+<has_n_lines n="8"/>
 <has_text_matching expression="IHHALP_00005"/>
 </assert_contents>
 </output>
-<output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4">
+<output name="annotation_gff3">
 <assert_contents>
+<has_n_lines n="37"/>
 <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
 </assert_contents>
 </output>
-<output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/>
+<output name="annotation_ffn" ftype="fasta">
-<output name="annotation_plot" value="TEST_2/TEST_2_plot.svg" ftype="svg" compare="sim_size"/>
+<assert_contents>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
+<has_text text="ATGACAAAACGAAGTG"/>
+</assert_contents>
+</output>
+<output name="annotation_plot" ftype="svg">
+<assert_contents>
+<has_size value="418990" delta="1000"/>
+</assert_contents>
+</output>
 </test>
-<test expect_num_outputs="4"> <!-- TEST_3 test all skip steps  -->
+<test expect_num_outputs="4"> <!-- TEST_3 test all skip steps and test previous bakta version -->
 <section name="input_option" >
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
 <param name="min_contig_length" value="350"/>
+<param name="bakta_db_select" value="V5.0_2022-08-19"/>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
 </section>
 <section name="workflow">
 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
 </section>
-<output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/>
+<output name="annotation_tsv">
-<output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/>
+<assert_contents>
-<output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/>
+<has_n_lines n="6"/>
-</test>
+<has_text_matching expression="DbXrefs"/>
-<test expect_num_outputs="4"> <!-- TEST_4 annotations   -->
+</assert_contents>
-<section name="input_option" >
+</output>
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
+<output name="annotation_gff3" ftype="gff3">
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<assert_contents>
-<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<has_n_lines n="34"/>
-</section>
+<has_text text=">contig_1"/>
-<section name="annotation">
+</assert_contents>
-<param name="complete" value="true"/>
+</output>
-<param name="prodigal" value="prodigal.tf"/>
+<output name="annotation_ffn" ftype="fasta">
-<param name="translation_table" value="4"/>
+<assert_contents>
-<param name="replicons" value="replicons.tsv" ftype="tabular"/>
+<has_size value="0"/>
-<param name="compliant" value="true"/>
+</assert_contents>
-<param name="proteins" value="user-proteins.faa" ftype="fasta"/>
+</output>
-</section>
+</test>
-<output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/>
+<test expect_num_outputs="4"> <!-- TEST_4 annotations   -->
-<output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/>
+<section name="input_option" >
-<output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
-<output name="annotation_plot" value="TEST_4/TEST_4_plot.svg" ftype="svg" compare="sim_size"/>
+<param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
-</test>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
-<test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary -->
+</section>
-<section name="input_option" >
+<section name="annotation">
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
+<param name="complete" value="true"/>
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="prodigal" value="prodigal.tf"/>
-<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<param name="translation_table" value="4"/>
-</section>
+<param name="replicons" value="replicons.tsv" ftype="tabular"/>
-<section name="annotation">
+<param name="compliant" value="true"/>
-<param name="complete" value="true"/>
+<param name="proteins" value="user-proteins.faa" ftype="fasta"/>
-<param name="translation_table" value="4"/>
+</section>
-</section>
+<output name="annotation_tsv">
-<section name="workflow">
+<assert_contents>
-<param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
+<has_n_lines n="8"/>
-</section>
+<has_text_matching expression="IHHALP_00005"/>
-<section name="output_files">
+</assert_contents>
-<param name="output_selection" value="log_txt,sum_txt"/>
+</output>
-</section>
+<output name="annotation_gff3" ftype="gff3">
-<output name="logfile" ftype="txt">
+<assert_contents>
-<expand macro="assert_content_test"/>
+<has_n_lines n="13"/>
-</output>
+<has_text text="Prodigal"/>
-<output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/>
+<has_text text="ID=IHHALP_00010_gene;locus_tag=IHHALP_00010"/>
-</test>
+</assert_contents>
-<test expect_num_outputs="13"> <!-- TEST_6 metagenome option -->
+</output>
-<section name="input_option" >
+<output name="annotation_ffn" ftype="fasta">
-<param name="bakta_db_select" value="V5.0_2022-08-19"/>
+<assert_contents>
-<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
-<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<has_text text="ATGACAAAACGAAGTG"/>
-</section>
+</assert_contents>
-<section name="annotation">
+</output>
-<param name="meta" value="true"/>
+<output name="annotation_plot" ftype="svg">
-</section>
+<assert_contents>
-<section name="output_files">
+<has_size value="418990" delta="1000"/>
-<param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
+</assert_contents>
-</section>
+</output>
-<output name="annotation_tsv" value="TEST_6/TEST_6.tsv" lines_diff="2"/>
+</test>
-<output name="annotation_gff3" value="TEST_6/TEST_6.gff3" lines_diff="2"/>
+<test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary -->
-<output name="annotation_gbff" value="TEST_6/TEST_6.gbff" lines_diff="8"/>
+<section name="input_option" >
-<output name="annotation_embl" value="TEST_6/TEST_6.embl" lines_diff="6"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
-<output name="annotation_fna" value="TEST_6/TEST_6.fna"/>
+<param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
-<output name="annotation_ffn" value="TEST_6/TEST_6.ffn"/>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
-<output name="annotation_faa" value="TEST_6/TEST_6.faa"/>
+</section>
-<output name="hypotheticals_tsv" value="TEST_6/TEST_6.hypotheticals.tsv" lines_diff="4"/>
+<section name="annotation">
-<output name="hypotheticals_faa" value="TEST_6/TEST_6.hypotheticals.faa"/>
+<param name="complete" value="true"/>
-<output name="summary_txt" value="TEST_6/TEST_6.txt" lines_diff="4"/>
+<param name="translation_table" value="4"/>
-<output name="annotation_plot" value="TEST_6/TEST_6_plot.svg" ftype="svg" compare="sim_size"/>
+</section>
-<output name="annotation_json" value="TEST_6/TEST_6.json" lines_diff="6"/>
+<section name="workflow">
-<output name="logfile" ftype="txt">
+<param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
-<expand macro="assert_content_test"/>
+</section>
-</output>
+<section name="output_files">
-</test>
+<param name="output_selection" value="log_txt,sum_txt"/>
+</section>
+<output name="logfile" ftype="txt">
+<expand macro="assert_content_test"/>
+</output>
+<output name="summary_txt" ftype="txt">
+<assert_contents>
+<has_n_lines n="30"/>
+<has_text text="N50: 1330"/>
+<has_text text="oriTs: 0"/>
+</assert_contents>
+</output>
+</test>
+<test expect_num_outputs="13"> <!-- TEST_6 metagenome option -->
+<section name="input_option" >
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+<param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
+</section>
+<section name="annotation">
+<param name="meta" value="true"/>
+</section>
+<section name="output_files">
+<param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
+</section>
+<output name="annotation_tsv" ftype="tabular">
+<assert_contents>
+<has_n_lines n="8"/>
+<has_text text="IHHALP_00005"/>
+<has_text text="hypothetical protein"/>
+</assert_contents>
+</output>
+<output name="annotation_gff3" ftype="gff3">
+<assert_contents>
+<has_n_lines n="36"/>
+<has_text text="Prodigal"/>
+<has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
+</assert_contents>
+</output>
+<output name="annotation_gbff" ftype="tabular">
+<assert_contents>
+<has_n_lines n="81"/>
+<has_text text="DEFINITION"/>
+<has_text text="/inference"/>
+</assert_contents>
+</output>
+<output name="annotation_embl" ftype="tabular">
+<assert_contents>
+<has_text text="##Genome Annotation Summary:##"/>
+<has_text text="/product="/>
+</assert_contents>
+</output>
+<output name="annotation_fna" ftype="fasta">
+<assert_contents>
+<has_text text=">contig_1"/>
+<has_text text="TCTTCTGCGAG"/>
+</assert_contents>
+</output>
+<output name="annotation_ffn" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
+<has_text text="ATGACAAAACGAAGTG"/>
+</assert_contents>
+</output>
+<output name="annotation_faa" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00005 hypothetical protein"/>
+<has_text text="MTKRSGSNTR"/>
+</assert_contents>
+</output>
+<output name="hypotheticals_tsv" ftype="tabular">
+<assert_contents>
+<has_n_lines n="5"/>
+<has_text text="IHHALP_00010"/>
+<has_text text="Sequence Id"/>
+</assert_contents>
+</output>
+<output name="hypotheticals_faa" ftype="fasta">
+<assert_contents>
+<has_text text=">IHHALP_00010 hypothetical protein"/>
+<has_text text="MNKQQQTALNM"/>
+</assert_contents>
+</output>
+<output name="summary_txt" ftype="txt">
+<assert_contents>
+<has_n_lines n="30"/>
+<has_text text="coding density: 62.0"/>
+<has_text text="CDSs: 2"/>
+</assert_contents>
+</output>
+<output name="annotation_plot" ftype="svg">
+<assert_contents>
+<has_size value="418990" delta="1000"/>
+</assert_contents>
+</output>
+<output name="annotation_json" ftype="json">
+<assert_contents>
+<has_text text="coding_ratio"/>
+<has_text text="n50"/>
+<has_text text="hypothetical protein"/>
+</assert_contents>
+</output>
+<output name="logfile" ftype="txt">
+<expand macro="assert_content_test"/>
+</output>
+</test>
 </tests>
-<help><![CDATA[**What it does**
+<help><![CDATA[
-Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
+**What it does**
-*Comprehensive & taxonomy-independent database*
+Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
-Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
+*Comprehensive & taxonomy-independent database*
-*Protein sequence identification*
+Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
-Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
-allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
+*Protein sequence identification*
-This is achieved via an alignment-free sequence identification (AFSI) approach
+Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
-using full-length MD5 protein sequence hash digests.
+allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
-*Small proteins/short open reading frames*
+This is achieved via an alignment-free sequence identification (AFSI) approach
-Bakta detects and annotates small proteins/short open reading frames (sORF).
+using full-length MD5 protein sequence hash digests.
+*Small proteins/short open reading frames*
-*Expert annotation systems*
+Bakta detects and annotates small proteins/short open reading frames (sORF).
-To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
-Bakta includes & merges different expert annotation systems.
+*Expert annotation systems*
-Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
+To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
-as well as an generalized protein sequence expert system with distinct
+Bakta includes & merges different expert annotation systems.
-coverage, identity and priority values for each sequence, currenlty comprising the VFDB as well as NCBI's BlastRules.
+Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
+as well as a generalized protein sequence expert system with distinct
-*Comprehensive workflow*
+coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules.
-Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
-and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
+*Comprehensive workflow*
+Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
-*GFF3 & INSDC conform annotations*
+and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
-Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
-(checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
+*GFF3 & INSDC conform annotations*
-respectively for representative genomes of all ESKAPE species).
+Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
+(checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
-*Bacteria & plasmids*
+respectively for representative genomes of all ESKAPE species).
-Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
+*Bacteria & plasmids*
-**Input options**
+Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
-1. Choose a genome or assembly in FASTA format to annotate with Bakta
-2. Choose a version of the Bakta database
+**Input options**
-**Organism options**
+1. Choose a genome or assembly in FASTA format to annotate with Bakta
-You can specify information about the analysed FASTA file as text input for:
+2. Choose a version of the Bakta database
-- genus
-- species
+**Organism options**
-- strain
+You can specify information about the analysed FASTA file as text input for:
-- plasmid
+- genus
+- species
-**Annotation options**
+- strain
-1. You can specify if all sequences (chromosome or plasmids) are complete or not
+- plasmid
-2. You can add your own Prodigal training file for CDS prediction
-3. The translation table can be modified; the default is table 11, used for bacteria
+**Annotation options**
-4. You can specify whether the bacterium is Gram -/+ or unknown (the default value is unknown)
+1. You can specify if all sequences (chromosome or plasmids) are complete or not
-5. You can keep the name of contig present in the input file
+2. You can add your own Prodigal training file for CDS prediction
-6. You can specify your own replicon table as a TSV/CSV file
+3. The translation table can be modified; the default is table 11, used for bacteria
-7. The compliance option produces annotation files that are ready for submission to public databases
+4. You can specify whether the bacterium is Gram -/+ or unknown (the default value is unknown)
-such as ENA, GenBank or EMBL
+5. You can keep the name of contig present in the input file
-8. You can specify a protein sequence file for annotation in GenBank or fasta formats
+6. You can specify your own replicon table as a TSV/CSV file
-Using the Fasta format, each reference sequence can be provided in a short or long format:
+7. The compliance option produces annotation files that are ready for submission to public databases
+such as ENA, GenBank or EMBL
-# short:
+8. You can specify a protein sequence file for annotation in GenBank or fasta formats
->id gene~~~product~~~dbxrefs
+Using the Fasta format, each reference sequence can be provided in a short or long format:
-MAQ...
+# short:
-# long:
+>id gene~~~product~~~dbxrefs
->id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
+MAQ...
-MAQ...
+# long:
-**Skip steps**
+>id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
-Some steps can be skipped:
+MAQ...
-- skip-trna           Skip tRNA detection & annotation
-- skip-tmrna          Skip tmRNA detection & annotation
+**Skip steps**
-- skip-rrna           Skip rRNA detection & annotation
+Some steps can be skipped:
-- skip-ncrna          Skip ncRNA detection & annotation
+- skip-trna           Skip tRNA detection & annotation
-- skip-ncrna-region   Skip ncRNA region detection & annotation
+- skip-tmrna          Skip tmRNA detection & annotation
-- skip-crispr         Skip CRISPR array detection & annotation
+- skip-rrna           Skip rRNA detection & annotation
-- skip-cds            Skip CDS detection & annotation
+- skip-ncrna          Skip ncRNA detection & annotation
-- skip-pseudo         Skip pseudogene detection & annotation
+- skip-ncrna-region   Skip ncRNA region detection & annotation
-- skip-sorf           Skip sORF detection & annotation
+- skip-crispr         Skip CRISPR array detection & annotation
-- skip-gap            Skip gap detection & annotation
+- skip-cds            Skip CDS detection & annotation
-- skip-ori            Skip oriC/oriT detection & annotation
+- skip-pseudo         Skip pseudogene detection & annotation
+- skip-sorf           Skip sORF detection & annotation
-**Output options**
+- skip-gap            Skip gap detection & annotation
-Bakta produces a number of output files; you can select which types of file you want:
+- skip-ori            Skip oriC/oriT detection & annotation
-- Summary of the annotation
-- Annotated files
+**Output options**
-- Sequence files for nucleotide and/or amino acid
+Bakta produces a number of output files; you can select which types of file you want:
+- Summary of the annotation
+- Annotated files
+- Sequence files for nucleotide and/or amino acid
 ]]></help>
 <expand macro="citations"/>
 </tool>

Mercurial > repos > iuc > bakta

comparison bakta.xml @ 7:ba6990f72184 draft