Mercurial > repos > iuc > bakta
diff bakta.xml @ 7:ba6990f72184 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit e277883fca66013904bae930f04e7f3be5fcb1a2
author | iuc |
---|---|
date | Wed, 05 Jun 2024 14:22:02 +0000 |
parents | 92eee5f31117 |
children |
line wrap: on
line diff
--- a/bakta.xml Sun Feb 11 00:56:12 2024 +0000 +++ b/bakta.xml Wed Jun 05 14:22:02 2024 +0000 @@ -1,111 +1,109 @@ <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description> - Genome annotation via alignment-free sequence identification + Rapid and standardized annotation of bacterial genomes, MAGs and plasmids </description> <macros> <import>macro.xml</import> </macros> - <expand macro='edam'/> - <expand macro='xrefs'/> + <expand macro="xrefs"/> <expand macro="requirements"/> <expand macro="version_command"/> - <command detect_errors="aggressive"><![CDATA[ - - mkdir -p ./database_path/amrfinderplus-db && - ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path && - ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest && - bakta --verbose +mkdir -p ./database_path/amrfinderplus-db && +ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path && +ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest && - #*====================================== - CPU option - ======================================*# - --threads \${GALAXY_SLOTS:-1} - #*====================================== - Bakta database - ======================================*# - --db './database_path' - --output 'bakta_output' - #if $input_option.min_contig_length - --min-contig-length $input_option.min_contig_length - #else if $annotation.compliant - --min-contig-length 200 - #else - --min-contig-length 1 - #end if - --prefix bakta_output - #*====================================== - Organism options - genus/species/strain/plasmid - ======================================*# - #if $organism.genus - --genus '$organism.genus' - #end if - #if $organism.species - --species '$organism.species' - #end if - #if $organism.strain - --strain '$organism.strain' - #end if - #if $organism.plasmid - --plasmid '$organism.plasmid' - #end if - 
#*====================================== - Annotation options - gram type, prodigal/protein file - ======================================*# - $annotation.complete - #if $annotation.prodigal - --prodigal-tf '$annotation.prodigal' - #end if - #if $annotation.translation_table - --translation-table '$annotation.translation_table' - #end if - --gram '?' - $annotation.keep_contig_headers - #if $annotation.replicons - --replicons '$annotation.replicons' - #end if - $annotation.compliant - #if $annotation.proteins - --proteins '$annotation.proteins' - #end if - #if $annotation.regions - --regions '$annotation.regions' - #end if - #*====================================== - Workflow OPTIONS - skip some step of the bakta analysis - ======================================*# +bakta + --verbose +#*====================================== + CPU option +======================================*# + --threads \${GALAXY_SLOTS:-1} +#*====================================== + Bakta database +======================================*# + --db './database_path' + --output 'bakta_output' +#if $input_option.min_contig_length + --min-contig-length $input_option.min_contig_length +#else if $annotation.compliant + --min-contig-length 200 +#else + --min-contig-length 1 +#end if + --prefix bakta_output +#*====================================== + Organism options + genus/species/strain/plasmid +======================================*# +#if $organism.genus + --genus '$organism.genus' +#end if +#if $organism.species + --species '$organism.species' +#end if +#if $organism.strain + --strain '$organism.strain' +#end if +#if $organism.plasmid + --plasmid '$organism.plasmid' +#end if +#*====================================== + Annotation options + gram type, prodigal/protein file +======================================*# + $annotation.complete +#if $annotation.prodigal + --prodigal-tf '$annotation.prodigal' +#end if +#if $annotation.translation_table + --translation-table '$annotation.translation_table' +#end 
if + --gram '?' +$annotation.keep_contig_headers +#if $annotation.replicons + --replicons '$annotation.replicons' +#end if +$annotation.compliant +#if $annotation.proteins + --proteins '$annotation.proteins' +#end if +#if $annotation.regions + --regions '$annotation.regions' +#end if +#*====================================== + Workflow OPTIONS + skip some step of the bakta analysis +======================================*# - #echo " ".join($workflow.skip_analysis) +#echo " ".join($workflow.skip_analysis) - #*====================================== - Genome file - ======================================*# - '$input_option.input_file' - #*====================================== - LOG file - ======================================*# - | tee '$logfile' +#*====================================== + Genome file +======================================*# +'$input_option.input_file' +#*====================================== + LOG file +======================================*# +| tee '$logfile' ]]></command> <inputs> <!-- DB and file INPUT --> <section name="input_option" title="Input/Output options" expanded="true"> - <param name="bakta_db_select" type="select" label="The bakta database"> + <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/> + <param argument="--min-contig-length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode)"/> + <param name="bakta_db_select" type="select" label="Bakta database"> <options from_data_table="bakta_database"> <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/> <validator message="No bakta database is available" type="no_options"/> </options> </param> - <param name="amrfinder_db_select" type="select" label="The amrfinderplus database"> - <options from_data_table="amrfinderplus_database"> - <validator message="No amrfinderplus database is available" type="no_options"/> + <param 
name="amrfinder_db_select" type="select" optional="true" label="AMRFinderPlus database" help="The selection of this database is not needed if Bakta database version is higher 5.0"> + <options from_data_table="amrfinderplus_versioned_database"> + <filter type="static_value" value="3.12" column="db_version"/> + <validator message="No AMRFinderPlus database is available" type="no_options"/> </options> </param> - - <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/> - <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/> </section> <!-- Organism INFORMATION OPTIONS --> <section name="organism" title="Optional organism options" expanded="false"> @@ -130,7 +128,7 @@ <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option> </param> - <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/> + <param argument="--keep-contig-headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header"/> <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/> <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDJB compliance"/> <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/> @@ -171,82 +169,145 @@ <option value="log_txt" selected="false">Log file as TXT</option> </param> </section> - </inputs> <outputs> - <data name="annotation_tsv" format="tabular" 
from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary"> + <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: Summary"> <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter> </data> - <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences"> + <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation and sequences (GFF3)"> <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter> </data> - <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff"> + <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: Annotations and sequences (GenBank format)"> <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter> </data> - <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl"> + <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: Annotations and sequences (EMBL format)"> <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter> </data> - <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences"> + <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Replicon/contig DNA sequences"> 
<filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter> </data> - <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences"> + <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Feature nucleotide sequences"> <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter> </data> - <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences"> + <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: CDS/sORF amino acid sequences"> <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter> </data> - <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary"> + <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: Hypothetical protein CDS summary"> <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter> </data> - <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences"> + <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: Hypothetical protein CDS amino sequences"> <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter> </data> - <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" 
label="${tool.name} on ${on_string}: Analysis_summary"> + <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Summary (TXT)"> <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter> </data> - <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable"> + <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: Information on each annotated feature (JSON)"> <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter> </data> <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation"> <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter> </data> - <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"> + <data name="logfile" format="txt" label="${tool.name} on ${on_string}: Log file"> <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter> </data> </outputs> <tests> <test expect_num_outputs="13"> <!-- TEST_1 database + input --> <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> <param name="min_contig_length" value="250"/> + <param name="bakta_db_select" value="V5.1_light_2024-01-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> </section> <section name="output_files"> <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> </section> - <output 
name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/> - <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/> - <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/> - <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/> - <output name="annotation_fna" value="TEST_1/TEST_1.fna"/> - <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/> - <output name="annotation_faa" value="TEST_1/TEST_1.faa"/> - <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/> - <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/> - <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/> - <output name="annotation_plot" value="TEST_1/TEST_1_plot.svg" ftype="svg" compare="sim_size"/> - <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/> + <output name="annotation_tsv" ftype="tabular"> + <assert_contents> + <has_n_lines n="8"/> + <has_text text="IHHALP_00005"/> + <has_text text="hypothetical protein"/> + </assert_contents> + </output> + <output name="annotation_gff3" ftype="gff3"> + <assert_contents> + <has_n_lines n="36"/> + <has_text text="Prodigal"/> + <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/> + </assert_contents> + </output> + <output name="annotation_gbff" ftype="tabular"> + <assert_contents> + <has_n_lines n="81"/> + <has_text text="DEFINITION"/> + <has_text text="/inference"/> + </assert_contents> + </output> + <output name="annotation_embl" ftype="tabular"> + <assert_contents> + <has_text text="##Genome Annotation Summary:##"/> + <has_text text="/product="/> + </assert_contents> + </output> + <output name="annotation_fna" ftype="fasta"> + <assert_contents> + <has_text text=">contig_1"/> + <has_text text="TCTTCTGCGAG"/> + </assert_contents> + </output> + <output name="annotation_ffn" ftype="fasta"> + <assert_contents> + <has_text 
text=">IHHALP_00005 hypothetical protein"/> + <has_text text="ATGACAAAACGAAGTG"/> + </assert_contents> + </output> + <output name="annotation_faa" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00005 hypothetical protein"/> + <has_text text="MTKRSGSNTR"/> + </assert_contents> + </output> + <output name="hypotheticals_tsv" ftype="tabular"> + <assert_contents> + <has_n_lines n="5"/> + <has_text text="IHHALP_00010"/> + <has_text text="Sequence Id"/> + </assert_contents> + </output> + <output name="hypotheticals_faa" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00010 hypothetical protein"/> + <has_text text="MNKQQQTALNM"/> + </assert_contents> + </output> + <output name="summary_txt" ftype="txt"> + <assert_contents> + <has_text text="coding density: 62.0"/> + <has_text text="CDSs: 2"/> + </assert_contents> + </output> + <output name="annotation_plot" ftype="svg"> + <assert_contents> + <has_size value="418990" delta="1000"/> + </assert_contents> + </output> + <output name="annotation_json" ftype="json"> + <assert_contents> + <has_text text="coding_ratio"/> + <has_text text="n50"/> + <has_text text="hypothetical protein"/> + </assert_contents> + </output> <output name="logfile" ftype="txt"> <expand macro="assert_content_test"/> </output> </test> <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps --> <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> <param name="min_contig_length" value="250"/> + <param name="bakta_db_select" value="V5.1_light_2024-01-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> </section> <section name="organism"> <param name="genus" value="Escherichia"/> @@ -260,185 +321,304 @@ <section name="workflow"> <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/> </section> 
- <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4"> + <output name="annotation_tsv"> <assert_contents> + <has_n_lines n="8"/> <has_text_matching expression="IHHALP_00005"/> </assert_contents> </output> - <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4"> + <output name="annotation_gff3"> <assert_contents> + <has_n_lines n="37"/> <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/> </assert_contents> </output> - <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/> - <output name="annotation_plot" value="TEST_2/TEST_2_plot.svg" ftype="svg" compare="sim_size"/> + <output name="annotation_ffn" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00005 hypothetical protein"/> + <has_text text="ATGACAAAACGAAGTG"/> + </assert_contents> + </output> + <output name="annotation_plot" ftype="svg"> + <assert_contents> + <has_size value="418990" delta="1000"/> + </assert_contents> + </output> </test> - <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps --> + <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps and test previous bakta version --> <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> <param name="min_contig_length" value="350"/> + <param name="bakta_db_select" value="V5.0_2022-08-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> + </section> + <section name="workflow"> + <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/> + </section> + <output name="annotation_tsv"> + <assert_contents> + <has_n_lines n="6"/> + <has_text_matching expression="DbXrefs"/> + </assert_contents> + </output> + <output name="annotation_gff3" ftype="gff3"> + <assert_contents> + 
<has_n_lines n="34"/> + <has_text text=">contig_1"/> + </assert_contents> + </output> + <output name="annotation_ffn" ftype="fasta"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="4"> <!-- TEST_4 annotations --> + <section name="input_option" > + <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> + <param name="bakta_db_select" value="V5.1_light_2024-01-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> + </section> + <section name="annotation"> + <param name="complete" value="true"/> + <param name="prodigal" value="prodigal.tf"/> + <param name="translation_table" value="4"/> + <param name="replicons" value="replicons.tsv" ftype="tabular"/> + <param name="compliant" value="true"/> + <param name="proteins" value="user-proteins.faa" ftype="fasta"/> + </section> + <output name="annotation_tsv"> + <assert_contents> + <has_n_lines n="8"/> + <has_text_matching expression="IHHALP_00005"/> + </assert_contents> + </output> + <output name="annotation_gff3" ftype="gff3"> + <assert_contents> + <has_n_lines n="13"/> + <has_text text="Prodigal"/> + <has_text text="ID=IHHALP_00010_gene;locus_tag=IHHALP_00010"/> + </assert_contents> + </output> + <output name="annotation_ffn" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00005 hypothetical protein"/> + <has_text text="ATGACAAAACGAAGTG"/> + </assert_contents> + </output> + <output name="annotation_plot" ftype="svg"> + <assert_contents> + <has_size value="418990" delta="1000"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary --> + <section name="input_option" > + <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> + <param name="bakta_db_select" value="V5.1_light_2024-01-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> + </section> + <section name="annotation"> + <param name="complete" 
value="true"/> + <param name="translation_table" value="4"/> </section> <section name="workflow"> <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/> </section> - <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/> - <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/> - <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/> - </test> - <test expect_num_outputs="4"> <!-- TEST_4 annotations --> - <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> - <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> - </section> - <section name="annotation"> - <param name="complete" value="true"/> - <param name="prodigal" value="prodigal.tf"/> - <param name="translation_table" value="4"/> - <param name="replicons" value="replicons.tsv" ftype="tabular"/> - <param name="compliant" value="true"/> - <param name="proteins" value="user-proteins.faa" ftype="fasta"/> - </section> - <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/> - <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/> - <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/> - <output name="annotation_plot" value="TEST_4/TEST_4_plot.svg" ftype="svg" compare="sim_size"/> - </test> - <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary --> - <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> - <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> - </section> - <section name="annotation"> - <param name="complete" value="true"/> - <param name="translation_table" value="4"/> - </section> - <section name="workflow"> - <param name="skip_analysis" 
value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/> - </section> - <section name="output_files"> - <param name="output_selection" value="log_txt,sum_txt"/> - </section> - <output name="logfile" ftype="txt"> - <expand macro="assert_content_test"/> - </output> - <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/> - </test> - <test expect_num_outputs="13"> <!-- TEST_6 metagenome option --> - <section name="input_option" > - <param name="bakta_db_select" value="V5.0_2022-08-19"/> - <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> - <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> - </section> - <section name="annotation"> - <param name="meta" value="true"/> - </section> - <section name="output_files"> - <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> - </section> - <output name="annotation_tsv" value="TEST_6/TEST_6.tsv" lines_diff="2"/> - <output name="annotation_gff3" value="TEST_6/TEST_6.gff3" lines_diff="2"/> - <output name="annotation_gbff" value="TEST_6/TEST_6.gbff" lines_diff="8"/> - <output name="annotation_embl" value="TEST_6/TEST_6.embl" lines_diff="6"/> - <output name="annotation_fna" value="TEST_6/TEST_6.fna"/> - <output name="annotation_ffn" value="TEST_6/TEST_6.ffn"/> - <output name="annotation_faa" value="TEST_6/TEST_6.faa"/> - <output name="hypotheticals_tsv" value="TEST_6/TEST_6.hypotheticals.tsv" lines_diff="4"/> - <output name="hypotheticals_faa" value="TEST_6/TEST_6.hypotheticals.faa"/> - <output name="summary_txt" value="TEST_6/TEST_6.txt" lines_diff="4"/> - <output name="annotation_plot" value="TEST_6/TEST_6_plot.svg" ftype="svg" compare="sim_size"/> - <output name="annotation_json" value="TEST_6/TEST_6.json" lines_diff="6"/> - <output name="logfile" ftype="txt"> - <expand 
macro="assert_content_test"/> - </output> - </test> + <section name="output_files"> + <param name="output_selection" value="log_txt,sum_txt"/> + </section> + <output name="logfile" ftype="txt"> + <expand macro="assert_content_test"/> + </output> + <output name="summary_txt" ftype="txt"> + <assert_contents> + <has_n_lines n="30"/> + <has_text text="N50: 1330"/> + <has_text text="oriTs: 0"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="13"> <!-- TEST_6 metagenome option --> + <section name="input_option" > + <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> + <param name="bakta_db_select" value="V5.1_light_2024-01-19"/> + <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/> + </section> + <section name="annotation"> + <param name="meta" value="true"/> + </section> + <section name="output_files"> + <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> + </section> + <output name="annotation_tsv" ftype="tabular"> + <assert_contents> + <has_n_lines n="8"/> + <has_text text="IHHALP_00005"/> + <has_text text="hypothetical protein"/> + </assert_contents> + </output> + <output name="annotation_gff3" ftype="gff3"> + <assert_contents> + <has_n_lines n="36"/> + <has_text text="Prodigal"/> + <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/> + </assert_contents> + </output> + <output name="annotation_gbff" ftype="tabular"> + <assert_contents> + <has_n_lines n="81"/> + <has_text text="DEFINITION"/> + <has_text text="/inference"/> + </assert_contents> + </output> + <output name="annotation_embl" ftype="tabular"> + <assert_contents> + <has_text text="##Genome Annotation Summary:##"/> + <has_text text="/product="/> + </assert_contents> + </output> + <output name="annotation_fna" ftype="fasta"> + <assert_contents> + <has_text text=">contig_1"/> + <has_text 
text="TCTTCTGCGAG"/> + </assert_contents> + </output> + <output name="annotation_ffn" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00005 hypothetical protein"/> + <has_text text="ATGACAAAACGAAGTG"/> + </assert_contents> + </output> + <output name="annotation_faa" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00005 hypothetical protein"/> + <has_text text="MTKRSGSNTR"/> + </assert_contents> + </output> + <output name="hypotheticals_tsv" ftype="tabular"> + <assert_contents> + <has_n_lines n="5"/> + <has_text text="IHHALP_00010"/> + <has_text text="Sequence Id"/> + </assert_contents> + </output> + <output name="hypotheticals_faa" ftype="fasta"> + <assert_contents> + <has_text text=">IHHALP_00010 hypothetical protein"/> + <has_text text="MNKQQQTALNM"/> + </assert_contents> + </output> + <output name="summary_txt" ftype="txt"> + <assert_contents> + <has_n_lines n="30"/> + <has_text text="coding density: 62.0"/> + <has_text text="CDSs: 2"/> + </assert_contents> + </output> + <output name="annotation_plot" ftype="svg"> + <assert_contents> + <has_size value="418990" delta="1000"/> + </assert_contents> + </output> + <output name="annotation_json" ftype="json"> + <assert_contents> + <has_text text="coding_ratio"/> + <has_text text="n50"/> + <has_text text="hypothetical protein"/> + </assert_contents> + </output> + <output name="logfile" ftype="txt"> + <expand macro="assert_content_test"/> + </output> + </test> </tests> - <help><![CDATA[**What it does** - Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs. + <help><![CDATA[ +**What it does** - *Comprehensive & taxonomy-independent database* - Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe. +Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs. 
+ +*Comprehensive & taxonomy-independent database* + Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe. - *Protein sequence identification* - Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt - allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families. - This is achieved via an alignment-free sequence identification (AFSI) approach - using full-length MD5 protein sequence hash digests. - *Small proteins/short open reading frames* - Bakta detects and annotates small proteins/short open reading frames (sORF). +*Protein sequence identification* + Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt + allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families. + This is achieved via an alignment-free sequence identification (AFSI) approach + using full-length MD5 protein sequence hash digests. +*Small proteins/short open reading frames* + Bakta detects and annotates small proteins/short open reading frames (sORF). - *Expert annotation systems* - To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes, - Bakta includes & merges different expert annotation systems. - Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations - as well as an generalized protein sequence expert system with distinct - coverage, identity and priority values for each sequence, currenlty comprising the VFDB as well as NCBI's BlastRules. +*Expert annotation systems* + To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes, + Bakta includes & merges different expert annotation systems. 
+ Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations + as well as a generalized protein sequence expert system with distinct + coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules. - *Comprehensive workflow* - Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT - and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS. +*Comprehensive workflow* + Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT + and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS. - *GFF3 & INSDC conform annotations* - Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission - (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats, - respectively for representative genomes of all ESKAPE species). +*GFF3 & INSDC conform annotations* + Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission + (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats, + respectively for representative genomes of all ESKAPE species). - *Bacteria & plasmids* - Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only. +*Bacteria & plasmids* + Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only. - **Input options** - 1. Choose a genome or assembly in fasta format to use bakta annotations - 2. Choose A version of the Bakta database +**Input options** + +1. Choose a genome or assembly in fasta format to use bakta annotations +2. 
Choose a version of the Bakta database - **Organism options** - You can specify information about the analysed fasta as text input for: - - genus - - species - - strain - - plasmid +**Organism options** +You can specify information about the analysed fasta as text input for: +- genus +- species +- strain +- plasmid - **Annotation options** - 1. You can specify if all sequences (chromosome or plasmids) are complete or not - 2. You can add your own prodigal training file for CDS prediction - 3. The translation table can be modified; the default is the 11th for bacteria - 4. You can specify if bacteria is gram -/+ or unknown (default value is unknown) - 5. You can keep the names of the contigs present in the input file - 6. You can specify your own replicon table as a TSV/CSV file - 7. The compliance option is for ready-to-submit annotation files for public databases - such as ENA, GenBank and EMBL - 8. You can specify a protein sequence file for annotation in GenBank or fasta formats - Using the Fasta format, each reference sequence can be provided in a short or long format: +**Annotation options** +1. You can specify if all sequences (chromosome or plasmids) are complete or not +2. You can add your own prodigal training file for CDS prediction +3. The translation table can be modified; the default is the 11th for bacteria +4. You can specify if bacteria is gram -/+ or unknown (default value is unknown) +5. You can keep the names of the contigs present in the input file +6. You can specify your own replicon table as a TSV/CSV file +7. The compliance option is for ready-to-submit annotation files for public databases +such as ENA, GenBank and EMBL +8. You can specify a protein sequence file for annotation in GenBank or fasta formats +Using the Fasta format, each reference sequence can be provided in a short or long format: - # short: - >id gene~~~product~~~dbxrefs - MAQ... +# short: +>id gene~~~product~~~dbxrefs +MAQ... 
- # long: - >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs - MAQ... +# long: +>id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs +MAQ... - **Skip steps** - Some steps can be skipped: - - skip-trna Skip tRNA detection & annotation - - skip-tmrna Skip tmRNA detection & annotation - - skip-rrna Skip rRNA detection & annotation - - skip-ncrna Skip ncRNA detection & annotation - - skip-ncrna-region Skip ncRNA region detection & annotation - - skip-crispr Skip CRISPR array detection & annotation - - skip-cds Skip CDS detection & annotation - - skip-pseudo Skip pseudogene detection & annotation - - skip-sorf Skip sORF detection & annotation - - skip-gap Skip gap detection & annotation - - skip-ori Skip oriC/oriT detection & annotation +**Skip steps** +Some steps can be skipped: +- skip-trna Skip tRNA detection & annotation +- skip-tmrna Skip tmRNA detection & annotation +- skip-rrna Skip rRNA detection & annotation +- skip-ncrna Skip ncRNA detection & annotation +- skip-ncrna-region Skip ncRNA region detection & annotation +- skip-crispr Skip CRISPR array detection & annotation +- skip-cds Skip CDS detection & annotation +- skip-pseudo Skip pseudogene detection & annotation +- skip-sorf Skip sORF detection & annotation +- skip-gap Skip gap detection & annotation +- skip-ori Skip oriC/oriT detection & annotation - **Output options** - Bakta produces a number of output files; you can select which types of file you want: - - Summary of the annotation - - Annotated files - - Sequence files for nucleotide and/or amino acid +**Output options** +Bakta produces a number of output files; you can select which types of file you want: +- Summary of the annotation +- Annotated files +- Sequence files for nucleotide and/or amino acid ]]></help> <expand macro="citations"/> </tool>