Mercurial > repos > iuc > bakta

diff bakta.xml @ 7:ba6990f72184 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit e277883fca66013904bae930f04e7f3be5fcb1a2
author: iuc
date: Wed, 05 Jun 2024 14:22:02 +0000
parents: 92eee5f31117
--- a/bakta.xml	Sun Feb 11 00:56:12 2024 +0000
+++ b/bakta.xml	Wed Jun 05 14:22:02 2024 +0000
@@ -1,111 +1,109 @@
 <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>
-        Genome annotation via alignment-free sequence identification
+        Rapid and standardized annotation of bacterial genomes, MAGs and plasmids
     </description>
     <macros>
         <import>macro.xml</import>
     </macros>
-    <expand macro='edam'/>
-    <expand macro='xrefs'/>
+    <expand macro="xrefs"/>
     <expand macro="requirements"/>
     <expand macro="version_command"/>
-
     <command detect_errors="aggressive"><![CDATA[
-
-        mkdir -p ./database_path/amrfinderplus-db &&
-        ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
-        ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
-        bakta --verbose
+mkdir -p ./database_path/amrfinderplus-db &&
+ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
+ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
 
-        #*======================================
-                    CPU option
-        ======================================*#
-        --threads \${GALAXY_SLOTS:-1}
-        #*======================================
-                    Bakta database
-        ======================================*#
-        --db './database_path'
-        --output 'bakta_output'
-        #if $input_option.min_contig_length
-            --min-contig-length $input_option.min_contig_length
-        #else if $annotation.compliant
-            --min-contig-length 200
-        #else
-            --min-contig-length 1
-        #end if
-        --prefix bakta_output
-        #*======================================
-                  Organism options
-              genus/species/strain/plasmid
-        ======================================*#
-        #if $organism.genus
-            --genus '$organism.genus'
-        #end if
-        #if $organism.species
-            --species '$organism.species'
-        #end if
-        #if $organism.strain
-            --strain '$organism.strain'
-        #end if
-        #if $organism.plasmid
-            --plasmid '$organism.plasmid'
-        #end if
-        #*======================================
-                    Annotation options
-            gram type, prodigal/protein file
-        ======================================*#
-        $annotation.complete
-        #if $annotation.prodigal
-            --prodigal-tf '$annotation.prodigal'
-        #end if
-        #if $annotation.translation_table
-            --translation-table '$annotation.translation_table'
-        #end if
-            --gram '?'
-        $annotation.keep_contig_headers
-        #if $annotation.replicons
-            --replicons '$annotation.replicons'
-        #end if
-        $annotation.compliant
-        #if $annotation.proteins
-            --proteins '$annotation.proteins'
-        #end if
-        #if $annotation.regions
-            --regions '$annotation.regions'
-        #end if
-        #*======================================
-                    Workflow OPTIONS
-         skip some step of the bakta analysis
-        ======================================*#
+bakta 
+    --verbose
+#*======================================
+            CPU option
+======================================*#
+    --threads \${GALAXY_SLOTS:-1}
+#*======================================
+            Bakta database
+======================================*#
+    --db './database_path'
+    --output 'bakta_output'
+#if $input_option.min_contig_length
+    --min-contig-length $input_option.min_contig_length
+#else if $annotation.compliant
+    --min-contig-length 200
+#else
+    --min-contig-length 1
+#end if
+    --prefix bakta_output
+#*======================================
+            Organism options
+        genus/species/strain/plasmid
+======================================*#
+#if $organism.genus
+    --genus '$organism.genus'
+#end if
+#if $organism.species
+    --species '$organism.species'
+#end if
+#if $organism.strain
+    --strain '$organism.strain'
+#end if
+#if $organism.plasmid
+    --plasmid '$organism.plasmid'
+#end if
+#*======================================
+            Annotation options
+    gram type, prodigal/protein file
+======================================*#
+    $annotation.complete
+#if $annotation.prodigal
+    --prodigal-tf '$annotation.prodigal'
+#end if
+#if $annotation.translation_table
+    --translation-table '$annotation.translation_table'
+#end if
+    --gram '?'
+$annotation.keep_contig_headers
+#if $annotation.replicons
+    --replicons '$annotation.replicons'
+#end if
+$annotation.compliant
+#if $annotation.proteins
+    --proteins '$annotation.proteins'
+#end if
+#if $annotation.regions
+    --regions '$annotation.regions'
+#end if
+#*======================================
+            Workflow OPTIONS
+    skip some step of the bakta analysis
+======================================*#
 
-        #echo " ".join($workflow.skip_analysis)
+#echo " ".join($workflow.skip_analysis)
 
-        #*======================================
-                    Genome file
-        ======================================*#
-        '$input_option.input_file'
-        #*======================================
-                    LOG file
-        ======================================*#
-        | tee '$logfile'
+#*======================================
+            Genome file
+======================================*#
+'$input_option.input_file'
+#*======================================
+            LOG file
+======================================*#
+| tee '$logfile'
         ]]></command>
     <inputs>
       <!-- DB and file INPUT -->
         <section name="input_option" title="Input/Output options" expanded="true">
-            <param name="bakta_db_select" type="select" label="The bakta database">
+            <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
+            <param argument="--min-contig-length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode)"/>
+            <param name="bakta_db_select" type="select" label="Bakta database">
                 <options from_data_table="bakta_database">
                     <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/>
                     <validator message="No bakta database is available" type="no_options"/>
                 </options>
             </param>
-            <param name="amrfinder_db_select" type="select" label="The amrfinderplus database">
-                <options from_data_table="amrfinderplus_database">
-                  <validator message="No amrfinderplus database is available" type="no_options"/>
+            <param name="amrfinder_db_select" type="select" optional="true" label="AMRFinderPlus database" help="The selection of this database is not needed if Bakta database version is higher 5.0">
+                <options from_data_table="amrfinderplus_versioned_database">
+                    <filter type="static_value" value="3.12" column="db_version"/>
+                    <validator message="No AMRFinderPlus database is available" type="no_options"/>
                 </options>
             </param>
-
-            <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
-            <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/>
         </section>
         <!-- Organism INFORMATION OPTIONS -->
         <section name="organism" title="Optional organism options" expanded="false">
@@ -130,7 +128,7 @@
                 <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
                 <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
             </param>
-            <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/>
+            <param argument="--keep-contig-headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header"/>
             <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/>
             <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDJB compliance"/>
             <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
@@ -171,82 +169,145 @@
               <option value="log_txt" selected="false">Log file as TXT</option>
           </param>
         </section>
-
     </inputs>
     <outputs>
-        <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary">
+        <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: Summary">
             <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences">
+        <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation and sequences (GFF3)">
             <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff">
+        <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: Annotations and sequences (GenBank format)">
             <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl">
+        <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: Annotations and sequences (EMBL format)">
             <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences">
+        <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Replicon/contig DNA sequences">
             <filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences">
+        <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Feature nucleotide sequences">
             <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences">
+        <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: CDS/sORF amino acid sequences">
             <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter>
         </data>
-        <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary">
+        <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: Hypothetical protein CDS summary">
             <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter>
         </data>
-        <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences">
+        <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: Hypothetical protein CDS amino sequences">
           <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter>
         </data>
-        <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary">
+        <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Summary (TXT)">
             <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter>
         </data>
-        <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable">
+        <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: Information on each annotated feature (JSON)">
             <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter>
         </data>
         <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
             <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter>
         </data>
-        <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file">
+        <data name="logfile" format="txt" label="${tool.name} on ${on_string}: Log file">
             <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter>
         </data>
     </outputs>
     <tests>
         <test expect_num_outputs="13"> <!-- TEST_1 database + input -->
             <section name="input_option" >
-                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                 <param name="min_contig_length" value="250"/>
+                <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
             </section>
             <section name="output_files">
                 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
             </section>
-            <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/>
-            <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/>
-            <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/>
-            <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/>
-            <output name="annotation_fna" value="TEST_1/TEST_1.fna"/>
-            <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/>
-            <output name="annotation_faa" value="TEST_1/TEST_1.faa"/>
-            <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/>
-            <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/>
-            <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/>
-            <output name="annotation_plot" value="TEST_1/TEST_1_plot.svg" ftype="svg" compare="sim_size"/>
-            <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/>
+            <output name="annotation_tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                    <has_text text="IHHALP_00005"/>
+                    <has_text text="hypothetical protein"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gff3" ftype="gff3">
+                <assert_contents>
+                    <has_n_lines n="36"/>
+                    <has_text text="Prodigal"/>
+                    <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gbff" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="81"/>
+                    <has_text text="DEFINITION"/>
+                    <has_text text="/inference"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_embl" ftype="tabular">
+                <assert_contents>
+                    <has_text text="##Genome Annotation Summary:##"/>
+                    <has_text text="/product="/>
+                </assert_contents>
+            </output>
+            <output name="annotation_fna" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">contig_1"/>
+                    <has_text text="TCTTCTGCGAG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="ATGACAAAACGAAGTG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_faa" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="MTKRSGSNTR"/>
+                </assert_contents>
+            </output>
+            <output name="hypotheticals_tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                    <has_text text="IHHALP_00010"/>
+                    <has_text text="Sequence Id"/>
+                </assert_contents>
+            </output>
+            <output name="hypotheticals_faa" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00010 hypothetical protein"/>
+                    <has_text text="MNKQQQTALNM"/>
+                </assert_contents>
+            </output>
+            <output name="summary_txt" ftype="txt">
+                <assert_contents>
+                    <has_text text="coding density: 62.0"/>
+                    <has_text text="CDSs: 2"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_plot" ftype="svg">
+                <assert_contents>
+                    <has_size value="418990" delta="1000"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_json" ftype="json">
+                <assert_contents>
+                    <has_text text="coding_ratio"/>
+                    <has_text text="n50"/>
+                    <has_text text="hypothetical protein"/>
+                </assert_contents>
+            </output>
             <output name="logfile" ftype="txt">
                 <expand macro="assert_content_test"/>
             </output>
         </test>
         <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps  -->
             <section name="input_option" >
-                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                 <param name="min_contig_length" value="250"/>
+                <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
             </section>
             <section name="organism">
                 <param name="genus" value="Escherichia"/>
@@ -260,185 +321,304 @@
             <section name="workflow">
                 <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
             </section>
-            <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4">
+            <output name="annotation_tsv">
                 <assert_contents>
+                    <has_n_lines n="8"/>
                     <has_text_matching expression="IHHALP_00005"/>
                 </assert_contents>
             </output>
-            <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4">
+            <output name="annotation_gff3">
                 <assert_contents>
+                    <has_n_lines n="37"/>
                     <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
                 </assert_contents>
             </output>
-            <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/>
-            <output name="annotation_plot" value="TEST_2/TEST_2_plot.svg" ftype="svg" compare="sim_size"/>
+            <output name="annotation_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="ATGACAAAACGAAGTG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_plot" ftype="svg">
+                <assert_contents>
+                    <has_size value="418990" delta="1000"/>
+                </assert_contents>
+            </output>
         </test>
-        <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps  -->
+        <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps and test previous bakta version -->
             <section name="input_option" >
-                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
                 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
                 <param name="min_contig_length" value="350"/>
+                <param name="bakta_db_select" value="V5.0_2022-08-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
+            </section>
+            <section name="workflow">
+                <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
+            </section>
+            <output name="annotation_tsv">
+                <assert_contents>
+                    <has_n_lines n="6"/>
+                    <has_text_matching expression="DbXrefs"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gff3" ftype="gff3">
+                <assert_contents>
+                    <has_n_lines n="34"/>
+                    <has_text text=">contig_1"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_size value="0"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="4"> <!-- TEST_4 annotations   -->
+            <section name="input_option" >
+                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+                <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
+            </section>
+            <section name="annotation">
+                <param name="complete" value="true"/>
+                <param name="prodigal" value="prodigal.tf"/>
+                <param name="translation_table" value="4"/>
+                <param name="replicons" value="replicons.tsv" ftype="tabular"/>
+                <param name="compliant" value="true"/>
+                <param name="proteins" value="user-proteins.faa" ftype="fasta"/>
+            </section>
+            <output name="annotation_tsv">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                    <has_text_matching expression="IHHALP_00005"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gff3" ftype="gff3">
+                <assert_contents>
+                    <has_n_lines n="13"/>
+                    <has_text text="Prodigal"/>
+                    <has_text text="ID=IHHALP_00010_gene;locus_tag=IHHALP_00010"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="ATGACAAAACGAAGTG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_plot" ftype="svg">
+                <assert_contents>
+                    <has_size value="418990" delta="1000"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary -->
+            <section name="input_option" >
+                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+                <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
+            </section>
+            <section name="annotation">
+                <param name="complete" value="true"/>
+                <param name="translation_table" value="4"/>
             </section>
             <section name="workflow">
                 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
             </section>
-            <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/>
-            <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/>
-            <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/>
-            </test>
-            <test expect_num_outputs="4"> <!-- TEST_4 annotations   -->
-                <section name="input_option" >
-                    <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                    <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
-                    <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
-                </section>
-                <section name="annotation">
-                    <param name="complete" value="true"/>
-                    <param name="prodigal" value="prodigal.tf"/>
-                    <param name="translation_table" value="4"/>
-                    <param name="replicons" value="replicons.tsv" ftype="tabular"/>
-                    <param name="compliant" value="true"/>
-                    <param name="proteins" value="user-proteins.faa" ftype="fasta"/>
-                </section>
-                <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/>
-                <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/>
-                <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/>
-                <output name="annotation_plot" value="TEST_4/TEST_4_plot.svg" ftype="svg" compare="sim_size"/>
-            </test>
-            <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary -->
-                <section name="input_option" >
-                    <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                    <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
-                    <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
-                </section>
-                <section name="annotation">
-                    <param name="complete" value="true"/>
-                    <param name="translation_table" value="4"/>
-                </section>
-                <section name="workflow">
-                    <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
-                </section>
-                <section name="output_files">
-                    <param name="output_selection" value="log_txt,sum_txt"/>
-                </section>
-                <output name="logfile" ftype="txt">
-                    <expand macro="assert_content_test"/>
-                </output>
-                <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/>
-            </test>
-            <test expect_num_outputs="13"> <!-- TEST_6 metagenome option -->
-                <section name="input_option" >
-                    <param name="bakta_db_select" value="V5.0_2022-08-19"/>
-                    <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
-                    <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
-                </section>
-                <section name="annotation">
-                    <param name="meta" value="true"/>
-                </section>
-                <section name="output_files">
-                    <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
-                </section>
-                <output name="annotation_tsv" value="TEST_6/TEST_6.tsv" lines_diff="2"/>
-                <output name="annotation_gff3" value="TEST_6/TEST_6.gff3" lines_diff="2"/>
-                <output name="annotation_gbff" value="TEST_6/TEST_6.gbff" lines_diff="8"/>
-                <output name="annotation_embl" value="TEST_6/TEST_6.embl" lines_diff="6"/>
-                <output name="annotation_fna" value="TEST_6/TEST_6.fna"/>
-                <output name="annotation_ffn" value="TEST_6/TEST_6.ffn"/>
-                <output name="annotation_faa" value="TEST_6/TEST_6.faa"/>
-                <output name="hypotheticals_tsv" value="TEST_6/TEST_6.hypotheticals.tsv" lines_diff="4"/>
-                <output name="hypotheticals_faa" value="TEST_6/TEST_6.hypotheticals.faa"/>
-                <output name="summary_txt" value="TEST_6/TEST_6.txt" lines_diff="4"/>
-                <output name="annotation_plot" value="TEST_6/TEST_6_plot.svg" ftype="svg" compare="sim_size"/>
-                <output name="annotation_json" value="TEST_6/TEST_6.json" lines_diff="6"/>
-                <output name="logfile" ftype="txt">
-                    <expand macro="assert_content_test"/>
-                </output>
-            </test>
+            <section name="output_files">
+                <param name="output_selection" value="log_txt,sum_txt"/>
+            </section>
+            <output name="logfile" ftype="txt">
+                <expand macro="assert_content_test"/>
+            </output>
+            <output name="summary_txt" ftype="txt">
+                <assert_contents>
+                    <has_n_lines n="30"/>
+                    <has_text text="N50: 1330"/>
+                    <has_text text="oriTs: 0"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="13"> <!-- TEST_6 metagenome option -->
+            <section name="input_option" >
+                <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+                <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
+                <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
+            </section>
+            <section name="annotation">
+                <param name="meta" value="true"/>
+            </section>
+            <section name="output_files">
+                <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
+            </section>
+            <output name="annotation_tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                    <has_text text="IHHALP_00005"/>
+                    <has_text text="hypothetical protein"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gff3" ftype="gff3">
+                <assert_contents>
+                    <has_n_lines n="36"/>
+                    <has_text text="Prodigal"/>
+                    <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_gbff" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="81"/>
+                    <has_text text="DEFINITION"/>
+                    <has_text text="/inference"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_embl" ftype="tabular">
+                <assert_contents>
+                    <has_text text="##Genome Annotation Summary:##"/>
+                    <has_text text="/product="/>
+                </assert_contents>
+            </output>
+            <output name="annotation_fna" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">contig_1"/>
+                    <has_text text="TCTTCTGCGAG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="ATGACAAAACGAAGTG"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_faa" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00005 hypothetical protein"/>
+                    <has_text text="MTKRSGSNTR"/>
+                </assert_contents>
+            </output>
+            <output name="hypotheticals_tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                    <has_text text="IHHALP_00010"/>
+                    <has_text text="Sequence Id"/>
+                </assert_contents>
+            </output>
+            <output name="hypotheticals_faa" ftype="fasta">
+                <assert_contents>
+                    <has_text text=">IHHALP_00010 hypothetical protein"/>
+                    <has_text text="MNKQQQTALNM"/>
+                </assert_contents>
+            </output>
+            <output name="summary_txt" ftype="txt">
+                <assert_contents>
+                    <has_n_lines n="30"/>
+                    <has_text text="coding density: 62.0"/>
+                    <has_text text="CDSs: 2"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_plot" ftype="svg">
+                <assert_contents>
+                    <has_size value="418990" delta="1000"/>
+                </assert_contents>
+            </output>
+            <output name="annotation_json" ftype="json">
+                <assert_contents>
+                    <has_text text="coding_ratio"/>
+                    <has_text text="n50"/>
+                    <has_text text="hypothetical protein"/>
+                </assert_contents>
+            </output>
+            <output name="logfile" ftype="txt">
+                <expand macro="assert_content_test"/>
+            </output>
+        </test>
     </tests>
-    <help><![CDATA[**What it does**
-          Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
+    <help><![CDATA[
+**What it does**
 
-        *Comprehensive & taxonomy-independent database*
-          Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
+Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
+
+*Comprehensive & taxonomy-independent database*
+    Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
 
-        *Protein sequence identification*
-          Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
-          allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
-          This is achieved via an alignment-free sequence identification (AFSI) approach
-          using full-length MD5 protein sequence hash digests.
-        *Small proteins/short open reading frames*
-          Bakta detects and annotates small proteins/short open reading frames (sORF).
+*Protein sequence identification*
+    Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
+    allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
+    This is achieved via an alignment-free sequence identification (AFSI) approach
+    using full-length MD5 protein sequence hash digests.
+*Small proteins/short open reading frames*
+    Bakta detects and annotates small proteins/short open reading frames (sORF).
 
-        *Expert annotation systems*
-          To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
-          Bakta includes & merges different expert annotation systems.
-          Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
-          as well as an generalized protein sequence expert system with distinct
-          coverage, identity and priority values for each sequence, currenlty comprising the VFDB as well as NCBI's BlastRules.
+*Expert annotation systems*
+    To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
+    Bakta includes & merges different expert annotation systems.
+    Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
+    as well as an generalized protein sequence expert system with distinct
+    coverage, identity and priority values for each sequence, currenlty comprising the VFDB as well as NCBI's BlastRules.
 
-        *Comprehensive workflow*
-          Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
-          and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
+*Comprehensive workflow*
+    Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
+    and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
 
-        *GFF3 & INSDC conform annotations*
-          Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
-          (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
-          respectively for representative genomes of all ESKAPE species).
+*GFF3 & INSDC conform annotations*
+    Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
+    (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
+    respectively for representative genomes of all ESKAPE species).
 
-        *Bacteria & plasmids*
-          Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
+*Bacteria & plasmids*
+    Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
 
-        **Input options**
-          1. Choose a genome or assembly in fasta format to use bakta annotations
-          2. Choose A version of the Bakta database
+**Input options**
+
+1. Choose a genome or assembly in fasta format to use bakta annotations
+2. Choose A version of the Bakta database
 
-        **Organism options**
-        You can specify informations about analysed fasta as text input for:
-        - genus
-        - species
-        - strain
-        - plasmid
+**Organism options**
+You can specify informations about analysed fasta as text input for:
+- genus
+- species
+- strain
+- plasmid
 
-        **Annotation options**
-        1. You can specify if all sequences (chromosome or plasmids) are complete or not
-        2. You can add your own prodigal training file for CDS predictionœ
-        3. The translation table could be modified, default is the 11th for bacteria
-        4. You can specify if bacteria is gram -/+ or unknonw (default value is unknow)
-        5. You can keep the name of contig present in the input file
-        6. You can specify your own replicon table as a TSV/CSV file
-        7. The compliance option is for ready to submit annotation file to Public database
-        as ENA, Genbank EMBL
-        8. You can specify a protein sequence file for annotation in GenBank or fasta formats
-        Using the Fasta format, each reference sequence can be provided in a short or long format:
+**Annotation options**
+1. You can specify if all sequences (chromosome or plasmids) are complete or not
+2. You can add your own prodigal training file for CDS predictionœ
+3. The translation table could be modified, default is the 11th for bacteria
+4. You can specify if bacteria is gram -/+ or unknonw (default value is unknow)
+5. You can keep the name of contig present in the input file
+6. You can specify your own replicon table as a TSV/CSV file
+7. The compliance option is for ready to submit annotation file to Public database
+as ENA, Genbank EMBL
+8. You can specify a protein sequence file for annotation in GenBank or fasta formats
+Using the Fasta format, each reference sequence can be provided in a short or long format:
 
-        # short:
-        >id gene~~~product~~~dbxrefs
-        MAQ...
+# short:
+>id gene~~~product~~~dbxrefs
+MAQ...
 
-        # long:
-        >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
-        MAQ...
+# long:
+>id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
+MAQ...
 
-        **Skip steps**
-        Some steps could be skiped:
-        - skip-trna           Skip tRNA detection & annotation
-        - skip-tmrna          Skip tmRNA detection & annotation
-        - skip-rrna           Skip rRNA detection & annotation
-        - skip-ncrna          Skip ncRNA detection & annotation
-        - skip-ncrna-region   Skip ncRNA region detection & annotation
-        - skip-crispr         Skip CRISPR array detection & annotation
-        - skip-cds            Skip CDS detection & annotation
-        - skip-pseudo         Skip pseudogene detection & annotation
-        - skip-sorf           Skip sORF detection & annotation
-        - skip-gap            Skip gap detection & annotation
-        - skip-ori            Skip oriC/oriT detection & annotation
+**Skip steps**
+Some steps could be skiped:
+- skip-trna           Skip tRNA detection & annotation
+- skip-tmrna          Skip tmRNA detection & annotation
+- skip-rrna           Skip rRNA detection & annotation
+- skip-ncrna          Skip ncRNA detection & annotation
+- skip-ncrna-region   Skip ncRNA region detection & annotation
+- skip-crispr         Skip CRISPR array detection & annotation
+- skip-cds            Skip CDS detection & annotation
+- skip-pseudo         Skip pseudogene detection & annotation
+- skip-sorf           Skip sORF detection & annotation
+- skip-gap            Skip gap detection & annotation
+- skip-ori            Skip oriC/oriT detection & annotation
 
-        **Output options**
-        Bakta produce numbers of output files, you can select what type of file you want:
-        - Summary of the annotation
-        - Annotated files
-        - Sequence files for nucleotide and/or amino acid
+**Output options**
+Bakta produce numbers of output files, you can select what type of file you want:
+- Summary of the annotation
+- Annotated files
+- Sequence files for nucleotide and/or amino acid
     ]]></help>
     <expand macro="citations"/>
 </tool>
author	iuc
date	Wed, 05 Jun 2024 14:22:02 +0000
parents	92eee5f31117
children