comparison bakta.xml @ 7:ba6990f72184 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit e277883fca66013904bae930f04e7f3be5fcb1a2
author iuc
date Wed, 05 Jun 2024 14:22:02 +0000
parents 92eee5f31117
children
comparison
equal deleted inserted replaced
6:92eee5f31117 7:ba6990f72184
1 <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> 1 <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description> 2 <description>
3 Genome annotation via alignment-free sequence identification 3 Rapid and standardized annotation of bacterial genomes, MAGs and plasmids
4 </description> 4 </description>
5 <macros> 5 <macros>
6 <import>macro.xml</import> 6 <import>macro.xml</import>
7 </macros> 7 </macros>
8 <expand macro='edam'/> 8 <expand macro="xrefs"/>
9 <expand macro='xrefs'/>
10 <expand macro="requirements"/> 9 <expand macro="requirements"/>
11 <expand macro="version_command"/> 10 <expand macro="version_command"/>
12
13 <command detect_errors="aggressive"><![CDATA[ 11 <command detect_errors="aggressive"><![CDATA[
14 12 mkdir -p ./database_path/amrfinderplus-db &&
15 mkdir -p ./database_path/amrfinderplus-db && 13 ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
16 ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path && 14 ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
17 ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest && 15
18 bakta --verbose 16 bakta
19 17 --verbose
20 #*====================================== 18 #*======================================
21 CPU option 19 CPU option
22 ======================================*# 20 ======================================*#
23 --threads \${GALAXY_SLOTS:-1} 21 --threads \${GALAXY_SLOTS:-1}
24 #*====================================== 22 #*======================================
25 Bakta database 23 Bakta database
26 ======================================*# 24 ======================================*#
27 --db './database_path' 25 --db './database_path'
28 --output 'bakta_output' 26 --output 'bakta_output'
29 #if $input_option.min_contig_length 27 #if $input_option.min_contig_length
30 --min-contig-length $input_option.min_contig_length 28 --min-contig-length $input_option.min_contig_length
31 #else if $annotation.compliant 29 #else if $annotation.compliant
32 --min-contig-length 200 30 --min-contig-length 200
33 #else 31 #else
34 --min-contig-length 1 32 --min-contig-length 1
35 #end if 33 #end if
36 --prefix bakta_output 34 --prefix bakta_output
37 #*====================================== 35 #*======================================
38 Organism options 36 Organism options
39 genus/species/strain/plasmid 37 genus/species/strain/plasmid
40 ======================================*# 38 ======================================*#
41 #if $organism.genus 39 #if $organism.genus
42 --genus '$organism.genus' 40 --genus '$organism.genus'
43 #end if 41 #end if
44 #if $organism.species 42 #if $organism.species
45 --species '$organism.species' 43 --species '$organism.species'
46 #end if 44 #end if
47 #if $organism.strain 45 #if $organism.strain
48 --strain '$organism.strain' 46 --strain '$organism.strain'
49 #end if 47 #end if
50 #if $organism.plasmid 48 #if $organism.plasmid
51 --plasmid '$organism.plasmid' 49 --plasmid '$organism.plasmid'
52 #end if 50 #end if
53 #*====================================== 51 #*======================================
54 Annotation options 52 Annotation options
55 gram type, prodigal/protein file 53 gram type, prodigal/protein file
56 ======================================*# 54 ======================================*#
57 $annotation.complete 55 $annotation.complete
58 #if $annotation.prodigal 56 #if $annotation.prodigal
59 --prodigal-tf '$annotation.prodigal' 57 --prodigal-tf '$annotation.prodigal'
60 #end if 58 #end if
61 #if $annotation.translation_table 59 #if $annotation.translation_table
62 --translation-table '$annotation.translation_table' 60 --translation-table '$annotation.translation_table'
63 #end if 61 #end if
64 --gram '?' 62 --gram '?'
65 $annotation.keep_contig_headers 63 $annotation.keep_contig_headers
66 #if $annotation.replicons 64 #if $annotation.replicons
67 --replicons '$annotation.replicons' 65 --replicons '$annotation.replicons'
68 #end if 66 #end if
69 $annotation.compliant 67 $annotation.compliant
70 #if $annotation.proteins 68 #if $annotation.proteins
71 --proteins '$annotation.proteins' 69 --proteins '$annotation.proteins'
72 #end if 70 #end if
73 #if $annotation.regions 71 #if $annotation.regions
74 --regions '$annotation.regions' 72 --regions '$annotation.regions'
75 #end if 73 #end if
76 #*====================================== 74 #*======================================
77 Workflow OPTIONS 75 Workflow OPTIONS
78 skip some steps of the bakta analysis 76 skip some steps of the bakta analysis
79 ======================================*# 77 ======================================*#
80 78
81 #echo " ".join($workflow.skip_analysis) 79 #echo " ".join($workflow.skip_analysis)
82 80
83 #*====================================== 81 #*======================================
84 Genome file 82 Genome file
85 ======================================*# 83 ======================================*#
86 '$input_option.input_file' 84 '$input_option.input_file'
87 #*====================================== 85 #*======================================
88 LOG file 86 LOG file
89 ======================================*# 87 ======================================*#
90 | tee '$logfile' 88 | tee '$logfile'
91 ]]></command> 89 ]]></command>
92 <inputs> 90 <inputs>
93 <!-- DB and file INPUT --> 91 <!-- DB and file INPUT -->
94 <section name="input_option" title="Input/Output options" expanded="true"> 92 <section name="input_option" title="Input/Output options" expanded="true">
95 <param name="bakta_db_select" type="select" label="The bakta database"> 93 <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
94 <param argument="--min-contig-length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode)"/>
95 <param name="bakta_db_select" type="select" label="Bakta database">
96 <options from_data_table="bakta_database"> 96 <options from_data_table="bakta_database">
97 <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/> 97 <filter type="static_value" value="@COMPATIBLE_BAKTA_VERSION@" column="bakta_version"/>
98 <validator message="No bakta database is available" type="no_options"/> 98 <validator message="No bakta database is available" type="no_options"/>
99 </options> 99 </options>
100 </param> 100 </param>
101 <param name="amrfinder_db_select" type="select" label="The amrfinderplus database"> 101 <param name="amrfinder_db_select" type="select" optional="true" label="AMRFinderPlus database" help="The selection of this database is not needed if the Bakta database version is higher than 5.0">
102 <options from_data_table="amrfinderplus_database"> 102 <options from_data_table="amrfinderplus_versioned_database">
103 <validator message="No amrfinderplus database is available" type="no_options"/> 103 <filter type="static_value" value="3.12" column="db_version"/>
104 <validator message="No AMRFinderPlus database is available" type="no_options"/>
104 </options> 105 </options>
105 </param> 106 </param>
106
107 <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
108 <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/>
109 </section> 107 </section>
110 <!-- Organism INFORMATION OPTIONS --> 108 <!-- Organism INFORMATION OPTIONS -->
111 <section name="organism" title="Optional organism options" expanded="false"> 109 <section name="organism" title="Optional organism options" expanded="false">
112 <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia"> 110 <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia">
113 <validator type="regex">^[a-zA-Z]+$</validator> 111 <validator type="regex">^[a-zA-Z]+$</validator>
128 <param argument="--prodigal" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/> 126 <param argument="--prodigal" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/>
129 <param name="translation_table" type="select" optional="true" label="Translation table" help="Default is the bacterial table 11"> 127 <param name="translation_table" type="select" optional="true" label="Translation table" help="Default is the bacterial table 11">
130 <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> 128 <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
131 <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option> 129 <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
132 </param> 130 </param>
133 <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/> 131 <param argument="--keep-contig-headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header"/>
134 <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/> 132 <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/>
135 <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDBJ compliance"/> 133 <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDBJ compliance"/>
136 <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/> 134 <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
137 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" label="Metagenome mode" help="Run in metagenome mode. This only affects CDS prediction"/> 135 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" label="Metagenome mode" help="Run in metagenome mode. This only affects CDS prediction"/>
138 <param argument="--regions" type="data" format="gff,genbank" optional="true" label="Pre-annotated regions" help="Regions only, no functional annotations."/> 136 <param argument="--regions" type="data" format="gff,genbank" optional="true" label="Pre-annotated regions" help="Regions only, no functional annotations."/>
169 <option value="file_json" selected="false">Information on each annotated feature as JSON</option> 167 <option value="file_json" selected="false">Information on each annotated feature as JSON</option>
170 <option value="file_plot" selected="true">Plot of the annotation result as SVG</option> 168 <option value="file_plot" selected="true">Plot of the annotation result as SVG</option>
171 <option value="log_txt" selected="false">Log file as TXT</option> 169 <option value="log_txt" selected="false">Log file as TXT</option>
172 </param> 170 </param>
173 </section> 171 </section>
174
175 </inputs> 172 </inputs>
176 <outputs> 173 <outputs>
177 <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary"> 174 <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.tsv" label="${tool.name} on ${on_string}: Summary">
178 <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter> 175 <filter>output_files['output_selection'] and "file_tsv" in output_files['output_selection']</filter>
179 </data> 176 </data>
180 <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences"> 177 <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output/bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation and sequences (GFF3)">
181 <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter> 178 <filter>output_files['output_selection'] and "file_gff3" in output_files['output_selection']</filter>
182 </data> 179 </data>
183 <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff"> 180 <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output/bakta_output.gbff" label="${tool.name} on ${on_string}: Annotations and sequences (GenBank format)">
184 <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter> 181 <filter>output_files['output_selection'] and "file_gbff" in output_files['output_selection']</filter>
185 </data> 182 </data>
186 <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl"> 183 <data name="annotation_embl" format="tabular" from_work_dir="bakta_output/bakta_output.embl" label="${tool.name} on ${on_string}: Annotations and sequences (EMBL format)">
187 <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter> 184 <filter>output_files['output_selection'] and "file_embl" in output_files['output_selection']</filter>
188 </data> 185 </data>
189 <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences"> 186 <data name="annotation_fna" format="fasta" from_work_dir="bakta_output/bakta_output.fna" label="${tool.name} on ${on_string}: Replicon/contig DNA sequences">
190 <filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter> 187 <filter>output_files['output_selection'] and "file_fna" in output_files['output_selection']</filter>
191 </data> 188 </data>
192 <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences"> 189 <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output/bakta_output.ffn" label="${tool.name} on ${on_string}: Feature nucleotide sequences">
193 <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter> 190 <filter>output_files['output_selection'] and "file_ffn" in output_files['output_selection']</filter>
194 </data> 191 </data>
195 <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences"> 192 <data name="annotation_faa" format="fasta" from_work_dir="bakta_output/bakta_output.faa" label="${tool.name} on ${on_string}: CDS/sORF amino acid sequences">
196 <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter> 193 <filter>output_files['output_selection'] and "file_faa" in output_files['output_selection']</filter>
197 </data> 194 </data>
198 <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary"> 195 <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output/bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: Hypothetical protein CDS summary">
199 <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter> 196 <filter>output_files['output_selection'] and "hypo_tsv" in output_files['output_selection']</filter>
200 </data> 197 </data>
201 <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences"> 198 <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output/bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: Hypothetical protein CDS amino acid sequences">
202 <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter> 199 <filter>output_files['output_selection'] and "hypo_fa" in output_files['output_selection']</filter>
203 </data> 200 </data>
204 <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary"> 201 <data name="summary_txt" format="txt" from_work_dir="bakta_output/bakta_output.txt" label="${tool.name} on ${on_string}: Summary (TXT)">
205 <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter> 202 <filter>output_files['output_selection'] and "sum_txt" in output_files['output_selection']</filter>
206 </data> 203 </data>
207 <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable"> 204 <data name="annotation_json" format="json" from_work_dir="bakta_output/bakta_output.json" label="${tool.name} on ${on_string}: Information on each annotated feature (JSON)">
208 <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter> 205 <filter>output_files['output_selection'] and "file_json" in output_files['output_selection']</filter>
209 </data> 206 </data>
210 <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation"> 207 <data name="annotation_plot" format="svg" from_work_dir="bakta_output/bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
211 <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter> 208 <filter>output_files['output_selection'] and "file_plot" in output_files['output_selection']</filter>
212 </data> 209 </data>
213 <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"> 210 <data name="logfile" format="txt" label="${tool.name} on ${on_string}: Log file">
214 <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter> 211 <filter>output_files['output_selection'] and "log_txt" in output_files['output_selection']</filter>
215 </data> 212 </data>
216 </outputs> 213 </outputs>
217 <tests> 214 <tests>
218 <test expect_num_outputs="13"> <!-- TEST_1 database + input --> 215 <test expect_num_outputs="13"> <!-- TEST_1 database + input -->
219 <section name="input_option" > 216 <section name="input_option" >
220 <param name="bakta_db_select" value="V5.0_2022-08-19"/>
221 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
222 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 217 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
223 <param name="min_contig_length" value="250"/> 218 <param name="min_contig_length" value="250"/>
219 <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
220 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
224 </section> 221 </section>
225 <section name="output_files"> 222 <section name="output_files">
226 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> 223 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
227 </section> 224 </section>
228 <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/> 225 <output name="annotation_tsv" ftype="tabular">
229 <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/> 226 <assert_contents>
230 <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/> 227 <has_n_lines n="8"/>
231 <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/> 228 <has_text text="IHHALP_00005"/>
232 <output name="annotation_fna" value="TEST_1/TEST_1.fna"/> 229 <has_text text="hypothetical protein"/>
233 <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/> 230 </assert_contents>
234 <output name="annotation_faa" value="TEST_1/TEST_1.faa"/> 231 </output>
235 <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/> 232 <output name="annotation_gff3" ftype="gff3">
236 <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/> 233 <assert_contents>
237 <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/> 234 <has_n_lines n="36"/>
238 <output name="annotation_plot" value="TEST_1/TEST_1_plot.svg" ftype="svg" compare="sim_size"/> 235 <has_text text="Prodigal"/>
239 <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/> 236 <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
237 </assert_contents>
238 </output>
239 <output name="annotation_gbff" ftype="tabular">
240 <assert_contents>
241 <has_n_lines n="81"/>
242 <has_text text="DEFINITION"/>
243 <has_text text="/inference"/>
244 </assert_contents>
245 </output>
246 <output name="annotation_embl" ftype="tabular">
247 <assert_contents>
248 <has_text text="##Genome Annotation Summary:##"/>
249 <has_text text="/product="/>
250 </assert_contents>
251 </output>
252 <output name="annotation_fna" ftype="fasta">
253 <assert_contents>
254 <has_text text=">contig_1"/>
255 <has_text text="TCTTCTGCGAG"/>
256 </assert_contents>
257 </output>
258 <output name="annotation_ffn" ftype="fasta">
259 <assert_contents>
260 <has_text text=">IHHALP_00005 hypothetical protein"/>
261 <has_text text="ATGACAAAACGAAGTG"/>
262 </assert_contents>
263 </output>
264 <output name="annotation_faa" ftype="fasta">
265 <assert_contents>
266 <has_text text=">IHHALP_00005 hypothetical protein"/>
267 <has_text text="MTKRSGSNTR"/>
268 </assert_contents>
269 </output>
270 <output name="hypotheticals_tsv" ftype="tabular">
271 <assert_contents>
272 <has_n_lines n="5"/>
273 <has_text text="IHHALP_00010"/>
274 <has_text text="Sequence Id"/>
275 </assert_contents>
276 </output>
277 <output name="hypotheticals_faa" ftype="fasta">
278 <assert_contents>
279 <has_text text=">IHHALP_00010 hypothetical protein"/>
280 <has_text text="MNKQQQTALNM"/>
281 </assert_contents>
282 </output>
283 <output name="summary_txt" ftype="txt">
284 <assert_contents>
285 <has_text text="coding density: 62.0"/>
286 <has_text text="CDSs: 2"/>
287 </assert_contents>
288 </output>
289 <output name="annotation_plot" ftype="svg">
290 <assert_contents>
291 <has_size value="418990" delta="1000"/>
292 </assert_contents>
293 </output>
294 <output name="annotation_json" ftype="json">
295 <assert_contents>
296 <has_text text="coding_ratio"/>
297 <has_text text="n50"/>
298 <has_text text="hypothetical protein"/>
299 </assert_contents>
300 </output>
240 <output name="logfile" ftype="txt"> 301 <output name="logfile" ftype="txt">
241 <expand macro="assert_content_test"/> 302 <expand macro="assert_content_test"/>
242 </output> 303 </output>
243 </test> 304 </test>
244 <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps --> 305 <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps -->
245 <section name="input_option" > 306 <section name="input_option" >
246 <param name="bakta_db_select" value="V5.0_2022-08-19"/>
247 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
248 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 307 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
249 <param name="min_contig_length" value="250"/> 308 <param name="min_contig_length" value="250"/>
309 <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
310 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
250 </section> 311 </section>
251 <section name="organism"> 312 <section name="organism">
252 <param name="genus" value="Escherichia"/> 313 <param name="genus" value="Escherichia"/>
253 <param name="species" value="coli O157:H7"/> 314 <param name="species" value="coli O157:H7"/>
254 <param name="strain" value="Sakai"/> 315 <param name="strain" value="Sakai"/>
258 <param name="keep_contig_headers" value="true"/> 319 <param name="keep_contig_headers" value="true"/>
259 </section> 320 </section>
260 <section name="workflow"> 321 <section name="workflow">
261 <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/> 322 <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
262 </section> 323 </section>
263 <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4"> 324 <output name="annotation_tsv">
264 <assert_contents> 325 <assert_contents>
326 <has_n_lines n="8"/>
265 <has_text_matching expression="IHHALP_00005"/> 327 <has_text_matching expression="IHHALP_00005"/>
266 </assert_contents> 328 </assert_contents>
267 </output> 329 </output>
268 <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4"> 330 <output name="annotation_gff3">
269 <assert_contents> 331 <assert_contents>
332 <has_n_lines n="37"/>
270 <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/> 333 <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
271 </assert_contents> 334 </assert_contents>
272 </output> 335 </output>
273 <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/> 336 <output name="annotation_ffn" ftype="fasta">
274 <output name="annotation_plot" value="TEST_2/TEST_2_plot.svg" ftype="svg" compare="sim_size"/> 337 <assert_contents>
338 <has_text text=">IHHALP_00005 hypothetical protein"/>
339 <has_text text="ATGACAAAACGAAGTG"/>
340 </assert_contents>
341 </output>
342 <output name="annotation_plot" ftype="svg">
343 <assert_contents>
344 <has_size value="418990" delta="1000"/>
345 </assert_contents>
346 </output>
275 </test> 347 </test>
276 <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps --> 348 <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps and test previous bakta version -->
277 <section name="input_option" > 349 <section name="input_option" >
278 <param name="bakta_db_select" value="V5.0_2022-08-19"/>
279 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
280 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 350 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
281 <param name="min_contig_length" value="350"/> 351 <param name="min_contig_length" value="350"/>
352 <param name="bakta_db_select" value="V5.0_2022-08-19"/>
353 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
282 </section> 354 </section>
283 <section name="workflow"> 355 <section name="workflow">
284 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/> 356 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
285 </section> 357 </section>
286 <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/> 358 <output name="annotation_tsv">
287 <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/> 359 <assert_contents>
288 <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/> 360 <has_n_lines n="6"/>
289 </test> 361 <has_text_matching expression="DbXrefs"/>
290 <test expect_num_outputs="4"> <!-- TEST_4 annotations --> 362 </assert_contents>
291 <section name="input_option" > 363 </output>
292 <param name="bakta_db_select" value="V5.0_2022-08-19"/> 364 <output name="annotation_gff3" ftype="gff3">
293 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> 365 <assert_contents>
294 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 366 <has_n_lines n="34"/>
295 </section> 367 <has_text text=">contig_1"/>
296 <section name="annotation"> 368 </assert_contents>
297 <param name="complete" value="true"/> 369 </output>
298 <param name="prodigal" value="prodigal.tf"/> 370 <output name="annotation_ffn" ftype="fasta">
299 <param name="translation_table" value="4"/> 371 <assert_contents>
300 <param name="replicons" value="replicons.tsv" ftype="tabular"/> 372 <has_size value="0"/>
301 <param name="compliant" value="true"/> 373 </assert_contents>
302 <param name="proteins" value="user-proteins.faa" ftype="fasta"/> 374 </output>
303 </section> 375 </test>
304 <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/> 376 <test expect_num_outputs="4"> <!-- TEST_4 annotations -->
305 <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/> 377 <section name="input_option" >
306 <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/> 378 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
307 <output name="annotation_plot" value="TEST_4/TEST_4_plot.svg" ftype="svg" compare="sim_size"/> 379 <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
308 </test> 380 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
309 <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary --> 381 </section>
310 <section name="input_option" > 382 <section name="annotation">
311 <param name="bakta_db_select" value="V5.0_2022-08-19"/> 383 <param name="complete" value="true"/>
312 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> 384 <param name="prodigal" value="prodigal.tf"/>
313 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 385 <param name="translation_table" value="4"/>
314 </section> 386 <param name="replicons" value="replicons.tsv" ftype="tabular"/>
315 <section name="annotation"> 387 <param name="compliant" value="true"/>
316 <param name="complete" value="true"/> 388 <param name="proteins" value="user-proteins.faa" ftype="fasta"/>
317 <param name="translation_table" value="4"/> 389 </section>
318 </section> 390 <output name="annotation_tsv">
319 <section name="workflow"> 391 <assert_contents>
320 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/> 392 <has_n_lines n="8"/>
321 </section> 393 <has_text_matching expression="IHHALP_00005"/>
322 <section name="output_files"> 394 </assert_contents>
323 <param name="output_selection" value="log_txt,sum_txt"/> 395 </output>
324 </section> 396 <output name="annotation_gff3" ftype="gff3">
325 <output name="logfile" ftype="txt"> 397 <assert_contents>
326 <expand macro="assert_content_test"/> 398 <has_n_lines n="13"/>
327 </output> 399 <has_text text="Prodigal"/>
328 <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/> 400 <has_text text="ID=IHHALP_00010_gene;locus_tag=IHHALP_00010"/>
329 </test> 401 </assert_contents>
330 <test expect_num_outputs="13"> <!-- TEST_6 metagenome option --> 402 </output>
331 <section name="input_option" > 403 <output name="annotation_ffn" ftype="fasta">
332 <param name="bakta_db_select" value="V5.0_2022-08-19"/> 404 <assert_contents>
333 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> 405 <has_text text=">IHHALP_00005 hypothetical protein"/>
334 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/> 406 <has_text text="ATGACAAAACGAAGTG"/>
335 </section> 407 </assert_contents>
336 <section name="annotation"> 408 </output>
337 <param name="meta" value="true"/> 409 <output name="annotation_plot" ftype="svg">
338 </section> 410 <assert_contents>
339 <section name="output_files"> 411 <has_size value="418990" delta="1000"/>
340 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> 412 </assert_contents>
341 </section> 413 </output>
342 <output name="annotation_tsv" value="TEST_6/TEST_6.tsv" lines_diff="2"/> 414 </test>
343 <output name="annotation_gff3" value="TEST_6/TEST_6.gff3" lines_diff="2"/> 415 <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary -->
344 <output name="annotation_gbff" value="TEST_6/TEST_6.gbff" lines_diff="8"/> 416 <section name="input_option" >
345 <output name="annotation_embl" value="TEST_6/TEST_6.embl" lines_diff="6"/> 417 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
346 <output name="annotation_fna" value="TEST_6/TEST_6.fna"/> 418 <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
347 <output name="annotation_ffn" value="TEST_6/TEST_6.ffn"/> 419 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
348 <output name="annotation_faa" value="TEST_6/TEST_6.faa"/> 420 </section>
349 <output name="hypotheticals_tsv" value="TEST_6/TEST_6.hypotheticals.tsv" lines_diff="4"/> 421 <section name="annotation">
350 <output name="hypotheticals_faa" value="TEST_6/TEST_6.hypotheticals.faa"/> 422 <param name="complete" value="true"/>
351 <output name="summary_txt" value="TEST_6/TEST_6.txt" lines_diff="4"/> 423 <param name="translation_table" value="4"/>
352 <output name="annotation_plot" value="TEST_6/TEST_6_plot.svg" ftype="svg" compare="sim_size"/> 424 </section>
353 <output name="annotation_json" value="TEST_6/TEST_6.json" lines_diff="6"/> 425 <section name="workflow">
354 <output name="logfile" ftype="txt"> 426 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori,--skip-plot"/>
355 <expand macro="assert_content_test"/> 427 </section>
356 </output> 428 <section name="output_files">
357 </test> 429 <param name="output_selection" value="log_txt,sum_txt"/>
430 </section>
431 <output name="logfile" ftype="txt">
432 <expand macro="assert_content_test"/>
433 </output>
434 <output name="summary_txt" ftype="txt">
435 <assert_contents>
436 <has_n_lines n="30"/>
437 <has_text text="N50: 1330"/>
438 <has_text text="oriTs: 0"/>
439 </assert_contents>
440 </output>
441 </test>
442 <test expect_num_outputs="13"> <!-- TEST_6 metagenome option -->
443 <section name="input_option" >
444 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
445 <param name="bakta_db_select" value="V5.1_light_2024-01-19"/>
446 <param name="amrfinder_db_select" value="V3.12-2024-05-02.2"/>
447 </section>
448 <section name="annotation">
449 <param name="meta" value="true"/>
450 </section>
451 <section name="output_files">
452 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
453 </section>
454 <output name="annotation_tsv" ftype="tabular">
455 <assert_contents>
456 <has_n_lines n="8"/>
457 <has_text text="IHHALP_00005"/>
458 <has_text text="hypothetical protein"/>
459 </assert_contents>
460 </output>
461 <output name="annotation_gff3" ftype="gff3">
462 <assert_contents>
463 <has_n_lines n="36"/>
464 <has_text text="Prodigal"/>
465 <has_text text="ID=IHHALP_00005;Name=hypothetical protein;locus_tag=IHHALP_00005;product=hypothetical protein"/>
466 </assert_contents>
467 </output>
468 <output name="annotation_gbff" ftype="tabular">
469 <assert_contents>
470 <has_n_lines n="81"/>
471 <has_text text="DEFINITION"/>
472 <has_text text="/inference"/>
473 </assert_contents>
474 </output>
475 <output name="annotation_embl" ftype="tabular">
476 <assert_contents>
477 <has_text text="##Genome Annotation Summary:##"/>
478 <has_text text="/product="/>
479 </assert_contents>
480 </output>
481 <output name="annotation_fna" ftype="fasta">
482 <assert_contents>
483 <has_text text=">contig_1"/>
484 <has_text text="TCTTCTGCGAG"/>
485 </assert_contents>
486 </output>
487 <output name="annotation_ffn" ftype="fasta">
488 <assert_contents>
489 <has_text text=">IHHALP_00005 hypothetical protein"/>
490 <has_text text="ATGACAAAACGAAGTG"/>
491 </assert_contents>
492 </output>
493 <output name="annotation_faa" ftype="fasta">
494 <assert_contents>
495 <has_text text=">IHHALP_00005 hypothetical protein"/>
496 <has_text text="MTKRSGSNTR"/>
497 </assert_contents>
498 </output>
499 <output name="hypotheticals_tsv" ftype="tabular">
500 <assert_contents>
501 <has_n_lines n="5"/>
502 <has_text text="IHHALP_00010"/>
503 <has_text text="Sequence Id"/>
504 </assert_contents>
505 </output>
506 <output name="hypotheticals_faa" ftype="fasta">
507 <assert_contents>
508 <has_text text=">IHHALP_00010 hypothetical protein"/>
509 <has_text text="MNKQQQTALNM"/>
510 </assert_contents>
511 </output>
512 <output name="summary_txt" ftype="txt">
513 <assert_contents>
514 <has_n_lines n="30"/>
515 <has_text text="coding density: 62.0"/>
516 <has_text text="CDSs: 2"/>
517 </assert_contents>
518 </output>
519 <output name="annotation_plot" ftype="svg">
520 <assert_contents>
521 <has_size value="418990" delta="1000"/>
522 </assert_contents>
523 </output>
524 <output name="annotation_json" ftype="json">
525 <assert_contents>
526 <has_text text="coding_ratio"/>
527 <has_text text="n50"/>
528 <has_text text="hypothetical protein"/>
529 </assert_contents>
530 </output>
531 <output name="logfile" ftype="txt">
532 <expand macro="assert_content_test"/>
533 </output>
534 </test>
358 </tests> 535 </tests>
359 <help><![CDATA[**What it does** 536 <help><![CDATA[
360 Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs. 537 **What it does**
361 538
362 *Comprehensive & taxonomy-independent database* 539 Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
363 Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe. 540
364 541 *Comprehensive & taxonomy-independent database*
365 *Protein sequence identification* 542 Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
366 Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt 543
367 allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families. 544 *Protein sequence identification*
368 This is achieved via an alignment-free sequence identification (AFSI) approach 545 Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
369 using full-length MD5 protein sequence hash digests. 546 allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
370 *Small proteins/short open reading frames* 547 This is achieved via an alignment-free sequence identification (AFSI) approach
371 Bakta detects and annotates small proteins/short open reading frames (sORF). 548 using full-length MD5 protein sequence hash digests.
372 549 *Small proteins/short open reading frames*
373 *Expert annotation systems* 550 Bakta detects and annotates small proteins/short open reading frames (sORF).
374 To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes, 551
375 Bakta includes & merges different expert annotation systems. 552 *Expert annotation systems*
376 Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations 553 To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
377 as well as a generalized protein sequence expert system with distinct 554 Bakta includes & merges different expert annotation systems.
378 coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules. 555 Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
379 556 as well as a generalized protein sequence expert system with distinct
380 *Comprehensive workflow* 557 coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules.
381 Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT 558
382 and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS. 559 *Comprehensive workflow*
383 560 Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
384 *GFF3 & INSDC conform annotations* 561 and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
385 Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission 562
386 (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats, 563 *GFF3 & INSDC conform annotations*
387 respectively for representative genomes of all ESKAPE species). 564 Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
388 565 (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
389 *Bacteria & plasmids* 566 respectively for representative genomes of all ESKAPE species).
390 Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only. 567
391 568 *Bacteria & plasmids*
392 **Input options** 569 Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
393 1. Choose a genome or assembly in fasta format to use bakta annotations 570
394 2. Choose a version of the Bakta database 571 **Input options**
395 572
396 **Organism options** 573 1. Choose a genome or assembly in fasta format to use bakta annotations
397 You can specify information about the analysed fasta as text input for: 574 2. Choose a version of the Bakta database
398 - genus 575
399 - species 576 **Organism options**
400 - strain 577 You can specify information about the analysed fasta as text input for:
401 - plasmid 578 - genus
402 579 - species
403 **Annotation options** 580 - strain
404 1. You can specify if all sequences (chromosome or plasmids) are complete or not 581 - plasmid
405 2. You can add your own prodigal training file for CDS prediction 582
406 3. The translation table could be modified, default is the 11th for bacteria 583 **Annotation options**
407 4. You can specify if bacteria is gram -/+ or unknown (default value is unknown) 584 1. You can specify if all sequences (chromosome or plasmids) are complete or not
408 5. You can keep the name of contig present in the input file 585 2. You can add your own prodigal training file for CDS prediction
409 6. You can specify your own replicon table as a TSV/CSV file 586 3. The translation table could be modified, default is the 11th for bacteria
410 7. The compliance option is for ready to submit annotation file to Public database 587 4. You can specify if bacteria is gram -/+ or unknown (default value is unknown)
411 as ENA, Genbank EMBL 588 5. You can keep the name of contig present in the input file
412 8. You can specify a protein sequence file for annotation in GenBank or fasta formats 589 6. You can specify your own replicon table as a TSV/CSV file
413 Using the Fasta format, each reference sequence can be provided in a short or long format: 590 7. The compliance option is for ready to submit annotation file to Public database
414 591 as ENA, Genbank EMBL
415 # short: 592 8. You can specify a protein sequence file for annotation in GenBank or fasta formats
416 >id gene~~~product~~~dbxrefs 593 Using the Fasta format, each reference sequence can be provided in a short or long format:
417 MAQ... 594
418 595 # short:
419 # long: 596 >id gene~~~product~~~dbxrefs
420 >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs 597 MAQ...
421 MAQ... 598
422 599 # long:
423 **Skip steps** 600 >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
424 Some steps could be skipped: 601 MAQ...
425 - skip-trna Skip tRNA detection & annotation 602
426 - skip-tmrna Skip tmRNA detection & annotation 603 **Skip steps**
427 - skip-rrna Skip rRNA detection & annotation 604 Some steps could be skipped:
428 - skip-ncrna Skip ncRNA detection & annotation 605 - skip-trna Skip tRNA detection & annotation
429 - skip-ncrna-region Skip ncRNA region detection & annotation 606 - skip-tmrna Skip tmRNA detection & annotation
430 - skip-crispr Skip CRISPR array detection & annotation 607 - skip-rrna Skip rRNA detection & annotation
431 - skip-cds Skip CDS detection & annotation 608 - skip-ncrna Skip ncRNA detection & annotation
432 - skip-pseudo Skip pseudogene detection & annotation 609 - skip-ncrna-region Skip ncRNA region detection & annotation
433 - skip-sorf Skip sORF detection & annotation 610 - skip-crispr Skip CRISPR array detection & annotation
434 - skip-gap Skip gap detection & annotation 611 - skip-cds Skip CDS detection & annotation
435 - skip-ori Skip oriC/oriT detection & annotation 612 - skip-pseudo Skip pseudogene detection & annotation
436 613 - skip-sorf Skip sORF detection & annotation
437 **Output options** 614 - skip-gap Skip gap detection & annotation
438 Bakta produces a number of output files; you can select which types of file you want: 615 - skip-ori Skip oriC/oriT detection & annotation
439 - Summary of the annotation 616
440 - Annotated files 617 **Output options**
441 - Sequence files for nucleotide and/or amino acid 618 Bakta produces a number of output files; you can select which types of file you want:
619 - Summary of the annotation
620 - Annotated files
621 - Sequence files for nucleotide and/or amino acid
442 ]]></help> 622 ]]></help>
443 <expand macro="citations"/> 623 <expand macro="citations"/>
444 </tool> 624 </tool>