comparison itsx.xml @ 0:2cbd0d66aa08 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/itsx commit 229c8c8ce97a8e4d3d46406dba2942135fbdb910"
author bgruening
date Mon, 02 May 2022 13:03:49 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2cbd0d66aa08
1 <tool id="itsx" name="ITSx" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
2 <description>identifies ITS sequences and extracts the ITS region</description>
3 <macros> <!-- Shared definitions live in macros.xml; presumably it supplies the version tokens and the requirements/citations macros used in this file (confirm) -->
4 <import>macros.xml</import>
5 </macros>
6 <xrefs> <!-- Cross-reference to the bio.tools registry entry -->
7 <xref type="bio.tools">ITSx</xref>
8 </xrefs>
9 <expand macro="requirements" />
10 <version_command>ITSx --license</version_command> <!-- NOTE(review): assumes the license banner also prints the ITSx version; confirm -->
11 <command detect_errors="exit_code"><![CDATA[
12 ITSx
13 --cpu \${GALAXY_SLOTS:-4}
14 -i '$i'
15 --save_regions #echo ",".join($output_options.save_regions)
16 --only_full $output_options.only_output
17 --partial $output_options.partial
18 --minlen $output_options.minlen
19 --truncate $output_options.truncate
20 -t #echo ",".join($sequence_section.character_code)
21 -E $sequence_section.E
22 -S $sequence_section.S
23 -N $sequence_section.N
24 --selection_priority $sequence_section.selection_priority
25 --allow_single_domain $sequence_section.allow_single_domain
26 --allow_reorder $sequence_section.allow_reorder
27 --complement $sequence_section.complement
28 --heuristics $sequence_section.heuristics
29 --nhmmer $sequence_section.nhmmer
30 #if 'summary' in $output_options.output_files
31 --summary T
32 #else
33 --summary F
34 #end if
35 #if 'graphical' in $output_options.output_files
36 --graphical T
37 #else
38 --graphical F
39 #end if
40 #if 'fasta' in $output_options.output_files
41 --fasta T
42 #else
43 --fasta F
44 #end if
45 #if 'concat' in $output_options.output_files
46 --concat T
47 #else
48 --concat F
49 #end if
50 #if 'table' in $output_options.output_files
51 --table T
52 #else
53 --table F
54 #end if
55 #if 'not_found' in $output_options.output_files
56 --not_found T
57 #else
58 --not_found F
59 #end if
60 #if 'detailed_results' in $output_options.output_files
61 --detailed_results T
62 #else
63 --detailed_results F
64 #end if
65 #if 'positions' in $output_options.output_files
66 --positions T
67 #else
68 --positions F
69 #end if
70 --preserve $output_options.preserve
71 #if str($sequence_section.cutoff_conditional.cutoff_mode) == 'eval' and str($sequence_section.cutoff_conditional.search_eval) != ''# --search_eval $sequence_section.cutoff_conditional.search_eval #elif str($sequence_section.cutoff_conditional.cutoff_mode) == 'score' and str($sequence_section.cutoff_conditional.search_score) != ''# --search_score $sequence_section.cutoff_conditional.search_score #end if#
72 ]]></command> <!-- Builds the ITSx call: every output toggle is passed as an explicit T/F flag, the only_output parameter feeds the ITSx only_full flag, and the optional HMMER search cutoff of the selected mode is appended only when a value was entered -->
73 <inputs>
74 <param argument="-i" type="data" format="fasta" label="FASTA file"/> <!-- Input dataset; the command section references it as $i -->
75 <section name="sequence_section" title="Sequence selection options" expanded="true"> <!-- Profile set, domain thresholds and HMMER search behaviour -->
76 <param name="character_code" argument="-t" type="select"
77 multiple="true" label="Profile set to use for the search"
78 help="Can be used to restrict the search to only a few organism groups to save time, if
79 one or more of the origins are not relevant to the dataset under study. Default: all">
80 <option value="all" selected="true">All</option>
81 <option value="a">Alveolata</option>
82 <option value="b">Bryophyta</option>
83 <option value="c">Bacillariophyta</option>
84 <option value="d">Amoebozoa</option>
85 <option value="e">Euglenozoa</option>
86 <option value="f">Fungi</option>
87 <option value="g">Chlorophyta</option>
88 <option value="h">Rhodophyta</option>
89 <option value="i">Phaeophyceae</option>
90 <option value="l">Marchantiophyta</option>
91 <option value="m">Metazoa</option>
92 <option value="o">Oomycota</option>
93 <option value="p">Haptophyceae</option>
94 <option value="q">Raphidophyceae</option>
95 <option value="r">Rhizaria</option>
96 <option value="s">Synurophyceae</option>
97 <option value="t">Tracheophyta</option>
98 <option value="u">Eustigmatophyceae</option>
99 <option value="x">Apusozoa</option>
100 <option value="y">Parabasalia</option>
101 </param>
102 <param argument="-E" type="float" min="0" max="1" value="0.00005" label="Domain E-value cutoff"
103 help="Domain E-value cutoff for a sequence to be included in the output. Default: 5e-5 (ITSx's own command-line default is 1e-5)"/>
104 <param argument="-S" type="float" min="0" max="1" value="0" label="Domain score cutoff"
105 help="Domain score cutoff for a sequence to be included in the output. Default:0" />
106 <param argument="-N" type="integer" min="0" max="20" value="2" label="Minimal number of domains"
107 help="The minimal number of domains that must match a sequence before it is included. Default:2" />
108 <param argument="--selection_priority" type="select" label="Selection priority" help="Selects what will
109 be of highest priority when determining the origin of the sequence. Default:score">
110 <option value="score">Score</option>
111 <option value="sum">Sum</option>
112 <option value="domains">Domains</option>
113 <option value="eval">Eval</option>
114 </param>
115 <conditional name="cutoff_conditional"> <!-- Exactly one of the two optional HMMER search cutoffs is offered, depending on the selected mode -->
116 <param name="cutoff_mode" type="select" label="HMMER search cutoff mode" help="Default: Score cutoff">
117 <option value="eval">E-value cutoff</option>
118 <option value="score" selected="true">Score cutoff</option>
119 </param>
120 <when value="eval">
121 <param argument="--search_eval" type="float" min="0" max="1" value="" optional="true"
122 label="E-value cutoff used in the HMMER search"
123 help="High numbers may slow down the process" />
124 </when>
125 <when value="score">
126 <param argument="--search_score" type="float" min="0" max="1" value="" optional="true"
127 label="The score cutoff used in the HMMER search"
128 help="Low numbers may slow down the process" />
129 </when>
130 </conditional>
131 <param argument="--allow_single_domain" type="float" min="0" max="1" value="0" label="Allow single domain"
132 help="Allow inclusion of sequences that only find a single domain, given that they meet the given E-value and score thresholds. Default: 0" />
133 <param argument="--allow_reorder" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Allow reorder"
134 help="Allows profiles to be in the wrong order on extracted sequences" />
135 <param argument="--complement" type="boolean" truevalue="T" falsevalue="F" checked="true" label="Check both DNA strands"
136 help="Checks both DNA strands against the database, creating reverse complements" />
137 <param argument="--heuristics" type="boolean" truevalue="T" falsevalue="F" checked="false"
138 label="HMMER's heuristic filtering" help="Leave this setting off for higher precision" />
139 <param argument="--nhmmer" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Use nhmmer instead of hmmsearch" help="Default: No" />
140 </section>
141 <section name="output_options" title="Output options"> <!-- Selects which ITSx result files are produced and how the extracted sequences are written -->
142 <param name="output_files" type="select" multiple="true" display="checkboxes" label="Output files">
143 <option value="summary" selected="true">Summary</option>
144 <option value="graphical">Graphical output</option>
145 <option value="fasta" selected="true">FASTA-format of extracted ITS sequences</option>
146 <option value="concat">Concatenated ITS sequences</option>
147 <option value="positions" selected="true">Positions of found ITS sequences</option>
148 <option value="table" selected="true">Positions of probable ITS sequences</option>
149 <option value="not_found" selected="true">List of not-found entries</option>
150 <option value="detailed_results">Table of all results</option>
151 </param>
152 <param argument="--preserve" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Preserve sequence headers in input file" help="Preserve sequence headers in input file instead of printing out ITSx headers. Default: No" />
153 <param argument="--save_regions" type="select" label="Save regions" multiple="true" help="A comma separated list of regions to output separate FASTA files">
154 <option value="none">None</option>
155 <option value="SSU">SSU</option>
156 <option value="ITS1" selected="true">ITS1</option>
157 <option value="ITS2" selected="true">ITS2</option>
158 <option value="5.8S">5.8S</option>
159 <option value="LSU">LSU</option>
160 <option value="all">All</option>
161 </param>
162 <param name="only_output" argument="--only_full" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Only full-length regions" help="If enabled, output is limited to full-length regions. Default: no" /> <!-- The parameter keeps its historical name only_output; ITSx itself calls the flag only_full -->
163 <param argument="--partial" type="integer" min="0" value="0" label="Saves additional FASTA-files for full and partial ITS sequences longer than the specified cutoff" help="Default: 0 (disabled)" />
164 <param argument="--minlen" type="integer" min="0" value="0" label="Minimum length the ITS regions must be to be outputted in the concatenated file" help="Default: 0" />
165 <param argument="--truncate" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Truncates the FASTA output to only contain the actual ITS sequences found" help="Default: no (ITSx's own command-line default is yes)" />
166 </section>
167 </inputs>
168 <outputs> <!-- Each dataset is collected from the ITSx_out.* file named in from_work_dir; when a data element has several filters, all of them must hold -->
169 <data name="not_detection_fasta" format="fasta" from_work_dir="ITSx_out_no_detections.fasta" label="${tool.name} on ${on_string}: no detection (FASTA)">
170 <filter> output_options['output_files'] and 'not_found' in output_options['output_files']</filter>
171 </data>
172 <data name="not_detection_txt" format="txt" from_work_dir="ITSx_out_no_detections.txt" label="${tool.name} on ${on_string}: no detection (TXT)">
173 <filter> output_options['output_files'] and 'not_found' in output_options['output_files']</filter>
174 </data>
175 <data name="its1" format="fasta" from_work_dir="ITSx_out.ITS1.fasta" label="${tool.name} on ${on_string}: ITS1">
176 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
177 <filter>output_options['save_regions'] and ('ITS1' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
178 </data>
179 <data name="its2" format="fasta" from_work_dir="ITSx_out.ITS2.fasta" label="${tool.name} on ${on_string}: ITS2">
180 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
181 <filter>output_options['save_regions'] and ('ITS2' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
182 </data>
183 <data name="ribosomall58" format="fasta" from_work_dir="ITSx_out.5_8S.fasta" label="${tool.name} on ${on_string}: 5.8S">
184 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
185 <filter>output_options['save_regions'] and ('5.8S' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
186 </data>
187 <data name="LSU" format="fasta" from_work_dir="ITSx_out.LSU.fasta" label="${tool.name} on ${on_string}: LSU">
188 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
189 <filter>output_options['save_regions'] and ('LSU' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
190 </data>
191 <data name="SSU" format="fasta" from_work_dir="ITSx_out.SSU.fasta" label="${tool.name} on ${on_string}: SSU">
192 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
193 <filter>output_options['save_regions'] and ('SSU' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
194 </data>
195 <data name="full_sequences" format="fasta" from_work_dir="ITSx_out.full.fasta" label="${tool.name} on ${on_string}: full sequences">
196 <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
197 </data>
198 <data name="concat" format="fasta" from_work_dir="ITSx_out.concat.fasta" label="${tool.name} on ${on_string}: concatenated sequences">
199 <filter>output_options['output_files'] and 'concat' in output_options['output_files']</filter>
200 </data>
201 <data name="graph" format="txt" from_work_dir="ITSx_out.graph" label="${tool.name} on ${on_string}: graphical representation">
202 <filter>output_options['output_files'] and 'graphical' in output_options['output_files']</filter>
203 </data>
204 <data name="summary" format="txt" from_work_dir="ITSx_out.summary.txt" label="${tool.name} on ${on_string}: summary">
205 <filter>output_options['output_files'] and 'summary' in output_options['output_files']</filter>
206 </data>
207 <data name="detailed_results" format="tabular" from_work_dir="ITSx_out.extraction.results" label="${tool.name} on ${on_string}: detailed results">
208 <filter>output_options['output_files'] and 'detailed_results' in output_options['output_files']</filter>
209 </data>
210 <data name="positions_probable" format="tabular" from_work_dir="ITSx_out.hmmer.table" label="${tool.name} on ${on_string}: probable ITS positions">
211 <filter>output_options['output_files'] and 'table' in output_options['output_files']</filter>
212 </data>
213 <data name="positions_found" format="tabular" from_work_dir="ITSx_out.positions.txt" label="${tool.name} on ${on_string}: positions">
214 <filter>output_options['output_files'] and 'positions' in output_options['output_files']</filter>
215 </data>
216 <data name="problematic" format="tabular" from_work_dir="ITSx_out.problematic.txt" label="${tool.name} on ${on_string}: problematic reads"/> <!-- No filter: this dataset is always created -->
217 </outputs>
218 <tests>
219 <!-- Test 1: every option at the wrapper default except the profile set, which is restricted to Fungi (f) instead of "all". The 9 expected outputs are not_detection_fasta, not_detection_txt, its1, its2, full_sequences, summary, positions_probable, positions_found and problematic; its1 and full_sequences are created but not compared. -->
220 <test expect_num_outputs="9">
221 <param name="i" value="metagenomics.fasta"/>
222 <section name="sequence_section">
223 <param name="character_code" value="f"/>
224 <param name="E" value="0.00005"/>
225 <param name="S" value="0"/>
226 <param name="N" value="2"/>
227 <param name="selection_priority" value="score"/>
228 <param name="allow_single_domain" value="0"/>
229 <param name="allow_reorder" value="false"/>
230 <param name="complement" value="true"/>
231 <param name="heuristics" value="false"/>
232 <param name="nhmmer" value="false"/>
233 </section>
234 <section name="output_options">
235 <param name="output_files" value="summary,fasta,positions,table,not_found"/>
236 <param name="preserve" value="false"/>
237 <param name="save_regions" value="ITS1,ITS2"/>
238 <param name="only_output" value="false"/>
239 <param name="partial" value="0"/>
240 <param name="minlen" value="0"/>
241 <param name="truncate" value="false"/>
242 </section>
243 <output name="not_detection_fasta" file="test_01_not_detected.fasta" ftype="fasta"/>
244 <output name="not_detection_txt" file="test_01_not_detected.txt" ftype="txt"/>
245 <output name="its2" file="test_01_its2.fasta" ftype="fasta"/>
246 <output name="summary" file="test_01_summary.txt" ftype="txt" lines_diff="4"/>
247 <output name="positions_found" file="test_01_found.tabular" ftype="tabular"/>
248 <output name="positions_probable" ftype="tabular">
249 <assert_contents>
250 <has_text text="10088_SRR5572434|r"/>
251 <has_text text="19705_SRR5572434|fr"/>
252 </assert_contents>
253 </output>
254 <output name="problematic" file="test_01_problematic.tabular" ftype="tabular"/>
255 </test>
256 <!-- Test 2: same input, profile set and output selection as the first test, but with E=0.0005, S=1, allow_single_domain=0.1, allow_reorder and heuristics switched on, minlen=20 and truncate switched on. -->
257 <test expect_num_outputs="9">
258 <param name="i" value="metagenomics.fasta"/>
259 <section name="sequence_section">
260 <param name="character_code" value="f"/>
261 <param name="E" value="0.0005"/>
262 <param name="S" value="1"/>
263 <param name="N" value="2"/>
264 <param name="selection_priority" value="score"/>
265 <param name="allow_single_domain" value="0.1"/>
266 <param name="allow_reorder" value="true"/>
267 <param name="complement" value="true"/>
268 <param name="heuristics" value="true"/>
269 <param name="nhmmer" value="false"/>
270 </section>
271 <section name="output_options">
272 <param name="output_files" value="summary,fasta,positions,table,not_found"/>
273 <param name="preserve" value="false"/>
274 <param name="save_regions" value="ITS1,ITS2"/>
275 <param name="only_output" value="false"/>
276 <param name="partial" value="0"/>
277 <param name="minlen" value="20"/>
278 <param name="truncate" value="true"/>
279 </section>
280 <output name="not_detection_fasta" file="test_02_not_detected.fasta" ftype="fasta"/>
281 <output name="not_detection_txt" file="test_02_not_detected.txt" ftype="txt"/>
282 <output name="its2" file="test_02_its2.fasta" ftype="fasta"/>
283 <output name="summary" file="test_02_summary.txt" ftype="txt" lines_diff="4"/>
284 <output name="positions_found" file="test_02_found.tabular" ftype="tabular"/>
285 <output name="positions_probable" ftype="tabular">
286 <assert_contents>
287 <has_text text="10088_SRR5572434|r"/>
288 <has_text text="19705_SRR5572434|fr"/>
289 </assert_contents>
290 </output>
291 <output name="problematic" file="test_02_problematic.tabular" ftype="tabular"/>
292 </test>
293 </tests>
294 <help><![CDATA[
295
296 .. class:: infomark
297
298 **Purpose**
299
300 ITSx is an open source software utility to extract the highly variable ITS1 and ITS2 subregions from ITS sequences, which is commonly used as a molecular barcode for e.g. fungi. As the inclusion of parts of the neighbouring,
301 very conserved, ribosomal genes (SSU, 5.8S and LSU rRNA sequences) in the sequence identification process can lead to severely misleading results, ITSx identifies and extracts only the ITS regions themselves.
302
303 ITSx accepts input in the FASTA format. As it pre-processes the input sequences, it is possible to input both aligned and unaligned FASTA files, containing both DNA and RNA sequences.
304
305 ----
306
307 .. class:: infomark
308
309 **Algorithm and implementation**
310
311 The main design goal for ITSx is to achieve fast and accurate extraction of ITS regions in large data sets, without introducing a large number of false positives. To be able to reach a high speed, ITSx relies on
312 the HMMER3 software, which allows for extremely fast comparisons of HMM-profiles to a sequence set. To achieve high detection accuracy, ITSx uses multiple HMM-profiles built from the conserved domains flanking
313 the ITS regions (SSU, 5.8S and LSU), representing a large number of species groups. This enabled ITSx to extract ITS regions from all eukaryote lineages for which a substantial number of reference ITS sequences
314 were available as of 2012. The 2017 release of 1.1 saw the introduction of additional HMM profiles to cover even more lineages, particularly within the fungi.
315
316 While the default settings of ITSx should be usable in most situations, you should consider if they are suitable for your purposes and for your data set. If the data set is small, this can be done by running the
317 software multiple times on the data, with different settings, and analyse the outcome. On larger data sets, it might be more feasible to only run ITSx on a subset of the sequences for testing. The graphical output
318 is very useful for determining whether ITSx performs as desired on the data, as the positions of the found conserved domains can be easily investigated. If domains are missing, the criteria might be set to be too
319 stringent. If they are not in sequential order (from SSU to LSU with the 5.8S in between), that might be an indication that there is something wrong with the input sequences.
320
321 ----
322
323 .. class:: infomark
324
325 **Notes on PCR primers and short input sequences**
326
327 One obvious use of ITSx would be to run it on sequence datasets generated by PCR amplicon studies of the ITS region, to extract ITS1 and/or ITS2 regions, and to sort
328 out non-target sequences in the data prior to further analysis. ITSx uses the conserved SSU, 5.8S and LSU genes to locate and orient the ITS regions. To do this, the
329 software requires at least ~20 bp. (of at least one) of those genes to be present for each input sequence – preferably 25 bp. However, some primer pairs targeting the ITS
330 region will not include sufficiently large portions of these genes to be detected with the accuracy required by ITSx by default. As a result, fewer, or even none, of
331 the input sequences may be recognized as ITS-containing.
332
333
334 If the dataset is known to contain only ITS sequences, a remedy to this problem can be to lower the stringency of ITSx, using the -E option. By default, this is set to 1e-5,
335 but this can be increased to say 0.01, or even 1 to allow for detection of shorter portions of the conserved genes – down to some 15 bases. This feature comes at the
336 price of an increased proportion of false-positive matches, but in the case of ITS-only datasets this will be less of a concern. Note that this should normally be done only for
337 datasets that are known to contain only ITS sequences, and that caution needs to be taken in the downstream analysis so that false-positive extractions can be avoided (e.g.
338 investigate spuriously long or short ITS sequences with a sound degree of scepticism). Conversely, going for very stringent settings may come at the price of sensitivity as
339 sequences with deviant genes (or of reduced read quality) may be missed. The default settings of the software are calibrated with environmental datasets in mind, to keep
340 false-positive extractions at a minimum.
341
342 If the problem instead is that only one of those genes is present on the input sequence, another, even more polished, solution exists. In order to score a sequence as an ITS
343 sequence, ITSx prefers to see that the sequence produces matches to at least two HMMs (such as 3’ SSU and 5’ 5.8S). This helps ITSx to keep the number of false-positive
344 identifications low. However, the software also allows sequences that produce a match to only one HMM (such as 3’ SSU) to be scored as ITS sequences, provided
345 that the match is particularly stringent. The stringency of this parameter is controlled through the --allow_single_domain switch (see above). The default value is 1e-9. If
346 ITSx is used on very short sequences (e.g. 3’ SSU + 100 bp. ITS1), then only one HMM can be expected to match, and the E-value of that match will be compared to
347 the value of this parameter. Therefore, if the dataset at hand contains sequences which should only contain a single conserved region, using e.g. --allow_single_domain
348 "1e-5,0" can be enough to go from no matches to all matches. However, as with the -E option above, increasing the E-value of this parameter comes at the expense of
349 higher risk for false positives.
350
351 ]]></help>
352 <expand macro="citations" />
353 </tool>