Mercurial > repos > bgruening > itsx
comparison itsx.xml @ 0:2cbd0d66aa08 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/itsx commit 229c8c8ce97a8e4d3d46406dba2942135fbdb910"
author | bgruening |
---|---|
date | Mon, 02 May 2022 13:03:49 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2cbd0d66aa08 |
---|---|
1 <tool id="itsx" name="ITSx" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> | |
2 <description>identifies ITS sequences and extracts the ITS region</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <xrefs> | |
7 <xref type="bio.tools">ITSx</xref> | |
8 </xrefs> | |
9 <expand macro="requirements" /> | |
10 <version_command>ITSx --license</version_command> | |
<!--
    Assembles the ITSx command line from the form state. Switches are passed
    explicitly as T/F so the run reflects the form rather than ITSx's built-in
    defaults. No output base name is passed, so results use the ITSx_out prefix
    expected by the from_work_dir attributes of the outputs.
-->
<command detect_errors="exit_code"><![CDATA[
    ITSx
        --cpu \${GALAXY_SLOTS:-4}
        -i '$i'

        ## Output options
        --save_regions #echo ",".join($output_options.save_regions)
        ## The form parameter is named only_output; the ITSx switch is only_full.
        --only_full $output_options.only_output
        --partial $output_options.partial
        --minlen $output_options.minlen
        --truncate $output_options.truncate

        ## Sequence selection options
        -t #echo ",".join($sequence_section.character_code)
        -E $sequence_section.E
        -S $sequence_section.S
        -N $sequence_section.N
        --selection_priority $sequence_section.selection_priority
        --allow_single_domain $sequence_section.allow_single_domain
        --allow_reorder $sequence_section.allow_reorder
        --complement $sequence_section.complement
        --heuristics $sequence_section.heuristics
        --nhmmer $sequence_section.nhmmer

        ## HMMER search cutoff: forwarded only when a value was entered, so an
        ## empty field leaves ITSx on its own default. The two switches are
        ## alternatives, hence only the one matching the selected mode is used.
        #if str($sequence_section.cutoff_conditional.cutoff_mode) == 'eval'
            #if str($sequence_section.cutoff_conditional.search_eval) != ''
                --search_eval $sequence_section.cutoff_conditional.search_eval
            #end if
        #elif str($sequence_section.cutoff_conditional.search_score) != ''
            --search_score $sequence_section.cutoff_conditional.search_score
        #end if

        ## Each optional report is switched on (T) or off (F) according to the
        ## "Output files" selection; the switch name equals the option value.
        #for $report in ['summary', 'graphical', 'fasta', 'concat', 'table', 'not_found', 'detailed_results', 'positions']
            #if $report in $output_options.output_files
                --${report} T
            #else
                --${report} F
            #end if
        #end for

        --preserve $output_options.preserve
]]></command>
<!--
    Tool form. Parameter names are derived from "argument" unless "name" is
    given explicitly; the command template and the tests refer to those names.
-->
<inputs>
    <param argument="-i" type="data" format="fasta" label="FASTA file"/>

    <!-- Which sequences ITSx should accept, and how the HMMER search is run. -->
    <section name="sequence_section" title="Sequence selection options" expanded="true">
        <param name="character_code" argument="-t" type="select" multiple="true"
               label="Profile set to use for the search"
               help="Can be used to restrict the search to only a few organism groups to save time, if one or more of the origins are not relevant to the dataset under study. Default: all">
            <option value="all" selected="true">All</option>
            <option value="a">Alveolata</option>
            <option value="b">Bryophyta</option>
            <option value="c">Bacillariophyta</option>
            <option value="d">Amoebozoa</option>
            <option value="e">Euglenozoa</option>
            <option value="f">Fungi</option>
            <option value="g">Chlorophyta</option>
            <option value="h">Rhodophyta</option>
            <option value="i">Phaeophyceae</option>
            <option value="l">Marchantiophyta</option>
            <option value="m">Metazoa</option>
            <option value="o">Oomycota</option>
            <option value="p">Haptophyceae</option>
            <option value="q">Raphidophyceae</option>
            <option value="r">Rhizaria</option>
            <option value="s">Synurophyceae</option>
            <option value="t">Tracheophyta</option>
            <option value="u">Eustigmatophyceae</option>
            <option value="x">Apusozoa</option>
            <option value="y">Parabasalia</option>
        </param>
        <!-- The help states the value this form actually starts with. -->
        <param argument="-E" type="float" min="0" max="1" value="0.00005" label="Domain E-value cutoff"
               help="Domain E-value cutoff for a sequence to be included in the output. Default: 0.00005 (ITSx itself defaults to 1e-5)"/>
        <!-- Score cutoffs are not bounded to [0, 1], so no upper limit is imposed. -->
        <param argument="-S" type="float" min="0" value="0" label="Domain score cutoff"
               help="Domain score cutoff for a sequence to be included in the output. Default: 0"/>
        <param argument="-N" type="integer" min="0" max="20" value="2" label="Minimal number of domains"
               help="The minimal number of domains that must match a sequence before it is included. Default: 2"/>
        <param argument="--selection_priority" type="select" label="Selection priority"
               help="Selects what will be of highest priority when determining the origin of the sequence. Default: score">
            <option value="score">Score</option>
            <option value="sum">Sum</option>
            <option value="domains">Domains</option>
            <option value="eval">Eval</option>
        </param>
        <!-- The two HMMER search cutoffs are alternatives; both are optional. -->
        <conditional name="cutoff_conditional">
            <param name="cutoff_mode" type="select" label="HMMER search cutoff mode" help="Default: Score cutoff">
                <option value="eval">E-value cutoff</option>
                <option value="score" selected="true">Score cutoff</option>
            </param>
            <when value="eval">
                <param argument="--search_eval" type="float" min="0" max="1" value="" optional="true"
                       label="E-value cutoff used in the HMMER search"
                       help="High numbers may slow down the process"/>
            </when>
            <when value="score">
                <param argument="--search_score" type="float" min="0" value="" optional="true"
                       label="The score cutoff used in the HMMER search"
                       help="Low numbers may slow down the process"/>
            </when>
        </conditional>
        <param argument="--allow_single_domain" type="float" min="0" max="1" value="0" label="Allow single domain"
               help="Allow inclusion of sequences that only find a single domain, given that they meet the given E-value and score thresholds. Default: 0"/>
        <param argument="--allow_reorder" type="boolean" truevalue="T" falsevalue="F" checked="false" label="Allow reorder"
               help="Allows profiles to be in the wrong order on extracted sequences"/>
        <param argument="--complement" type="boolean" truevalue="T" falsevalue="F" checked="true" label="Check both DNA strands"
               help="Checks both DNA strands against the database, creating reverse complements"/>
        <param argument="--heuristics" type="boolean" truevalue="T" falsevalue="F" checked="false"
               label="HMMER's heuristic filtering" help="Leave this setting off for higher precision"/>
        <param argument="--nhmmer" type="boolean" truevalue="T" falsevalue="F" checked="false"
               label="Use nhmmer instead of hmmsearch" help="Default: No"/>
    </section>

    <!-- Which result files to produce and how the extracted sequences are written. -->
    <section name="output_options" title="Output options">
        <!-- Option values double as ITSx switch names and are tested by the output filters. -->
        <param name="output_files" type="select" multiple="true" display="checkboxes" label="Output files">
            <option value="summary" selected="true">Summary</option>
            <option value="graphical">Graphical output</option>
            <option value="fasta" selected="true">FASTA-format of extracted ITS sequences</option>
            <option value="concat">Concatenated ITS sequences</option>
            <option value="positions" selected="true">Positions of found ITS sequences</option>
            <option value="table" selected="true">Positions of probable ITS sequences</option>
            <option value="not_found" selected="true">List of not-found entries</option>
            <option value="detailed_results">Table of all results</option>
        </param>
        <param argument="--preserve" type="boolean" truevalue="T" falsevalue="F" checked="false"
               label="Preserve sequence headers in input file"
               help="Preserve sequence headers in input file instead of printing out ITSx headers. Default: No"/>
        <param argument="--save_regions" type="select" label="Save regions" multiple="true"
               help="A comma separated list of regions to output separate FASTA files">
            <option value="none">None</option>
            <option value="SSU">SSU</option>
            <option value="ITS1" selected="true">ITS1</option>
            <option value="ITS2" selected="true">ITS2</option>
            <option value="5.8S">5.8S</option>
            <option value="LSU">LSU</option>
            <option value="all">All</option>
        </param>
        <!-- The name is kept for existing callers; the ITSx switch itself is only_full. -->
        <param name="only_output" argument="--only_full" type="boolean" truevalue="T" falsevalue="F" checked="false"
               label="Only full-length regions"
               help="If enabled, output is limited to full-length regions. Default: No"/>
        <param argument="--partial" type="integer" min="0" value="0"
               label="Saves additional FASTA-files for full and partial ITS sequences longer than the specified cutoff"
               help="Default: 0 (disabled)"/>
        <param argument="--minlen" type="integer" min="0" value="0"
               label="Minimum length the ITS regions must be to be outputted in the concatenated file"
               help="Default: 0"/>
        <!-- The help states the value this form actually starts with (unchecked). -->
        <param argument="--truncate" type="boolean" truevalue="T" falsevalue="F" checked="false"
               label="Truncates the FASTA output to only contain the actual ITS sequences found"
               help="Default: No"/>
    </section>
</inputs>
<!--
    Result datasets. Each one is picked up from the job working directory under
    the ITSx_out prefix and is created only when its filter(s) hold. Filters test
    the option values of the "output_files" and "save_regions" selects.
-->
<outputs>
    <!-- Sequences in which no ITS region was detected. -->
    <data name="not_detection_fasta" format="fasta" from_work_dir="ITSx_out_no_detections.fasta" label="${tool.name} on ${on_string}: no detection (FASTA)">
        <filter>output_options['output_files'] and 'not_found' in output_options['output_files']</filter>
    </data>
    <data name="not_detection_txt" format="txt" from_work_dir="ITSx_out_no_detections.txt" label="${tool.name} on ${on_string}: no detection (TXT)">
        <filter>output_options['output_files'] and 'not_found' in output_options['output_files']</filter>
    </data>

    <!--
        Per-region FASTA files: require FASTA output plus the region being
        requested, either by name or through the "all" choice.
    -->
    <data name="its1" format="fasta" from_work_dir="ITSx_out.ITS1.fasta" label="${tool.name} on ${on_string}: ITS1">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
        <filter>output_options['save_regions'] and ('ITS1' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
    </data>
    <data name="its2" format="fasta" from_work_dir="ITSx_out.ITS2.fasta" label="${tool.name} on ${on_string}: ITS2">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
        <filter>output_options['save_regions'] and ('ITS2' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
    </data>
    <data name="ribosomall58" format="fasta" from_work_dir="ITSx_out.5_8S.fasta" label="${tool.name} on ${on_string}: 5.8S">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
        <filter>output_options['save_regions'] and ('5.8S' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
    </data>
    <data name="LSU" format="fasta" from_work_dir="ITSx_out.LSU.fasta" label="${tool.name} on ${on_string}: LSU">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
        <filter>output_options['save_regions'] and ('LSU' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
    </data>
    <data name="SSU" format="fasta" from_work_dir="ITSx_out.SSU.fasta" label="${tool.name} on ${on_string}: SSU">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
        <filter>output_options['save_regions'] and ('SSU' in output_options['save_regions'] or 'all' in output_options['save_regions'])</filter>
    </data>

    <!-- Whole-sequence FASTA outputs. -->
    <data name="full_sequences" format="fasta" from_work_dir="ITSx_out.full.fasta" label="${tool.name} on ${on_string}: full sequences">
        <filter>output_options['output_files'] and 'fasta' in output_options['output_files']</filter>
    </data>
    <data name="concat" format="fasta" from_work_dir="ITSx_out.concat.fasta" label="${tool.name} on ${on_string}: concatenated sequences">
        <filter>output_options['output_files'] and 'concat' in output_options['output_files']</filter>
    </data>

    <!-- Reports. The graphical report is selected by the option value "graphical". -->
    <data name="graph" format="txt" from_work_dir="ITSx_out.graph" label="${tool.name} on ${on_string}: graphical representation">
        <filter>output_options['output_files'] and 'graphical' in output_options['output_files']</filter>
    </data>
    <data name="summary" format="txt" from_work_dir="ITSx_out.summary.txt" label="${tool.name} on ${on_string}: summary">
        <filter>output_options['output_files'] and 'summary' in output_options['output_files']</filter>
    </data>
    <data name="detailed_results" format="tabular" from_work_dir="ITSx_out.extraction.results" label="${tool.name} on ${on_string}: detailed results">
        <filter>output_options['output_files'] and 'detailed_results' in output_options['output_files']</filter>
    </data>
    <data name="positions_probable" format="tabular" from_work_dir="ITSx_out.hmmer.table" label="${tool.name} on ${on_string}: probable ITS positions">
        <filter>output_options['output_files'] and 'table' in output_options['output_files']</filter>
    </data>
    <data name="positions_found" format="tabular" from_work_dir="ITSx_out.positions.txt" label="${tool.name} on ${on_string}: positions">
        <filter>output_options['output_files'] and 'positions' in output_options['output_files']</filter>
    </data>

    <!-- Unfiltered: always declared as an output. -->
    <data name="problematic" format="tabular" from_work_dir="ITSx_out.problematic.txt" label="${tool.name} on ${on_string}: problematic reads"/>
</outputs>
<!--
    Functional tests. Both run on metagenomics.fasta with the fungal profile set
    and request the same five report types plus the ITS1 and ITS2 regions, which
    yields nine datasets per run (only a subset of them is compared).
-->
<tests>
    <!-- Test 1: every parameter spelled out at the value the form starts with, except the profile set. -->
    <test expect_num_outputs="9">
        <param name="i" value="metagenomics.fasta"/>
        <section name="sequence_section">
            <param name="character_code" value="f"/>
            <param name="E" value="0.00005"/>
            <param name="S" value="0"/>
            <param name="N" value="2"/>
            <param name="selection_priority" value="score"/>
            <param name="allow_single_domain" value="0"/>
            <param name="allow_reorder" value="false"/>
            <param name="complement" value="true"/>
            <param name="heuristics" value="false"/>
            <param name="nhmmer" value="false"/>
        </section>
        <section name="output_options">
            <param name="output_files" value="summary,fasta,positions,table,not_found"/>
            <param name="preserve" value="false"/>
            <param name="save_regions" value="ITS1,ITS2"/>
            <param name="only_output" value="false"/>
            <param name="partial" value="0"/>
            <param name="minlen" value="0"/>
            <param name="truncate" value="false"/>
        </section>
        <output name="not_detection_fasta" ftype="fasta" file="test_01_not_detected.fasta"/>
        <output name="not_detection_txt" ftype="txt" file="test_01_not_detected.txt"/>
        <output name="its2" ftype="fasta" file="test_01_its2.fasta"/>
        <!-- Up to four differing lines are tolerated in the summary. -->
        <output name="summary" ftype="txt" file="test_01_summary.txt" lines_diff="4"/>
        <output name="positions_found" ftype="tabular" file="test_01_found.tabular"/>
        <!-- Checked by content rather than against a reference file. -->
        <output name="positions_probable" ftype="tabular">
            <assert_contents>
                <has_text text="10088_SRR5572434|r"/>
                <has_text text="19705_SRR5572434|fr"/>
            </assert_contents>
        </output>
        <output name="problematic" ftype="tabular" file="test_01_problematic.tabular"/>
    </test>

    <!-- Test 2: altered cutoffs, reordering and heuristics enabled, truncated output, minlen 20. -->
    <test expect_num_outputs="9">
        <param name="i" value="metagenomics.fasta"/>
        <section name="sequence_section">
            <param name="character_code" value="f"/>
            <param name="E" value="0.0005"/>
            <param name="S" value="1"/>
            <param name="N" value="2"/>
            <param name="selection_priority" value="score"/>
            <param name="allow_single_domain" value="0.1"/>
            <param name="allow_reorder" value="true"/>
            <param name="complement" value="true"/>
            <param name="heuristics" value="true"/>
            <param name="nhmmer" value="false"/>
        </section>
        <section name="output_options">
            <param name="output_files" value="summary,fasta,positions,table,not_found"/>
            <param name="preserve" value="false"/>
            <param name="save_regions" value="ITS1,ITS2"/>
            <param name="only_output" value="false"/>
            <param name="partial" value="0"/>
            <param name="minlen" value="20"/>
            <param name="truncate" value="true"/>
        </section>
        <output name="not_detection_fasta" ftype="fasta" file="test_02_not_detected.fasta"/>
        <output name="not_detection_txt" ftype="txt" file="test_02_not_detected.txt"/>
        <output name="its2" ftype="fasta" file="test_02_its2.fasta"/>
        <!-- Up to four differing lines are tolerated in the summary. -->
        <output name="summary" ftype="txt" file="test_02_summary.txt" lines_diff="4"/>
        <output name="positions_found" ftype="tabular" file="test_02_found.tabular"/>
        <!-- Checked by content rather than against a reference file. -->
        <output name="positions_probable" ftype="tabular">
            <assert_contents>
                <has_text text="10088_SRR5572434|r"/>
                <has_text text="19705_SRR5572434|fr"/>
            </assert_contents>
        </output>
        <output name="problematic" ftype="tabular" file="test_02_problematic.tabular"/>
    </test>
</tests>
<!-- User-facing help, written in reStructuredText; text is kept at column 0 because indentation is significant there. -->
<help><![CDATA[

.. class:: infomark

**Purpose**

ITSx is an open source software utility to extract the highly variable ITS1 and ITS2 subregions from ITS sequences, which are commonly used as molecular barcodes for e.g. fungi. As the inclusion of parts of the neighbouring,
very conserved, ribosomal genes (SSU, 5.8S and LSU rRNA sequences) in the sequence identification process can lead to severely misleading results, ITSx identifies and extracts only the ITS regions themselves.

ITSx accepts input in the FASTA format. As it pre-processes the input sequences, it is possible to input both aligned and unaligned FASTA files, containing both DNA and RNA sequences.

----

.. class:: infomark

**Algorithm and implementation**

The main design goal for ITSx is to achieve fast and accurate extraction of ITS regions in large data sets, without introducing a large number of false positives. To be able to reach a high speed, ITSx relies on
the HMMER3 software, which allows for extremely fast comparisons of HMM-profiles to a sequence set. To achieve high detection accuracy, ITSx uses multiple HMM-profiles built from the conserved domains flanking
the ITS regions (SSU, 5.8S and LSU), representing a large number of species groups. This enabled ITSx to extract ITS regions from all eukaryote lineages for which a substantial number of reference ITS sequences
were available as of 2012. The 2017 release of 1.1 saw the introduction of additional HMM profiles to cover even more lineages, particularly within the fungi.

While the default settings of ITSx should be usable in most situations, you should consider whether they are suitable for your purposes and for your data set. If the data set is small, this can be done by running the
software multiple times on the data, with different settings, and analysing the outcome. On larger data sets, it might be more feasible to only run ITSx on a subset of the sequences for testing. The graphical output
is very useful for determining whether ITSx performs as desired on the data, as the positions of the found conserved domains can be easily investigated. If domains are missing, the criteria might be set to be too
stringent. If they are not in sequential order (from SSU to LSU with the 5.8S in between), that might be an indication that there is something wrong with the input sequences.

----

.. class:: infomark

**Notes on PCR primers and short input sequences**

One obvious use of ITSx would be to run it on sequence datasets generated by PCR amplicon studies of the ITS region, to extract ITS1 and/or ITS2 regions, and to sort
out non-target sequences in the data prior to further analysis. ITSx uses the conserved SSU, 5.8S and LSU genes to locate and orient the ITS regions. To do this, the
software requires at least ~20 bp. (of at least one) of those genes to be present for each input sequence – preferably 25 bp. However, some primer pairs targeting the ITS
region will not include sufficiently large portions of these genes to be detected with the accuracy required by ITSx by default. This may lead to fewer, or even none, of
the input sequences being recognized as ITS-containing.


If the dataset is known to contain only ITS sequences, a remedy to this problem can be to lower the stringency of ITSx, using the -E option. By default, this is set to 1e-5,
but this can be increased to say 0.01, or even 1 to allow for detection of shorter portions of the conserved genes – down to some 15 bases. This feature comes at the
price of an increased proportion of false-positive matches, but in the case of ITS-only datasets this will be less of a concern. Note that this should normally be done only for
datasets that are known to contain only ITS sequences, and that caution needs to be taken in the downstream analysis so that false-positive extractions can be avoided (e.g.
investigate spuriously long or short ITS sequences with a sound degree of scepticism). Conversely, going for very stringent settings may come at the price of sensitivity as
sequences with deviant genes (or of reduced read quality) may be missed. The default settings of the software are calibrated with environmental datasets in mind, to keep
false-positive extractions at a minimum.

If the problem instead is that only one of those genes is present on the input sequence, another, even more polished, solution exists. In order to score a sequence as an ITS
sequence, ITSx prefers to see that the sequence produces matches to at least two HMMs (such as 3’ SSU and 5’ 5.8S). This helps ITSx to keep the number of false-positive
identifications low. However, the software also allows sequences that produce a match to only one HMM (such as 3’ SSU) to be scored as ITS sequences, provided
that the match is particularly stringent. The stringency of this parameter is controlled through the --allow_single_domain switch (see above). The default value is 1e-9. If
ITSx is used on very short sequences (e.g. 3’ SSU + 100 bp. ITS1), then only one HMM can be expected to match, and the E-value of that match will be compared to
the value of this parameter. Therefore, if the dataset at hand contains sequences which should only contain a single conserved region, using e.g. --allow_single_domain
"1e-5,0" can be enough to go from no matches to all matches. However, as with the -E option above, increasing the E-value of this parameter comes at the expense of
higher risk for false positives.

]]></help>
352 <expand macro="citations" /> | |
353 </tool> |