comparison bbduk.xml @ 0:204a131e47db draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bbtools commit ae31a678eb5c04fb74b94161db95705d597990ad"
author iuc
date Thu, 11 Nov 2021 16:38:32 +0000
parents
children 60dd895841cd
comparison
equal deleted inserted replaced
-1:000000000000 0:204a131e47db
1 <tool id="bbtools_bbduk" name="BBTools: BBduk" version="@WRAPPER_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>decontamination using kmers</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements"/>
7 <command detect_errors="exit_code"><![CDATA[
8 #import os
9 #import re
10
11 #if str($input_type_cond.input_type) in ['single', 'pair']:
12 #set read1 = $input_type_cond.read1
13 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
14 ## bbduk uses the file extension to determine the input format; ext holds only the suffix appended to each sanitized identifier.
15 #set ext = '.fastq'
16 #if $read1.ext.endswith('.gz'):
17 #set ext = $ext + '.gz'
18 #end if
19 #set read1_file = $read1_identifier + $ext
20 ln -s '${read1}' '${read1_file}' &&
21 #if str($input_type_cond.input_type) == 'pair':
22 #set read2 = $input_type_cond.read2
23 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
24 #set read2_file = $read2_identifier + $ext
25 ln -s '${read2}' '${read2_file}' &&
26 #end if
27 #else:
28 #set read1 = $input_type_cond.reads_collection['forward']
29 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
30 ## bbduk uses the file extension to determine the input format; ext holds only the suffix appended to each sanitized identifier.
31 #set ext = '.fastq'
32 #if $read1.ext.endswith('.gz'):
33 #set ext = $ext + '.gz'
34 #end if
35 #set read1_file = $read1_identifier + $ext
36 ln -s '${read1}' '${read1_file}' &&
37 #set read2 = $input_type_cond.reads_collection['reverse']
38 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
39 #set read2_file = $read2_identifier + $ext
40 ln -s '${read2}' '${read2_file}' &&
41 #end if
42
43 #if str($reference_type_cond.reference_type) == 'files':
44 #set refs = list()
45 #for ref in $reference_type_cond.reference:
46 ## bbduk looks at the file extension, so each reference is staged under a name ending in .fa (gzipped inputs are decompressed).
47 #set ref_name = str($os.path.basename($ref.file_name)) + '.fa'
48 #if $ref.ext.endswith('.gz'):
49 gunzip -c '$ref' > '$ref_name' &&
50 #else:
51 ln -s '$ref' '$ref_name' &&
52 #end if
53 #silent $refs.append(str($ref_name))
54 #end for
55 #set refs = ','.join($refs)
56 #else if str($reference_type_cond.reference_type) == 'keywords':
57 #set refs = str($reference_type_cond.reference)
58 #end if
59
60 bbduk.sh
61 in='${read1_file}'
62 ## The second input and the per-mate outputs are only passed for paired data.
63 #if str($input_type_cond.input_type) in ['pair', 'paired']:
64 in2='${read2_file}'
65 #end if
66 #if 'outu' in str($outputs_select):
67 out='${outputu}'
68 #if str($input_type_cond.input_type) in ['pair', 'paired']:
69 out2='${outputu2}'
70 #end if
71 #end if
72 #if 'outm' in str($outputs_select):
73 outm='${outputm}'
74 #if str($input_type_cond.input_type) in ['pair', 'paired']:
75 outm2='${outputm2}'
76 #end if
77 #end if
78 #if 'outs' in str($outputs_select):
79 outs='${outputs}'
80 #end if
81 ## The reference (and optional kmer trimming) is only passed when a reference type was chosen.
82 #if str($reference_type_cond.reference_type) != 'no_reference':
83 ref='$refs'
84 #if str($reference_type_cond.ktrim_cond.ktrim_select) == 'yes':
85 ktrim='${reference_type_cond.ktrim_cond.ktrim}'
86 minlength=$reference_type_cond.ktrim_cond.minlength
87 #end if
88 #end if
89 ## Kmer matching settings taken from the advanced_options section.
90 k=$advanced_options.k
91 rcomp='${advanced_options.rcomp}'
92 maskmiddle='${advanced_options.maskmiddle}'
93 minkmerhits='${advanced_options.minkmerhits}'
94 minkmerfraction=$advanced_options.minkmerfraction
95 mincovfraction=$advanced_options.mincovfraction
96 hammingdistance=$advanced_options.hammingdistance
97 qhdist=$advanced_options.qhdist
98 editdistance=$advanced_options.editdistance
99 forbidn='${advanced_options.forbidn}'
100 trimfailures='${advanced_options.trimfailures}'
101 findbestmatch='${advanced_options.findbestmatch}'
102 skipr1='${advanced_options.skipr1}'
103 skipr2='${advanced_options.skipr2}'
104 ## Statistics and histogram files are requested one by one, only for the selected entries.
105 #if str($output_stats_cond.output_stats) == 'yes':
106 #if 'stats' in str($output_stats_cond.output_stats_select):
107 stats='${output_stats}'
108 #end if
109 #if 'ref' in str($output_stats_cond.output_stats_select):
110 refstats='${output_ref}'
111 #end if
112 #if 'rpkm' in str($output_stats_cond.output_stats_select):
113 rpkm='${output_rpkm}'
114 #end if
115 #if 'dump' in str($output_stats_cond.output_stats_select):
116 dump='${output_dump}'
117 #end if
118 #end if
119 #if str($output_hists_cond.output_hists) == 'yes':
120 #if 'bhist' in str($output_hists_cond.output_hists_select):
121 bhist='${output_bhist}'
122 #end if
123 #if 'quhist' in str($output_hists_cond.output_hists_select):
124 qhist='${output_quhist}'
125 #end if
126 #if 'quchist' in str($output_hists_cond.output_hists_select):
127 qchist='${output_quchist}'
128 #end if
129 #if 'aqhist' in str($output_hists_cond.output_hists_select):
130 aqhist='${output_aqhist}'
131 #end if
132 #if 'bqhist' in str($output_hists_cond.output_hists_select):
133 bqhist='${output_bqhist}'
134 #end if
135 #if 'lhist' in str($output_hists_cond.output_hists_select):
136 lhist='${output_lhist}'
137 #end if
138 #if 'phist' in str($output_hists_cond.output_hists_select):
139 phist='${output_phist}'
140 #end if
141 #if 'gchist' in str($output_hists_cond.output_hists_select):
142 gchist='${output_gchist}'
143 #end if
144 #if 'enthist' in str($output_hists_cond.output_hists_select):
145 enthist='${output_enthist}'
146 #end if
147 #end if
148 t=\${GALAXY_SLOTS:-4}
149 ]]></command>
150 <inputs>
151 <expand macro="input_type_cond"/>
152 <conditional name="reference_type_cond">
153 <param name="reference_type" type="select" label="Choose the reference type" help="Optional, no reference is the default">
154 <option value="no_reference" selected="true">No reference</option>
155 <option value="files">files</option>
156 <option value="keywords">keywords</option>
157 </param>
158 <when value="no_reference"/>
159 <when value="files">
160 <param name="reference" type="data" format="fasta,fasta.gz" multiple="true" optional="false" label="Select one or more fasta files"/>
161 <expand macro="ktrim_cond"/>
162 </when>
163 <when value="keywords">
164 <param name="reference" type="select" multiple="true" optional="false" label="Select one or more keywords">
165 <option value="adapters">adapters</option>
166 <option value="artifacts">artifacts</option>
167 <option value="phix">phix</option>
168 <option value="lambda">lambda</option>
169 <option value="pjet">pjet</option>
170 <option value="mtst">mtst</option>
171 <option value="kapa">kapa</option>
172 </param>
173 <expand macro="ktrim_cond"/>
174 </when>
175 </conditional>
176 <section name="advanced_options" title="Advanced options" expanded="false">
177 <param argument="k" type="integer" value="27" min="1" label="Kmer length used for finding contaminants"/>
178 <param argument="rcomp" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Look for reverse-complements of kmers in addition to forward kmers?"/>
179 <param argument="maskmiddle" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Treat the middle base of a kmer as a wildcard to increase sensitivity in the presence of errors?"/>
180 <param argument="minkmerhits" type="integer" value="1" min="1" label="Reads need at least this many matching kmers to be considered as matching the reference"/>
181 <param argument="minkmerfraction" type="float" value="0" min="0" max="1" label="A read needs at least this fraction of its total kmers to hit a ref in order to be considered a match"/>
182 <param argument="mincovfraction" type="float" value="0" min="0" max="1" label="A read needs at least this fraction of its total bases to be covered by ref kmers to be considered a match"/>
183 <param argument="hammingdistance" type="integer" value="0" min="0" label="Maximum Hamming distance for ref kmers (subs only)"/>
184 <param argument="qhdist" type="integer" value="0" min="0" label="Hamming distance for query kmers"/>
185 <param argument="editdistance" type="integer" value="0" min="0" label="Maximum edit distance from ref kmers (subs and indels)"/>
186 <param argument="forbidn" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Do not match kmers containing N?"/>
187 <param argument="trimfailures" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Trim failed reads to 1bp instead of discarding them?"/>
188 <param argument="findbestmatch" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Associate read with sequence sharing most kmers if multiple matches?"/>
189 <param argument="skipr1" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Don't do kmer-based operations on read 1?"/>
190 <param argument="skipr2" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Don't do kmer-based operations on read 2?"/>
191 </section>
192 <param name="outputs_select" type="select" multiple="true" optional="false" label="Specify outputs">
193 <option value="outu">Unmatched</option>
194 <option value="outm">Matched</option>
195 <option value="outs">Single</option>
196 </param>
197 <conditional name="output_stats_cond">
198 <param name="output_stats" type="select" label="Output statistics?">
199 <option value="no" selected="true">No</option>
200 <option value="yes">Yes</option>
201 </param>
202 <when value="no"/>
203 <when value="yes">
204 <param name="output_stats_select" type="select" multiple="true" optional="false" label="Specify statistics outputs">
205 <option value="stats">Statistics about which contaminants were detected</option>
206 <option value="ref">Statistics on a per-reference-file basis</option>
207 <option value="rpkm">RPKM for each reference sequence (for RNA-seq)</option>
208 <option value="dump">kmer tables in fasta format</option>
209 </param>
210 </when>
211 </conditional>
212 <conditional name="output_hists_cond">
213 <param name="output_hists" type="select" label="Output histograms?">
214 <option value="no" selected="true">No</option>
215 <option value="yes">Yes</option>
216 </param>
217 <when value="no"/>
218 <when value="yes">
219 <param name="output_hists_select" type="select" multiple="true" optional="false" label="Specify histogram outputs">
220 <option value="bhist">Base composition histogram by position</option>
221 <option value="quhist">Quality histogram by position</option>
222 <option value="quchist">Count of bases with each quality value</option>
223 <option value="aqhist">Histogram of average read quality</option>
224 <option value="bqhist">Quality histogram designed for box plots</option>
225 <option value="lhist">Read length histogram</option>
226 <option value="phist">Polymer length histogram</option>
227 <option value="gchist">Read GC content histogram</option>
228 <option value="enthist">Read entropy histogram</option>
229 </param>
230 </when>
231 </conditional>
232 </inputs>
233 <outputs>
234 <data name="outputu" format="fastqsanger" label="${tool.name} on ${on_string} (Unmatched)">
236 <filter>'outu' in outputs_select</filter>
237 </data>
238 <data name="outputu2" format="fastqsanger" label="${tool.name} on ${on_string} (Unmatched)">
239 <filter>'outu' in outputs_select and input_type_cond['input_type'] != 'single'</filter>
240 </data>
241 <data name="outputm" format="fastqsanger" label="${tool.name} on ${on_string} (Matched)">
242 <filter>'outm' in outputs_select</filter>
243 </data>
244 <data name="outputm2" format="fastqsanger" label="${tool.name} on ${on_string} (Matched)">
245 <filter>'outm' in outputs_select and input_type_cond['input_type'] != 'single'</filter>
246 </data>
247 <data name="outputs" format="fastqsanger" label="${tool.name} on ${on_string} (Single)">
248 <filter>'outs' in outputs_select</filter>
249 </data>
250 <data name="output_stats" format="tabular" label="${tool.name} on ${on_string} (Detected contaminants stats)">
251 <filter>output_stats_cond['output_stats'] == 'yes' and 'stats' in output_stats_cond['output_stats_select']</filter>
252 </data>
253 <data name="output_ref" format="tabular" label="${tool.name} on ${on_string} (Per reference file stats)">
254 <filter>output_stats_cond['output_stats'] == 'yes' and 'ref' in output_stats_cond['output_stats_select']</filter>
255 </data>
256 <data name="output_rpkm" format="tabular" label="${tool.name} on ${on_string} (RPKM for each ref seq)">
257 <filter>output_stats_cond['output_stats'] == 'yes' and 'rpkm' in output_stats_cond['output_stats_select']</filter>
258 </data>
259 <data name="output_dump" format="fasta" label="${tool.name} on ${on_string} (kmer tables)">
260 <filter>output_stats_cond['output_stats'] == 'yes' and 'dump' in output_stats_cond['output_stats_select']</filter>
261 </data>
262 <data name="output_bhist" format="tabular" label="${tool.name} on ${on_string} (Base composition by position)">
263 <filter>output_hists_cond['output_hists'] == 'yes' and 'bhist' in output_hists_cond['output_hists_select']</filter>
264 </data>
265 <data name="output_quhist" format="tabular" label="${tool.name} on ${on_string} (Quality by position)">
266 <filter>output_hists_cond['output_hists'] == 'yes' and 'quhist' in output_hists_cond['output_hists_select']</filter>
267 </data>
268 <data name="output_quchist" format="tabular" label="${tool.name} on ${on_string} (Bases w/ each quality value)">
269 <filter>output_hists_cond['output_hists'] == 'yes' and 'quchist' in output_hists_cond['output_hists_select']</filter>
270 </data>
271 <data name="output_aqhist" format="tabular" label="${tool.name} on ${on_string} (average read quality)">
272 <filter>output_hists_cond['output_hists'] == 'yes' and 'aqhist' in output_hists_cond['output_hists_select']</filter>
273 </data>
274 <data name="output_bqhist" format="tabular" label="${tool.name} on ${on_string} (Quality for box plots)">
275 <filter>output_hists_cond['output_hists'] == 'yes' and 'bqhist' in output_hists_cond['output_hists_select']</filter>
276 </data>
277 <data name="output_lhist" format="tabular" label="${tool.name} on ${on_string} (Read length)">
278 <filter>output_hists_cond['output_hists'] == 'yes' and 'lhist' in output_hists_cond['output_hists_select']</filter>
279 </data>
280 <data name="output_phist" format="tabular" label="${tool.name} on ${on_string} (Polymer length)">
281 <filter>output_hists_cond['output_hists'] == 'yes' and 'phist' in output_hists_cond['output_hists_select']</filter>
282 </data>
283 <data name="output_gchist" format="tabular" label="${tool.name} on ${on_string} (Read GC content)">
284 <filter>output_hists_cond['output_hists'] == 'yes' and 'gchist' in output_hists_cond['output_hists_select']</filter>
285 </data>
286 <data name="output_enthist" format="tabular" label="${tool.name} on ${on_string} (Read entropy)">
287 <filter>output_hists_cond['output_hists'] == 'yes' and 'enthist' in output_hists_cond['output_hists_select']</filter>
288 </data>
289 </outputs>
290 <tests>
291 <!-- Single read, gzipped fasta reference from the history -->
292 <test expect_num_outputs="1">
293 <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz"/>
294 <param name="reference" value="adapters.fa.gz" ftype="fasta.gz"/>
295 <param name="reference_type" value="files"/>
296 <param name="outputs_select" value="outu"/>
297 <output name="outputu" file="bduk_outputu1.fastqsanger" ftype="fastqsanger" compare="contains"/>
298 </test>
299 <!-- Paired reads in separate datasets, plus one statistics and one histogram output -->
300 <test expect_num_outputs="4">
301 <param name="input_type" value="pair"/>
302 <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz"/>
303 <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz"/>
304 <param name="reference_type" value="files"/>
305 <param name="reference" value="adapters.fa.gz" ftype="fasta.gz"/>
306 <param name="outputs_select" value="outu"/>
307 <param name="output_stats" value="yes"/>
308 <param name="output_stats_select" value="dump"/>
309 <param name="output_hists" value="yes"/>
310 <param name="output_hists_select" value="quhist"/>
311 <output name="outputu" file="bduk_outputu1.fastqsanger" ftype="fastqsanger" compare="contains"/>
312 <output name="outputu2" file="bduk_outputu2.fastqsanger" ftype="fastqsanger" compare="contains"/>
313 <output name="output_dump" file="bduk_output_dump1.fasta" ftype="fasta" compare="contains"/>
314 <output name="output_quhist" file="bduk_output_quhist1.tabular" ftype="tabular"/>
315 </test>
316 <!-- Collection of paired reads, keyword references -->
317 <test expect_num_outputs="2">
318 <param name="input_type" value="paired"/>
319 <param name="reads_collection">
320 <collection type="paired">
321 <element name="forward" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz"/>
322 <element name="reverse" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz"/>
323 </collection>
324 </param>
325 <param name="reference_type" value="keywords"/>
326 <param name="reference" value="adapters,artifacts,phix,lambda,pjet,mtst,kapa"/>
327 <param name="outputs_select" value="outu"/>
328 <output name="outputu" file="bduk_outputu1.fastqsanger" ftype="fastqsanger" compare="contains"/>
329 <output name="outputu2" file="bduk_outputu2.fastqsanger" ftype="fastqsanger" compare="contains"/>
330 </test>
331 </tests>
332 <help>
333 **What it does**
334
335 BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single
336 high-performance tool. It is capable of quality-trimming and filtering, adapter-trimming, contaminant-filtering via kmer
337 matching, sequence masking, GC-filtering, length filtering, entropy-filtering, format conversion, histogram generation,
338 subsampling, quality-score recalibration, kmer cardinality estimation, and various other operations in a single pass.
339 Specifically, any combination of operations is possible in a single pass with the exception of kmer-based operations (kmer
340 trimming, kmer masking, or kmer filtering). At most 1 kmer-based operation can be done in a single pass.
341
342 **Options**
343
344 * **Reference** - if a reference is specified, BBDuk will operate on kmers in one of 4 modes: right-trimming, left-trimming, masking, or filtering. The default is filtering - any read matching a reference kmer will be discarded.
345
346 * **Trim reads to remove bases matching reference kmers** - When trimming to the right, once a reference kmer is matched in a read, that kmer and all the bases to the right will be trimmed, leaving only the bases to the left. When trimming to the left, trimming will be done to the left instead.
347
348 **Outputs**
349
350 * **Unmatched** - All the reads that pass all filtering criteria. Reads will be at least as long as **Minimum read length** after any trimming operations and reads will not match any reference kmer if kmer-filtering is being performed. A read’s average quality will be at least as high as the specified **Minimum average quality**.
351 * **Matched** - Reads failing any filter criteria (such as matching a reference kmer). By default, if either read in a pair fails, both will be included in *Matched*.
352 * **Single** - Singleton reads whose mate was trimmed shorter than the value of **Minimum read length**.
353 </help>
354 <expand macro="citations"/>
355 </tool>
356