comparison sortmerna_wrapper.xml @ 0:2e7f0da431e3 draft default tip

Uploaded version 1.0
author bonsai
date Tue, 30 Apr 2013 13:12:35 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2e7f0da431e3
1 <?xml version="1.0" encoding="utf-8"?>
2 <tool id="sortmerna_wrapper" version="1.0" name="Filter with SortMeRNA">
3 <requirements> <!-- Declares the sortmerna 1.7 package dependency of this tool -->
4 <requirement version="1.7" type="package">sortmerna</requirement>
5 </requirements>
6 <description>Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data</description>
7 <command interpreter="python">
8 sortmerna_wrapper.py
9 --sortmerna "
10 $strand_search
11 #if str( $read_family.read_family_selector ) == 'other':
12 --I $input_reads -r $read_family.ratio_parameter
13 #else:
14 $read_family.read_family_selector $input_reads
15 #end if
16 #if str( $sequencing_type.sequencing_type_selector ) == 'paired':
17 $sequencing_type.paired_type
18 #end if
19 ## Request only the read files the user ticked; the fixed names accept_file / other_file are what the outputs section looks for (with a .dat suffix) in the work dir
20 #if $outputs_selected:
21 #if 'accept' in $outputs_selected.value:
22 --accept accept_file
23 #end if
24 #if 'other' in $outputs_selected.value:
25 --other other_file
26 #end if
27 #end if
28 $log
29 #if str( $options.options_type_selector ) == 'more':
30 -a $options.number_of_threads
31 #end if
32 "
33 #if str( $databases_type.databases_selector ) == 'history':
34 --buildtrie
35 #for $db in $databases_type.input_databases
36 $db.database_name
37 #end for
38 #else:
39 ## The selected values are not paths: build an {x[0]: x[2]} lookup from the data table rows and emit the x[2] entry of every selected value, space-separated (presumably x[0] is the option value and x[2] the path column of the LOC file; confirm)
40 ${' '.join([dict([(x[0], x[2]) for x in $databases_type.input_databases.input.options.tool_data_table.data])[y]
41 for y in $databases_type.input_databases.value])}
42 #end if
43 </command>
44 <inputs> <!-- Form parameters, read by name in the command template. read_family selects the platform flag or, for "other", a user-supplied ratio. -->
45 <conditional name="read_family">
46 <param name="read_family_selector" type="select" format="text"
47 help="The Illumina platform is more common for large scale metatranscriptomic projects requiring a high throughput.">
48 <label>Sequencing technology of querying sequences (reads)</label>
49 <option value="--I">Illumina Solexa</option>
50 <option value="--454">454 Roche</option>
51 <option value="other">Other</option>
52 </param>
53 <when value="other">
54 <param name="ratio_parameter" type="float" value="1" min="0" max="1"
55 label="Ratio parameter (the number of hits on the read / read length)"
56 help="The ratio parameter for SortMeRNA has been set to r=0.25 for Illumina Solexa reads and to r=0.15 for 454 Roche reads.
57 For other read types, if the sequencing technology produces high quality reads with a low substitution error rate
58 (0.1 substitutions per 100 bases, such as Illumina), then the ratio parameter can be set to r=[0.23,0.27].
59 If the sequencing technology has a high indel error rate (1-2 indels per 100 bases, such as 454 or Ion Torrent),
60 then the ratio parameter can be set to r=[0.13,0.17]."/>
61 </when>
62 </conditional>
63 <param format="fasta,fastq" name="input_reads" type="data" label="Querying sequences (reads)" help=""/>
64 <!-- Pairing: when "paired" is chosen, the selected paired_type value is appended to the sortmerna options -->
65 <conditional name="sequencing_type">
66 <param name="sequencing_type_selector" type="select" label="Sequencing type">
67 <option value="not_paired">Reads are not paired</option>
68 <option value="paired">Reads are paired</option>
69 </param>
70 <when value="paired">
71 <param name="paired_type" type="select" label="If one read of a pair is accepted and the other not, output both reads" display="radio"
72 help="SortMeRNA does not use the pairing information for filtering RNA,
73 however if one read of a pair is accepted and the other is not,
74 the resulting output may break apart the pair into two separate files.
75 The purpose of 'Reads are paired' option is to preserve the pairing of the reads.">
76 <option value="--paired-in">to accepted file</option>
77 <option value="--paired-out">to rejected file</option>
78 </param>
79 </when>
80 </conditional>
81 <!-- Strand restriction: the option value is inserted verbatim into the sortmerna options; the empty value adds nothing -->
82 <param name="strand_search" type="select" label="Which strands to search" display="radio">
83 <option value="">Search both strands</option>
84 <option value="-F">Search only the forward strand</option>
85 <option value="-R">Search only the reverse-complementary strand</option>
86 </param>
87 <!-- Database source: entries of the rRNA_databases data table, or FASTA datasets from the history (the command template adds the buildtrie flag for those) -->
88 <conditional name="databases_type">
89 <param name="databases_selector" type="select" label="Databases to query"
90 help="Public rRNA databases provided with SortMeRNA have been indexed.
91 On the contrary, personal databases must be indexed each time SortMeRNA is launched.
92 Please be patient, this may take some time depending on the size of the given database.">
93 <option value="cached" selected="true">Public ribosomal databases</option>
94 <option value="history">Databases from your history</option>
95 </param>
96 <when value="cached">
97 <param name="input_databases" label="rRNA database"
98 type="select" display="checkboxes" multiple="true">
99 <options from_data_table="rRNA_databases" />
100 <validator type="no_options" message="Select at least one database"/>
101 </param>
102 </when>
103 <when value="history">
104 <repeat name="input_databases" title="Database" min="1">
105 <param name="database_name" type="data" format="fasta" label="rRNA database"
106 help="Your database will be indexed first, which may take up to several minutes."/>
107 </repeat>
108 </when>
109 </conditional>
110
111 <!-- Outputs: which read sets to write, plus the optional statistics file -->
112 <param name="outputs_selected" type="select" display="checkboxes" multiple="true" label="Output options">
113 <option value="accept" selected="True">Reads matching to at least one database</option>
114 <option value="other">Reads not found in any database</option>
115 </param>
116 <param name="log" type="boolean" checked="False" truevalue="--log log_file" falsevalue="" label="Statistics file"
117 help="Generates statistics for the rRNA content of reads, as well as rRNA subunit distribution.">
118 </param>
119
120 <!-- Advanced options: "more" exposes the thread count; "less" adds nothing to the command -->
121 <conditional name="options">
122 <param name="options_type_selector" type="select" label="Advanced Options">
123 <option value="less" selected="True">Less options</option>
124 <option value="more">More options</option>
125 </param>
126 <when value="less">
127 <!-- no options -->
128 </when>
129 <when value="more">
130 <param name="number_of_threads" type="integer" label="Number of threads to use" value="1" min="1"/>
131 </when>
132 </conditional>
133 </inputs>
134 <outputs> <!-- Each dataset is taken from a fixed file name in the job work dir and is gated by its filter expression -->
135 <data name="output_accept" format="input" format_source="input_reads"
136 from_work_dir="accept_file.dat" label="Matching reads on ${on_string} (${input_reads.datatype.file_ext})">
137 <filter>outputs_selected and 'accept' in outputs_selected</filter>
138 </data>
139 <data name="output_other" format="input" format_source="input_reads"
140 from_work_dir="other_file.dat" label="Reads not found on ${on_string} (${input_reads.datatype.file_ext})">
141 <filter>outputs_selected and 'other' in outputs_selected</filter>
142 </data>
143 <data name="output_log" format="txt" from_work_dir="log_file.log" label="${tool.name} statistics (txt)">
144 <filter>log</filter>
145 </data>
146 </outputs>
147 <stdio> <!-- Fatal error when either message appears on stdout or stderr; descriptions are shown as written, so they must not contain template placeholders -->
148 <regex match="This program builds a Burst trie on an input rRNA database"
149 source="both"
150 level="fatal"
151 description="Buildtrie program failed to execute." />
152 <regex match="The database name"
153 source="both"
154 level="fatal"
155 description="A selected database has not been preprocessed using buildtrie before using SortMeRNA." />
156 </stdio>
157 <tests> <!-- Unpaired Illumina run against two cached databases, requesting both read outputs and no statistics file -->
158 <test> <!-- select values must equal an option value declared under inputs; the previous value "I" matched none of them -->
159 <param name="read_family_selector" value="--I" />
160 <param name="input_reads" value="sortmerna_wrapper_in1.fastq" />
161 <param name="sequencing_type_selector" value="not_paired" />
162 <param name="strand_search" value="" />
163 <param name="databases_selector" value="cached" />
164 <param name="input_databases" value="rfam-5.8s,rfam-5s" />
165 <param name="outputs_selected" value="accept,other" />
166 <param name="log" value="" />
167 <param name="options_type_selector" value="less" />
168 <output name="output_accept" file="sortmerna_wrapper_accept1.fastq" />
169 <output name="output_other" file="sortmerna_wrapper_other1.fastq" />
170 </test>
171 </tests>
172 <help>
173 **Overview**
174
175 SortMeRNA_ is software designed to rapidly filter ribosomal RNA fragments
176 from metatranscriptomic data produced by next-generation sequencers.
177 It is capable of handling large RNA databases and sorting out all fragments
178 matching to the database with high accuracy and specificity.
179
180 .. _SortMeRNA: http://bioinfo.lifl.fr/RNA/sortmerna/
181
182 If you use this tool, please cite Kopylova E., Noé L. and Touzet H.,
183 `"SortMeRNA: Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data"`__,
184 Bioinformatics (2012), doi: 10.1093/bioinformatics/bts611.
185
186 .. __: http://bioinformatics.oxfordjournals.org/content/28/24/3211
187
188 ------
189
190 **Input**
191
192 The input is one file of reads in FASTA or FASTQ format and any number of rRNA databases to search against.
193 If the user has two forward-reverse paired-sequencing reads files, they may use
194 the script "merge_paired_reads.sh" to interleave the reads into one file, preserving their order.
195
196 If the sequencing type for the reads is paired-ended, the user has two options under
197 "Sequencing type" to filter the reads and preserve their order in the file.
198 For a further example of each option, please refer to Section 4.2.3 in the `SortMeRNA User Manual`_.
199
200 .. _sortmerna user manual: http://bioinfo.lifl.fr/RNA/sortmerna/code/SortMeRNA-user-manual-v1.7.pdf
201
202 ------
203
204 **Output**
205
206 The output will follow the same format (FASTA or FASTQ) as the reads.
207
208 In the standalone version of SortMeRNA, the user may output the matching reads in a separate file per database (--bydbs option). This option will be made available in a future version of Galaxy.
209
210 ------
211
212 **rRNA databases**
213
214 SortMeRNA is distributed with 8 representative rRNA databases, which were
215 all constructed from the SILVA SSU,LSU (version 111) and the RFAM 5/5.8S
216 (version 11.0) databases using the tool UCLUST.
217
218 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
219 | Representative database  | id % | average id % | # seq | Origin                 | # seq  | filtered to remove |
220 +==========================+======+==============+=======+========================+========+====================+
221 | SILVA 16S bacteria       | 85   | 91.6         | 8174  | SILVA SSU Ref NR v.111 | 244077 | 23s                |
222 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
223 | SILVA 16S archaea        | 95   | 96.7         | 3845  | SILVA SSU Ref NR v.111 | 10919  | 23s                |
224 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
225 | SILVA 18S eukarya        | 95   | 96.7         | 4512  | SILVA SSU Ref NR v.111 | 31862  | 26s,28s,23s        |
226 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
227 |                                                                                                               |
228 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
229 | SILVA 23S bacteria       | 98   | 99.4         | 3055  | SILVA LSU Ref v.111    | 19580  | 16s,26s,28s        |
230 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
231 | SILVA 23S archaea        | 98   | 99.5         | 164   | SILVA LSU Ref v.111    | 405    | 16s,26s,28s        |
232 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
233 | SILVA 28S eukarya        | 98   | 99.1         | 4578  | SILVA LSU Ref v.111    | 9321   | 18s                |
234 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
235 |                                                                                                               |
236 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
237 | Rfam 5S archaea/bacteria | 98   | 99.2         | 59513 | RFAM                   | 116760 |                    |
238 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
239 | Rfam 5.8S eukarya        | 98   | 98.9         | 13034 | RFAM                   | 225185 |                    |
240 +--------------------------+------+--------------+-------+------------------------+--------+--------------------+
241
242
243 id % :
244 members of the cluster must have at least 'id %' identity with the representative sequence
245
246 average id % :
247 average identity of a cluster member to the representative sequence
248
249 The user may also choose to use their own rRNA databases.
250
251 .. class:: warningmark
252
253 Note that your personal databases are indexed each time, and that
254 this may take some time depending on the size of the given database.
255
256 ------
257
258 **SortMeRNA parameter list**
259
260 The standalone, command-line version of SortMeRNA uses the following parameters.
261
262 For indexing (buildtrie):
263
264 This program builds a Burst trie on an input rRNA database file in fasta format
265 and stores the material in binary files under the folder '/automata'::
266
267 ./buildtrie --db [path to rrnas database file name {.fasta}] {OPTIONS}
268
269 The list of OPTIONS can be left blank, the default values will be used::
270
271 -L length of the sliding window (the seed)
272 (default: 18)
273
274 -F search only the forward strand
275 -R search only the reverse-complementary strand
276 (default: both strands are searched)
277
278 -h help
279
280
281
282
283 For sorting (sortmerna):
284
285 To run SortMeRNA, type in any order after 'sortmerna'::
286
287 --I [illumina reads file name {fasta/fastq}]
288
289 --454 [roche 454 reads file name {fasta/fastq}]
290
291 -n number of databases to use (must precede --db)
292
293 --db [rrnas database name(s)]
294
295 One database,
296 ex 1. -n 1 --db /path1/database1.fasta
297
298 Multiple databases,
299 ex 2. -n 2 --db /path2/database2.fasta /path3/database3.fasta
300
301 {OPTIONS}
302
303 The list of OPTIONS can be left blank, the default values will be used::
304
305 --accept [accepted reads file name]
306 --other [rejected reads file name]
307 (default: no output file is created)
308
309 --bydbs output the accepted reads by database
310 (default: concatenated file of reads)
311
312 --log [overall statistics file name]
313 (default: no statistics file created)
314
315 --paired-in put both paired-end reads into --accept file
316 --paired-out put both paired-end reads into --other file
317 (default: if one read is accepted and the other is not,
318 separate the reads into --accept and --other files)
319
320 -r ratio of the number of hits on the read / read length
321 (default Illumina: 0.25, Roche 454: 0.15)
322
323 -F search only the forward strand
324 -R search only the reverse-complementary strand
325 (default: both strands are searched)
326
327 -a number of threads to use
328 (default: 1)
329
330 -m (m x 4096 bytes) for loading the reads into memory
331 ex. '-m 4' means 4*4096 = 16384 bytes will be allocated for the reads
332 note: maximum -m is 1020039
333 (default: m = 262144 = 1GB)
334
335 -v verbose
336 (default: deactivated)
337
338 -h help
339
340 --version version number
341
342 ------
343
344 **Bibliography**
345
346 [1] Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J, Glöckner FO (2013) The SILVA ribosomal RNA gene database project: improved data processing and web-based tools, Nucleic Acids Research, 41 (D1): D590-D596.
347
348 [2] Rfam 11.0: 10 years of RNA families. S.W. Burge, J. Daub, R. Eberhardt, J. Tate, L. Barquist, E.P. Nawrocki, S.R. Eddy, P.P. Gardner, A. Bateman. Nucleic Acids Research (2012), doi: 10.1093/nar/gks1005
349
350 [3] Edgar, R.C. (2010) Search and clustering orders of magnitude faster than BLAST, Bioinformatics 26(19), 2460-2461, doi: 10.1093/bioinformatics/btq461
351
352 [4] Loman, N. J. and Misra, Raju V and Dallman, Timothy J and Constantinidou, Chrystala and Gharbia, Saheer E and Wain, John and Pallen, Mark J., Performance comparison of benchtop high-throughput sequencing platforms (2012), Nature Biotechnology, 30 (5). pp. 434-439
353 </help>
354 </tool>