Mercurial > repos > frogs > frogs_2_0_0
comparison preprocess.xml @ 0:76c750c5f0d1 draft default tip
planemo upload for repository https://github.com/oinizan/FROGS-wrappers commit 0b900a51e220ce6f17c1e76292c06a5f4d934055-dirty
author | frogs |
---|---|
date | Thu, 25 Oct 2018 05:01:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:76c750c5f0d1 |
---|---|
1 <?xml version="1.0"?> | |
2 <!-- | |
3 # Copyright (C) 2015 INRA | |
4 # | |
5 # This program is free software: you can redistribute it and/or modify | |
6 # it under the terms of the GNU General Public License as published by | |
7 # the Free Software Foundation, either version 3 of the License, or | |
8 # (at your option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public License | |
16 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 --> | |
18 <tool id="FROGS_preprocess" name="FROGS Pre-process" version="2.0.1"> | |
19 <description>Step 1 in metagenomics analysis: denoising and dereplication.</description> | |
20 <requirements> <!-- Package dependency of the tool: frogs, version 2.0.1 (same number as the tool's version attribute). --> | |
21 <requirement type="package" version="2.0.1">frogs</requirement> | |
22 </requirements> | |
23 <stdio> <!-- Exit-code rules: together the two ranges below match every non-zero exit code. --> | |
24 <exit_code range="1:" /> <!-- 1 and above --> | |
25 <exit_code range=":-1" /> <!-- -1 and below --> | |
26 </stdio> | |
27 <command><![CDATA[ ## Builds the preprocess.py call. Dataset paths and free-text primer values are single-quoted so each one stays a single shell argument. | |
28 preprocess.py $sequencer_type.sequencer_selected | |
29 --output-dereplicated '$dereplicated_file' --output-count '$count_file' --summary '$summary_file' | |
30 --nb-cpus $nb_cpus | |
31 #if $sequencer_type.sequencer_selected == "illumina" | |
32 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size | |
33 #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard" | |
34 --five-prim-primer '$sequencer_type.sequencing_protocol.five_prim_primer' --three-prim-primer '$sequencer_type.sequencing_protocol.three_prim_primer' | |
35 #else | |
36 --without-primers | |
37 #end if | |
38 #else | |
39 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size | |
40 --five-prim-primer '$sequencer_type.five_prim_primer' --three-prim-primer '$sequencer_type.three_prim_primer' | |
41 #end if | |
42 | |
43 #if $sequencer_type.input_type.input_type_selected == "archive" | |
44 --input-archive '$sequencer_type.input_type.archive_file' | |
45 #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_contiged" | |
46 --already-contiged | |
47 #elif $sequencer_type.sequencer_selected == "illumina" | |
48 --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size | |
49 --expected-amplicon-size $sequencer_type.input_type.archive_type.expected_amplicon_size | |
50 --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate | |
51 #end if | |
52 #else | |
53 #set $sep = ' ' | |
54 #if $sequencer_type.sequencer_selected == "illumina" | |
55 --samples-names | |
56 #for $current in $sequencer_type.input_type.files_by_samples_type.samples | |
57 $sep"${current.name.strip()}" | |
58 #end for | |
59 --input-R1 | |
60 #for $current in $sequencer_type.input_type.files_by_samples_type.samples | |
61 $sep'${current.R1_file}' | |
62 #end for | |
63 #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_contiged" | |
64 --already-contiged | |
65 #else | |
66 --input-R2 | |
67 #for $current in $sequencer_type.input_type.files_by_samples_type.samples | |
68 $sep'${current.R2_file}' | |
69 #end for | |
70 --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size | |
71 --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size | |
72 --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate | |
73 #end if | |
74 #else | |
75 --input-R1 | |
76 #for $current in $sequencer_type.input_type.samples | |
77 $sep'${current.R1_file}' | |
78 #end for | |
79 --samples-names | |
80 #for $current in $sequencer_type.input_type.samples | |
81 $sep"${current.name.strip()}" | |
82 #end for | |
83 #end if | |
84 #end if | |
85 ]]></command> | |
86 <inputs> | |
87 <param name="nb_cpus" type="hidden" label="CPU number" help="The maximum number of CPUs used." value="1" /> | |
88 <conditional name="sequencer_type"> | |
89 <param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences."> | |
90 <option value="illumina" selected="true">Illumina</option> | |
91 <option value="454">454</option> | |
92 </param> | |
93 <when value="illumina"> <!-- Illumina options: sample inputs, amplicon size bounds and sequencing protocol. --> | |
94 <!-- Samples: either a single tar archive or individual FASTQ files, each as paired reads or as already merged reads --> | |
95 <conditional name="input_type"> | |
96 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample."> | |
97 <option value="files_by_samples" selected="true">Files by samples</option> | |
98 <option value="archive">Archive</option> | |
99 </param> | |
100 <when value="archive"> | |
101 <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" /> | |
102 <conditional name="archive_type"> | |
103 <param name="archive_type_selected" type="select" label="Reads already contiged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair."> | |
104 <option value="paired" selected="true">No</option> | |
105 <option value="already_contiged">Yes</option> | |
106 </param> | |
107 <when value="paired"> | |
108 <!-- Paired reads only: read sizes, expected amplicon size and mismatch rate --> | |
109 <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" /> | |
110 <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" /> | |
111 <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" /> | |
112 <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" /> | |
113 </when> | |
114 <when value="already_contiged"></when> <!-- no extra parameters for already merged reads --> | |
115 </conditional> | |
116 </when> | |
117 <when value="files_by_samples"> | |
118 <conditional name="files_by_samples_type"> | |
119 <param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair."> | |
120 <option value="paired" selected="true">No</option> | |
121 <option value="already_contiged">Yes</option> | |
122 </param> | |
123 <when value="paired"> | |
124 <!-- Samples: at least one entry, each with a non-empty name plus an R1 and an R2 FASTQ file --> | |
125 <repeat name="samples" title="Samples" min="1"> | |
126 <param name="name" type="text" label="Name" help="The sample name." optional="false"> | |
127 <validator type="empty_field" message="This parameter is required." /> | |
128 </param> | |
129 <param format="fastq" name="R1_file" type="data" label="Reads 1" help="R1 FASTQ file of paired-end reads." /> | |
130 <param format="fastq" name="R2_file" type="data" label="reads 2" help="R2 FASTQ file of paired-end reads." /> | |
131 </repeat> | |
132 <!-- Paired reads only: read sizes, expected amplicon size and mismatch rate --> | |
133 <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" /> | |
134 <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" /> | |
135 <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" /> | |
136 <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" /> | |
137 </when> | |
138 <when value="already_contiged"> | |
139 <repeat name="samples" title="Samples" min="1"> | |
140 <param name="name" type="text" label="Name" help="The sample name." optional="false"> | |
141 <validator type="empty_field" message="This parameter is required." /> | |
142 </param> | |
143 <param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of merged reads." /> | |
144 </repeat> | |
145 </when> | |
146 </conditional> | |
147 </when> | |
148 </conditional> | |
149 <!-- Amplicons: minimum and maximum amplicon size --> | |
150 <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons." value="" optional="false" /> | |
151 <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons." value="" optional="false" /> | |
152 <!-- Primers: both primers are asked for the standard protocol; the custom protocol takes none --> | |
153 <conditional name="sequencing_protocol"> | |
154 <param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers."> | |
155 <option value="standard" selected="true">Illumina standard</option> | |
156 <option value="without_primers">Custom protocol (Kozich et al. 2013)</option> | |
157 </param> | |
158 <when value="standard"> | |
159 <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> | |
160 <validator type="empty_field" message="This parameter is required." /> | |
161 </param> | |
162 <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> | |
163 <validator type="empty_field" message="This parameter is required." /> | |
164 </param> | |
165 </when> | |
166 <when value="without_primers"></when> <!-- no extra parameters: the command passes only the without-primers flag --> | |
167 </conditional> | |
168 </when> | |
169 | |
170 <when value="454"> <!-- 454 options: one FASTQ file per sample or a tar archive, amplicon size bounds and both primers. --> | |
171 <!-- Samples: either a single tar archive or one named FASTQ file per sample --> | |
172 <conditional name="input_type"> | |
173 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample."> | |
174 <option value="files_by_samples" selected="true">One file by sample</option> | |
175 <option value="archive">Archive</option> | |
176 </param> | |
177 <when value="archive"> | |
178 <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" /> | |
179 </when> | |
180 <when value="files_by_samples"> | |
181 <repeat name="samples" title="Samples" min="1"> | |
182 <param name="name" type="text" label="Name" help="The sample name." optional="false"><validator type="empty_field" message="This parameter is required." /></param> <!-- reject empty names, as the Illumina sample repeats do --> | |
183 <param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." /> | |
184 </repeat> | |
185 </when> | |
186 </conditional> | |
187 <!-- Amplicons: minimum and maximum amplicon size, primers included --> | |
188 <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" /> | |
189 <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" /> | |
190 <!-- Primers: both are always asked for 454 data --> | |
191 <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> | |
192 <validator type="empty_field" message="This parameter is required." /> | |
193 </param> | |
194 <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> | |
195 <validator type="empty_field" message="This parameter is required." /> | |
196 </param> | |
197 </when> | |
198 </conditional> | |
199 </inputs> | |
200 <outputs> <!-- Datasets produced by the tool: dereplicated sequences (FASTA), counts table (tabular) and HTML report. --> | |
201 <data name="dereplicated_file" format="fasta" from_work_dir="dereplicated.fasta" label="${tool.name}: dereplicated.fasta" /> | |
202 <data name="count_file" format="tabular" from_work_dir="count.tsv" label="${tool.name}: count.tsv" /> | |
203 <data name="summary_file" format="html" from_work_dir="report.html" label="${tool.name}: report.html" /> | |
204 </outputs> | |
205 <tests> | |
206 <test> <!-- Illumina paired reads supplied as an archive, standard protocol with both primers given. --> | |
207 <conditional name="sequencer_type"> | |
208 <param name="sequencer_selected" value="illumina" /> | |
209 <conditional name="input_type"> | |
210 <param name="input_type_selected" value="archive" /> | |
211 <param name="archive_file" value="test_dataset.tar.gz" /> | |
212 <conditional name="archive_type"> | |
213 <param name="archive_type_selected" value="paired" /> | |
214 <param name="R1_size" value="250" /> | |
215 <param name="R2_size" value="250" /> | |
216 <param name="expected_amplicon_size" value="420" /> | |
217 <param name="mm_rate" value="0.15" /> | |
218 </conditional> | |
219 </conditional> | |
220 <param name="min_amplicon_size" value="380" /> | |
221 <param name="max_amplicon_size" value="460" /> | |
222 <conditional name="sequencing_protocol"> | |
223 <param name="sequencing_protocol_selected" value="standard" /> | |
224 <param name="five_prim_primer" value="GGCGVACGGGTGAGTAA" /> | |
225 <param name="three_prim_primer" value="GTGCCAGCNGCNGCGG" /> | |
226 </conditional> | |
227 </conditional> | |
228 <output name="dereplicated_file" file="references/01-prepro.fasta" /> | |
229 <output name="count_file" file="references/01-prepro.tsv" /> | |
230 <output name="summary_file" file="references/01-prepro.html" compare="sim_size" delta="0" /> | |
231 </test> | |
232 </tests> | |
233 <help> | |
234 | |
235 .. image:: static/images/FROGS_logo.png | |
236 :height: 144 | |
237 :width: 110 | |
238 | |
239 | |
240 .. class:: infomark page-header h2 | |
241 | |
242 What it does | |
243 | |
244 FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis. | |
245 | |
246 | |
247 .. class:: infomark page-header h2 | |
248 | |
249 Inputs/Outputs | |
250 | |
251 .. class:: h3 | |
252 | |
253 Inputs | |
254 | |
255 Sample files are added one after another or provided in an archive file (tar.gz). | |
256 | |
257 .. container:: row | |
258 | |
259 .. container:: col-md-6 | |
260 | |
261 **Illumina inputs** | |
262 | |
263 :Usage: For samples sequenced in paired-end. The amplicon length must be shorter than the combined length of R1 and R2. R1 and R2 are merged on their common region. | |
264 :Files: One R1 and R2 by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTQ_format>`_) | |
265 :Example: splA_R1.fastq.gz, splA_R2.fastq.gz, splB_R1.fastq.gz, splB_R2.fastq.gz | |
266 | |
267 OR | |
268 | |
269 :Usage: For samples sequenced in single-end or when R1 and R2 reads are already merged. | |
270 :Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTQ_format>`_). | |
271 :Example: splA.fastq.gz, splB.fastq.gz | |
272 | |
273 .. container:: col-md-6 | |
274 | |
275 **454 inputs** | |
276 | |
277 :Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTQ_format>`_) | |
278 :Example: splA.fastq.gz, splB.fastq.gz | |
279 | |
280 Remark: In an archive, if you use R1 and R2 files, their names must end with *_R1* and *_R2*. To upload an archive, see the "Upload archive" tool or, if possible, create a symbolic link on your Galaxy account. | |
281 | |
282 .. class:: h3 | |
283 | |
284 Outputs | |
285 | |
286 **Sequence file** (dereplicated.fasta): | |
287 | |
288 Only one file with all samples sequences (format `FASTA <https://en.wikipedia.org/wiki/FASTA_format>`_). These sequences are dereplicated: strictly identical sequences are represented only once and the initial count is kept in the count file. | |
289 | |
290 **Count file** (count.tsv): | |
291 | |
292 This file contains the count of all unique sequences in each sample (format `TSV <https://en.wikipedia.org/wiki/Tab-separated_values>`_). | |
293 | |
294 **Summary file** (report.html): | |
295 | |
296 This file reports the number of remaining sequences after each filter (format `HTML <https://en.wikipedia.org/wiki/HTML>`_). | |
297 | |
298 .. image:: static/images/FROGS_preprocess_summary.png | |
299 :height: 355 | |
300 :width: 676 | |
301 | |
302 It also presents the length distribution of the remaining sequences. | |
303 | |
304 .. image:: static/images/FROGS_preprocess_lengthsSamples.png | |
305 :height: 350 | |
306 :width: 676 | |
307 | |
308 .. class:: infomark page-header h2 | |
309 | |
310 How it works | |
311 | |
312 .. csv-table:: | |
313 :header: "Steps", "Illumina", "454" | |
314 :widths: 5, 150, 150 | |
315 :class: table table-striped | |
316 | |
317 "1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlapped region (`FLASh <http://ccb.jhu.edu/software/FLASH/>`_). By default M is set to 10%", "/" | |
318 "2", "Filters merged sequences on their length, which must range between 'Minimum amplicon size' and 'Maximum amplicon size'", "/" | |
319 "3", "If sequencing protocol is the Illumina standard protocol: removes sequences where the two primers are not present and then removes primers from the remaining sequences (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primer sequences and reverse-complements the sequences on strand - (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences" | |
320 "4", "Filters sequences on their length and with ambiguous nucleotides", "Removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleotides between two poor quality positions, i.e. with a Phred quality score lower than 10" | |
321 "5", "Dereplicates sequences", "Dereplicates sequences" | |
322 | |
323 | |
324 .. class:: infomark page-header h2 | |
325 | |
326 Advice/details on parameters | |
327 | |
328 .. class:: h3 | |
329 | |
330 Primers parameters | |
331 | |
332 The (`Kozich et al. 2013 <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/>`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers. | |
333 | |
334 In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation. | |
335 | |
336 .. role:: alert-info | |
337 | |
338 Example: | |
339 | |
340 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3' | |
341 | |
342 Value for parameter 5' primer: ATGCCC | |
343 | |
344 Value for parameter 3' primer: ATTTCAG | |
345 | |
346 .. class:: h3 | |
347 | |
348 Amplicons sizes parameters | |
349 | |
350 The two following images show two examples of perfect values for the size parameters. | |
351 | |
352 .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal.png | |
353 :height: 415 | |
354 :width: 676 | |
355 | |
356 .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal.png | |
357 :height: 415 | |
358 :width: 676 | |
359 | |
360 Don't worry: the "Expected amplicon size" does not need to be very accurate. | |
361 | |
362 .. class:: h3 | |
363 | |
364 If the 'overlapped' filter drastically reduces the number of sequences: | |
365 | |
366 In un-merged Illumina data, the reduction of the dataset by the overlapped filter is typically less than 20%. A loss of more than 20% in all samples can highlight a quality problem. | |
367 | |
368 If the overlap between R1 and R2 is greater than 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC <http://www.bioinformatics.babraham.ac.uk/projects/fastqc/>`_), you can try to trim the end of your sequences and relaunch the preprocess tool. | |
369 You can also raise the mismatch percent in the overlapped region, but not too much! | |
370 | |
371 ---- | |
372 | |
373 **Contact** | |
374 | |
375 Contacts: frogs@inra.fr | |
376 | |
377 Repository: https://github.com/geraldinepascal/FROGS | |
378 | |
379 Please cite the FROGS Publication: *Frederic Escudie, Lucas Auer, Maria Bernard, Mahendra Mariadassou, Laurent Cauquil, Katia Vidal, Sarah Maman, Guillermina Hernandez-Raquet, Sylvie Combes, Geraldine Pascal; FROGS: Find, Rapidly, OTUs with Galaxy Solution, Bioinformatics, , btx791,* https://doi.org/10.1093/bioinformatics/btx791 | |
380 | |
381 Depending on the help provided you can cite us in acknowledgements, references or both. | |
382 </help> | |
383 <citations> <!-- DOIs of the works to cite when using this tool. --> | |
384 <citation type="doi">10.1093/bioinformatics/btx791</citation> <!-- FROGS publication (same DOI as given in the help text) --> | |
385 <citation type="doi">10.1128/AEM.01043-13</citation> <!-- NOTE(review): presumably Kozich et al. 2013 (custom protocol); confirm --> | |
386 <citation type="doi">10.14806/ej.17.1.200</citation> <!-- NOTE(review): presumably cutadapt; confirm --> | |
387 <citation type="doi">10.1093/bioinformatics/btr507</citation> <!-- NOTE(review): presumably FLASh; confirm --> | |
388 </citations> | |
389 </tool> |