comparison amplicon_analysis_pipeline.xml @ 0:47ec9c6f44b8 draft

planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author pjbriggs
date Thu, 09 Nov 2017 10:13:29 -0500
parents
children 1c1902e12caf
comparison
equal deleted inserted replaced
-1:000000000000 0:47ec9c6f44b8
1 <tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.0.6">
2 <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
3 <requirements> <!-- Packages declared as dependencies of this tool -->
4 <requirement type="package" version="1.1">amplicon_analysis_pipeline</requirement>
5 <requirement type="package" version="1.11">cutadapt</requirement>
6 <requirement type="package" version="1.33">sickle</requirement>
7 <requirement type="package" version="27-08-2013">bioawk</requirement>
8 <requirement type="package" version="2.8.1">pandaseq</requirement>
9 <requirement type="package" version="3.5.0">spades</requirement>
10 <requirement type="package" version="0.11.3">fastqc</requirement>
11 <requirement type="package" version="1.8.0">qiime</requirement>
12 <requirement type="package" version="2.2.26">blast</requirement>
13 <requirement type="package" version="0.2.4">fasta-splitter</requirement>
14 <requirement type="package" version="2.2">rdp-classifier</requirement>
15 <requirement type="package" version="3.2.0">R</requirement>
16 <requirement type="package" version="1.1.3">vsearch</requirement>
17 <requirement type="package" version="2010-04-29">microbiomeutil</requirement>
18 <requirement type="package">fasta_number</requirement> <!-- NOTE(review): no version attribute, unlike every other requirement; confirm this is intentional -->
19 </requirements>
20 <stdio> <!-- Any exit code of 1 or greater marks the job as failed -->
21 <exit_code range="1:" level="fatal" />
22 </stdio>
23 <command><![CDATA[
24 ## Set the reference database name (used below to build the RESULTS/<pipeline>_<db> paths)
25 #if str( $reference_database ) == ""
26 #set reference_database_name = "gg"
27 #else
28 #set reference_database_name = "silva"
29 #end if
30
31 ## Run the amplicon analysis pipeline wrapper
32 python $__tool_directory__/amplicon_analysis_pipeline.py
33 ## Set options (each one is only passed when its parameter is non-empty)
34 #if str( $forward_pcr_primer ) != ""
35 -g "$forward_pcr_primer"
36 #end if
37 #if str( $reverse_pcr_primer ) != ""
38 -G "$reverse_pcr_primer"
39 #end if
40 #if str( $trimming_threshold ) != ""
41 -q $trimming_threshold
42 #end if
43 #if str( $sliding_window_length ) != ""
44 -l $sliding_window_length
45 #end if
46 #if str( $minimum_overlap ) != ""
47 -O $minimum_overlap
48 #end if
49 #if str( $minimum_length ) != ""
50 -L $minimum_length
51 #end if
52 -P $pipeline
53 -r \$AMPLICON_ANALYSIS_REF_DATA_PATH
54 #if str( $reference_database ) != ""
55 "${reference_database}"
56 #end if
57 #if str($categories_file_in) != 'None'
58 -c "${categories_file_in}"
59 #end if
60 ## Input files
61 "${metatable_file_in}"
62 ## FASTQ pairs: collection elements expose forward/reverse, repeat entries expose fastq_r1/fastq_r2
63 #if str($input_type.pairs_or_collection) == "collection"
64 #for $fq_pair in $input_type.fastq_collection
65 "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
66 #end for
67 #else
68 #for $fq_pair in $input_type.fastq_pairs
69 "${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}"
70 #end for
71 #end if
72 &&
73 ## Collect outputs
74 cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
75 cp ${pipeline}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
76 cp ${pipeline}_OTU_tables/otus.tre "${otus_tre_file}" &&
77 cp RESULTS/${pipeline}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
78 cp RESULTS/${pipeline}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
79 cp Multiplexed_files/${pipeline}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
80 cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" &&
81 cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&
82
83 ## HTML outputs (each page is copied to the dataset and its supporting files to the dataset's files_path)
84
85 ## OTU table
86 mkdir $heatmap_otu_table_html.files_path &&
87 cp -r RESULTS/${pipeline}_${reference_database_name}/Heatmap/js $heatmap_otu_table_html.files_path &&
88 cp RESULTS/${pipeline}_${reference_database_name}/Heatmap/otu_table.html "${heatmap_otu_table_html}" &&
89
90 ## Phylum genus barcharts
91 mkdir $phylum_genus_dist_barcharts_html.files_path &&
92 cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/charts $phylum_genus_dist_barcharts_html.files_path &&
93 cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/raw_data $phylum_genus_dist_barcharts_html.files_path &&
94 cp RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&
95
96 ## Beta diversity weighted 2d plots
97 mkdir $beta_div_even_weighted_2d_plots.files_path &&
98 cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/* $beta_div_even_weighted_2d_plots.files_path &&
99 cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&
100
101 ## Beta diversity unweighted 2d plots
102 mkdir $beta_div_even_unweighted_2d_plots.files_path &&
103 cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* $beta_div_even_unweighted_2d_plots.files_path &&
104 cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&
105
106 ## Alpha diversity rarefaction plots
107 mkdir $alpha_div_rarefaction_plots.files_path &&
108 cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html "$alpha_div_rarefaction_plots" &&
109 cp -r RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots $alpha_div_rarefaction_plots.files_path &&
110
111 ## Categories data (only collected when a Categories.txt file was supplied)
112 #if str($categories_file_in) != 'None'
113 ## Alpha diversity boxplots
114 mkdir $alpha_div_boxplots.files_path &&
115 cp alpha_diversity_boxplots.html "$alpha_div_boxplots" &&
116 cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf $alpha_div_boxplots.files_path &&
117 #end if
118
119 ## Pipeline outputs (log files etc)
120 mkdir $log_files.files_path &&
121 cp Amplicon_analysis_pipeline.log $log_files.files_path &&
122 cp pipeline.log $log_files.files_path &&
123 cp Pipeline_outputs.txt $log_files.files_path &&
124 cp Metatable_log/Metatable.html $log_files.files_path &&
125 cp pipeline_outputs.html "$log_files"
126 ]]></command>
127 <inputs> <!-- User-supplied parameters; attributes are listed name first, then type -->
128 <param name="title" type="text" value="test" size="25"
129 label="Title" help="Optional text that will be added to the output dataset names" />
130 <param name="metatable_file_in" type="data" format="tabular"
131 label="Input Metatable.txt file" />
132 <param name="categories_file_in" type="data" format="txt"
133 optional="true" label="Input Categories.txt file"
134 help="(optional)" />
135 <conditional name="input_type"> <!-- Selects how the FASTQ pairs are supplied -->
136 <param name="pairs_or_collection" type="select"
137 label="Input FASTQ type">
138 <option value="pairs_of_files">Pairs of datasets</option>
139 <option value="collection" selected="true">Dataset pairs in a collection</option>
140 </param>
141 <when value="collection">
142 <param name="fastq_collection" type="data_collection"
143 collection_type="list:paired" format="fastqsanger,fastq"
144 label="Collection of FASTQ forward and reverse (R1/R2) pairs"
145 help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
146 </when>
147 <when value="pairs_of_files">
148 <repeat name="fastq_pairs" title="Input fastq pairs" min="1">
149 <param name="name" type="text" value=""
150 label="Final name for FASTQ pair" />
151 <param name="fastq_r1" type="data" format="fastqsanger,fastq"
152 label="FASTQ with forward reads (R1)" />
153 <param name="fastq_r2" type="data" format="fastqsanger,fastq"
154 label="FASTQ with reverse reads (R2)" />
155 </repeat>
156 </when>
157 </conditional>
158 <param name="forward_pcr_primer" type="text" value=""
159 label="Forward PCR primer sequence"
160 help="Optional; must not include barcode or adapter sequence (-g)" />
161 <param name="reverse_pcr_primer" type="text" value=""
162 label="Reverse PCR primer sequence"
163 help="Optional; must not include barcode or adapter sequence (-G)" />
164 <param name="trimming_threshold" type="integer" value="20"
165 label="Threshold quality below which read will be trimmed"
166 help="Phred score; default is 20 (-q)" />
167 <param name="minimum_overlap" type="integer" value="10"
168 label="Minimum overlap in bp between forward and reverse reads"
169 help="Default is 10 (-O)" />
170 <param name="minimum_length" type="integer" value="200"
171 label="Minimum length in bp to keep sequence after overlapping"
172 help="Default is 200 (-L)" />
173 <param name="sliding_window_length" type="integer" value="10"
174 label="Minimum length in bp to retain a read after trimming"
175 help="Supplied to Sickle; default is 10 (-l)" />
176 <param name="pipeline" type="select"
177 label="Pipeline to use for analysis">
178 <option value="Vsearch" selected="true" >Vsearch</option>
179 <!--
180 Remove the QIIME and Uparse options for now
181 <option value="QIIME">QIIME</option>
182 <option value="Uparse">Uparse</option>
183 -->
184 </param>
185 <param name="reference_database" type="select"
186 label="Reference database">
187 <option value="" selected="true">GreenGenes</option>
188 <option value="-S">Silva</option>
189 </param>
190 </inputs>
191 <outputs> <!-- Datasets filled by the cp commands at the end of the command template -->
192 <data name="metatable_mod" format="tabular"
193 label="${tool.name}:${title} Metatable_mod.txt" />
194 <data name="read_counts_out" format="tabular"
195 label="${tool.name} (${pipeline}):${title} read counts" />
196 <data name="tax_otu_table_biom_file" format="biom"
197 label="${tool.name} (${pipeline}):${title} tax OTU table (biom format)" />
198 <data name="otus_tre_file" format="tabular"
199 label="${tool.name} (${pipeline}):${title} otus.tre" />
200 <data name="phylum_genus_dist_barcharts_html" format="html"
201 label="${tool.name} (${pipeline}):${title} phylum genus dist barcharts HTML" />
202 <data name="otus_count_file" format="tabular"
203 label="${tool.name} (${pipeline}):${title} OTUs count file" />
204 <data name="table_summary_file" format="tabular"
205 label="${tool.name} (${pipeline}):${title} table summary file" />
206 <data name="dereplicated_nonchimera_otus_fasta" format="fasta"
207 label="${tool.name} (${pipeline}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />
208 <data name="fastqc_quality_boxplots_html" format="html"
209 label="${tool.name} (${pipeline}):${title} FastQC per-base quality boxplots HTML" />
210 <data name="heatmap_otu_table_html" format="html"
211 label="${tool.name} (${pipeline}):${title} heatmap OTU table HTML" />
212 <data name="beta_div_even_weighted_2d_plots" format="html"
213 label="${tool.name} (${pipeline}):${title} beta diversity weighted 2D plots HTML" />
214 <data name="beta_div_even_unweighted_2d_plots" format="html"
215 label="${tool.name} (${pipeline}):${title} beta diversity unweighted 2D plots HTML" />
216 <data name="alpha_div_rarefaction_plots" format="html"
217 label="${tool.name} (${pipeline}):${title} alpha diversity rarefaction plots HTML" />
218 <data name="alpha_div_boxplots" format="html"
219 label="${tool.name} (${pipeline}):${title} alpha diversity boxplots">
220 <filter>categories_file_in is not None</filter>
221 </data>
222 <data name="log_files" format="html"
223 label="${tool.name} (${pipeline}):${title} log files" />
224 </outputs>
225 <tests> <!-- No functional tests are defined for this tool yet -->
226 </tests>
227 <help><![CDATA[
228
229 What it does
230 ------------
231
232 This pipeline has been designed for the analysis of 16S rRNA data from
233 Illumina Miseq (Casava >= 1.8) paired-end reads.
234
235 Usage
236 -----
237
238 1. Preparation of the mapping file and format of unique sample id
239 *****************************************************************
240
241 Before using the amplicon analysis pipeline it is necessary to
242 follow the steps below to avoid analysis failures and ensure samples
243 are labelled appropriately. Sample names for the labelling are derived
244 from the fastq files names that are generated from the sequencing. The
245 labels will include everything between the beginning of the name and
246 the sample number (from C11 to S19 in Fig. 1)
247
248 .. image:: Pipeline_description_Fig1.png
249 :height: 46
250 :width: 382
251
252 **Figure 1**
253
254 If analysing 16S data from multiple runs:
255
256 The samples from different runs may have identical IDs. For example,
257 when sequencing the same samples twice, by chance, these could be at
258 the same position in both the runs. This would cause the fastq files
259 to have exactly the same IDs (Fig. 2).
260
261 .. image:: Pipeline_description_Fig2.png
262 :height: 100
263 :width: 463
264
265 **Figure 2**
266
267 In case of identical sample IDs the pipeline will fail to run and
268 generate an error at the beginning of the analysis.
269
270 To avoid having to change the file names, before uploading the files,
271 ensure that the sample IDs are not repeated.
272
273 2. To upload the file
274 *********************
275
276 Click on **Get Data/Upload File** from the Galaxy tool panel on the
277 left hand side.
278
279 From the pop-up window, choose how to upload the file. The
280 **Choose local file** option can be used for files up to 4Gb. Fastq files
281 from Illumina MiSeq will rarely be bigger than 4Gb and this option is
282 recommended.
283
284 After choosing the files click **Start** to begin the upload. The window can
285 now be closed and the files will be uploaded onto the Galaxy server. You
286 will see the progress on the ``HISTORY`` panel on the right
287 side of the screen. The colour will change from grey (queuing), to yellow
288 (uploading) and finally green (uploaded).
289
290 Once all the files are uploaded, click on the operations on multiple
291 datasets icon and select the fastq files that need to be analysed.
292 Click on the tab **For all selected...** and on the option
293 **Build List of Dataset pairs** (Fig. 3).
294
295 .. image:: Pipeline_description_Fig3.png
296 :height: 247
297 :width: 586
298
299 **Figure 3**
300
301 Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``.
302 The fastq files forward R1 and reverse R2 should now appear in the
303 corresponding columns.
304
305 Select **Autopair**. This creates a collection of paired fastq files for
306 the forward and reverse reads for each sample. The name of the pairs will
307 be the ones used by the pipeline. You are free to change the names at this
308 point as long as they are the same used in the Metatable file
309 (see section 3).
310
311 Name the collection and click on **create list**. This reduces the time
312 required to input the forward and reverse reads for each individual sample.
313
314 3. Create the Metatable files
315 *****************************
316
317 Metatable.txt
318 ~~~~~~~~~~~~~
319
320 Click on the list of pairs you just created to see the name of the single
321 pairs. The name of the pairs will be the ones used by the pipeline,
322 therefore, these are the names that need to be used in the Metatable file.
323
324 The Metatable file has to be in QIIME format. You can find a description
325 of it on the QIIME website: http://qiime.org/documentation/file_formats.html
326
327 EXAMPLE::
328
329 #SampleID BarcodeSequence LinkerPrimerSequence Disease Gender Description
330 Mock-RUN1 TAAGGCGAGCGTAAGA PsA Male Control
331 Mock-RUN2 CGTACTAGGCGTAAGA PsA Male Control
332 Mock-RUN3 AGGCAGAAGCGTAAGA PsC Female Control
333
334 Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
335 deleted. The header is very important. ``#SampleID``, ``BarcodeSequence``,
336 ``LinkerPrimerSequence`` and ``Description`` are mandatory. Between
337 ``LinkerPrimerSequence`` and ``Description`` you can add as many columns
338 as you want. For every column a PCoA plot will be created (see
339 **Results** section). You can create this file in Excel and it will have
340 to be saved as ``Text(Tab delimited)``.
341
342 During the analysis the Metatable.txt will be checked to ensure that the
343 file has the correct format. If necessary, this will be modified and will
344 be available as ``Metatable_mod.txt`` in the history panel. If you are
345 going to use the metatable file for any other statistical analyses,
346 remember to use the ``Metatable_mod.txt`` one, otherwise the sample
347 names might not match!
348
349 Categories.txt (optional)
350 ~~~~~~~~~~~~~~~~~~~~~~~~~
351
352 This file is required if you want to get box plots for comparison of
353 alpha diversity indices (see **Results** section). The file is a list
354 (without header and IN ONE COLUMN) of categories present in the
355 Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
356 ONES USED IN THE METATABLE.TXT. You can create this file in Excel and
357 it will have to be saved as ``Text(Tab delimited)``.
358
359 EXAMPLE::
360
361 Disease
362 Gender
363
364 Metatable and categories files can be uploaded using Get Data as done
365 with the fastq files.
366
367 4. Analysis
368 ***********
369
370 Under **Amplicon_Analysis_Pipeline**
371
372 * **Title** Name to distinguish between the runs. It will be shown at
373 the beginning of each output file name.
374
375 * **Input Metatable.txt file** Select the Metatable.txt file related to
376 this analysis
377
378 * **Input Categories.txt file (Optional)** Select the Categories.txt file
379 related to this analysis
380
381 * **Input FASTQ type** select *Dataset pairs in a collection* and, then,
382 the collection of pairs you created earlier.
383
384 * **Forward/Reverse PCR primer sequence** if the PCR primer sequences
385 have not been removed from the MiSeq during the fastq creation, they
386 have to be removed before the analysis. Insert the PCR primer sequence
387 in the corresponding field. DO NOT include any barcode or adapter
388 sequence. If the PCR primers have been already trimmed by the MiSeq,
389 and you include the sequence in this field, this would lead to an error.
390 Only include the sequences if still present in the fastq files.
391
392 * **Threshold quality below which reads will be trimmed** Choose the
393 Phred score used by Sickle to trim the reads at the 3’ end.
394
395 * **Minimum length to retain a read after trimming** If the read length
396 after trimming is shorter than a user defined length, the read, along
397 with the corresponding read pair, will be discarded.
398
399 * **Minimum overlap in bp between forward and reverse reads** Choose the
400 minimum basepair overlap used by Pandaseq to assemble the reads.
401 Default is 10.
402
403 * **Minimum length in bp to keep a sequence after overlapping** Choose the
404 minimum sequence length used by Pandaseq to keep a sequence after the
405 overlapping. This depends on the expected amplicon length. Default is
406 200 in this Galaxy tool (380 is used for V3-V4 16S sequencing; expected length ~440bp).
407
408 * **Pipeline to use for analysis** Choose the pipeline to use for OTU
409 clustering and chimera removal. The Galaxy tool currently supports
410 ``Vsearch`` only. ``Uparse`` and ``QIIME`` are planned to be added
411 shortly (the tools are already available for the stand-alone pipeline).
412
413 * **Reference database** Choose between ``GreenGenes`` and ``Silva``
414 databases for taxa assignment.
415
416 Click on **Execute** to start the analysis.
417
418 5. Results
419 **********
420
421 Results are entirely generated using QIIME scripts. The results will
422 appear in the History panel when the analysis is completed.
423
424 * **Vsearch_tax_OTU_table (biom format)** The OTU table in BIOM format
425 (http://biom-format.org/)
426
427 * **Vsearch_OTUs.tree** Phylogenetic tree constructed using
428 ``make_phylogeny.py`` (fasttree) QIIME script
429 (http://qiime.org/scripts/make_phylogeny.html)
430
431 * **Vsearch_phylum_genus_dist_barcharts_HTML** HTML file with bar
432 charts at Phylum, Genus and Species level
433 (http://qiime.org/scripts/summarize_taxa.html and
434 http://qiime.org/scripts/plot_taxa_summary.html)
435
436 * **Vsearch_OTUs_count_file** Summary of OTU counts per sample
437 (http://biom-format.org/documentation/summarizing_biom_tables.html)
438
439 * **Vsearch_table_summary_file** Summary of sequences counts per sample
440 (http://biom-format.org/documentation/summarizing_biom_tables.html)
441
442 * **Vsearch_multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta**
443 Fasta file with OTU sequences
444
445 * **Vsearch_heatmap_OTU_table_HTML** Interactive OTU heatmap
446 (http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )
447
448 * **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
449 format using weighted Unifrac distance measure. Samples are grouped
450 by the column names present in the Metatable file. The samples are
451 firstly rarefied to the minimum sequencing depth
452 (http://qiime.org/scripts/beta_diversity_through_plots.html )
453
454 * **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML
455 format using Unweighted Unifrac distance measure. Samples are grouped
456 by the column names present in the Metatable file. The samples are
457 firstly rarefied to the minimum sequencing depth
458 (http://qiime.org/scripts/beta_diversity_through_plots.html )
459
460 Code availability
461 -----------------
462
463 **Code is available at** https://github.com/MTutino/Amplicon_analysis
464
465 Credits
466 -------
467
468 Pipeline author: Mauro Tutino
469
470 Galaxy tool: Peter Briggs
471
472 ]]></help>
473 <citations> <!-- BibTeX entry pointing at the pipeline's GitHub repository -->
474 <citation type="bibtex">
475 @misc{githubAmplicon_analysis,
476 author = {Tutino, Mauro},
477 year = {2017},
478 title = {Amplicon Analysis Pipeline},
479 publisher = {GitHub},
480 journal = {GitHub repository},
481 url = {https://github.com/MTutino/Amplicon_analysis},
482 }</citation>
483 </citations>
484 </tool>