diff amplicon_analysis_pipeline.xml @ 0:47ec9c6f44b8 draft

planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author pjbriggs
date Thu, 09 Nov 2017 10:13:29 -0500
parents
children 1c1902e12caf
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amplicon_analysis_pipeline.xml	Thu Nov 09 10:13:29 2017 -0500
@@ -0,0 +1,484 @@
+<tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.0.6">
+  <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
+  <requirements>
+    <requirement type="package" version="1.1">amplicon_analysis_pipeline</requirement>
+    <requirement type="package" version="1.11">cutadapt</requirement>
+    <requirement type="package" version="1.33">sickle</requirement>
+    <requirement type="package" version="27-08-2013">bioawk</requirement>
+    <requirement type="package" version="2.8.1">pandaseq</requirement>
+    <requirement type="package" version="3.5.0">spades</requirement>
+    <requirement type="package" version="0.11.3">fastqc</requirement>
+    <requirement type="package" version="1.8.0">qiime</requirement>
+    <requirement type="package" version="2.2.26">blast</requirement>
+    <requirement type="package" version="0.2.4">fasta-splitter</requirement>
+    <requirement type="package" version="2.2">rdp-classifier</requirement>
+    <requirement type="package" version="3.2.0">R</requirement>
+    <requirement type="package" version="1.1.3">vsearch</requirement>
+    <requirement type="package" version="2010-04-29">microbiomeutil</requirement>
+    <requirement type="package">fasta_number</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="1:" />
+  </stdio>
+  <command><![CDATA[
+  ## Set the reference database name
+  #if $reference_database == ""
+    #set reference_database_name = "gg"
+  #else
+    #set reference_database_name = "silva"
+  #end if
+
+  ## Run the amplicon analysis pipeline wrapper
+  python $__tool_directory__/amplicon_analysis_pipeline.py
+  ## Set options
+  #if str( $forward_pcr_primer ) != ""
+  -g "$forward_pcr_primer"
+  #end if
+  #if str( $reverse_pcr_primer ) != ""
+  -G "$reverse_pcr_primer"
+  #end if
+  #if str( $trimming_threshold ) != ""
+  -q $trimming_threshold
+  #end if
+  #if str( $sliding_window_length ) != ""
+  -l $sliding_window_length
+  #end if
+  #if str( $minimum_overlap ) != ""
+  -O $minimum_overlap
+  #end if
+  #if str( $minimum_length ) != ""
+  -L $minimum_length
+  #end if
+  -P $pipeline
+  -r \$AMPLICON_ANALYSIS_REF_DATA_PATH
+  #if str( $reference_database ) != ""
+    "${reference_database}"
+  #end if
+  #if str($categories_file_in) != 'None'
+    -c "${categories_file_in}"
+  #end if
+  ## Input files
+  "${metatable_file_in}"
+  ## FASTQ pairs
+  #if str($input_type.pairs_or_collection) == "collection"
+    #set fastq_pairs = $input_type.fastq_collection
+  #else
+    #set fastq_pairs = $input_type.fastq_pairs
+  #end if
+  #for $fq_pair in $fastq_pairs
+    "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
+  #end for
+  &&
+
+  ## Collect outputs
+  cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
+  cp ${pipeline}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
+  cp ${pipeline}_OTU_tables/otus.tre "${otus_tre_file}" &&
+  cp RESULTS/${pipeline}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
+  cp RESULTS/${pipeline}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
+  cp Multiplexed_files/${pipeline}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
+  cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" &&
+  cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&
+
+  ## HTML outputs
+
+  ## OTU table
+  mkdir $heatmap_otu_table_html.files_path &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/Heatmap/js $heatmap_otu_table_html.files_path &&
+  cp RESULTS/${pipeline}_${reference_database_name}/Heatmap/otu_table.html "${heatmap_otu_table_html}" &&
+
+  ## Phylum genus barcharts
+  mkdir $phylum_genus_dist_barcharts_html.files_path &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/charts $phylum_genus_dist_barcharts_html.files_path &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/raw_data $phylum_genus_dist_barcharts_html.files_path &&
+  cp RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&
+
+  ## Beta diversity weighted 2d plots
+  mkdir $beta_div_even_weighted_2d_plots.files_path &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/* $beta_div_even_weighted_2d_plots.files_path &&
+  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&
+
+  ## Beta diversity unweighted 2d plots
+  mkdir $beta_div_even_unweighted_2d_plots.files_path &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* $beta_div_even_unweighted_2d_plots.files_path &&
+  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&
+
+  ## Alpha diversity rarefaction plots
+  mkdir $alpha_div_rarefaction_plots.files_path &&
+  cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html $alpha_div_rarefaction_plots &&
+  cp -r RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots $alpha_div_rarefaction_plots.files_path &&
+
+  ## Categories data
+  #if str($categories_file_in) != 'None'
+    ## Alpha diversity boxplots
+    mkdir $alpha_div_boxplots.files_path &&
+    cp alpha_diversity_boxplots.html "$alpha_div_boxplots" &&
+    cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf $alpha_div_boxplots.files_path &&
+  #end if
+
+  ## Pipeline outputs (log files etc)
+  mkdir $log_files.files_path &&
+  cp Amplicon_analysis_pipeline.log $log_files.files_path &&
+  cp pipeline.log $log_files.files_path &&
+  cp Pipeline_outputs.txt $log_files.files_path &&
+  cp Metatable_log/Metatable.html $log_files.files_path &&
+  cp pipeline_outputs.html "$log_files"
+  ]]></command>
+  <inputs>
+    <param name="title" type="text" value="test" size="25"
+	   label="Title" help="Optional text that will be added to the output dataset names" />
+    <param type="data" name="metatable_file_in" format="tabular"
+	   label="Input Metatable.txt file" />
+    <param type="data" name="categories_file_in" format="txt"
+	   label="Input Categories.txt file" optional="true"
+	   help="(optional)" />
+    <conditional name="input_type">
+      <param name="pairs_or_collection" type="select"
+	     label="Input FASTQ type">
+	<option value="pairs_of_files">Pairs of datasets</option>
+	<option value="collection" selected="true">Dataset pairs in a collection</option>
+      </param>
+      <when value="collection">
+	<param name="fastq_collection" type="data_collection"
+	       format="fastqsanger,fastq" collection_type="list:paired"
+	       label="Collection of FASTQ forward and reverse (R1/R2) pairs"
+	       help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
+      </when>
+      <when value="pairs_of_files">
+	<repeat name="fastq_pairs" title="Input fastq pairs" min="1">
+	  <param type="text" name="name" value=""
+		 label="Final name for FASTQ pair" />
+	  <param type="data" name="fastq_r1" format="fastqsanger,fastq"
+		 label="FASTQ with forward reads (R1)" />
+	  <param type="data" name="fastq_r2" format="fastqsanger,fastq"
+		 label="FASTQ with reverse reads (R2)" />
+	</repeat>
+      </when>
+    </conditional>
+    <param type="text" name="forward_pcr_primer" value=""
+	   label="Forward PCR primer sequence"
+	   help="Optional; must not include barcode or adapter sequence (-g)" />
+    <param type="text" name="reverse_pcr_primer" value=""
+	   label="Reverse PCR primer sequence"
+	   help="Optional; must not include barcode or adapter sequence (-G)" />
+    <param type="integer" name="trimming_threshold" value="20"
+	   label="Threshold quality below which read will be trimmed"
+	   help="Phred score; default is 20 (-q)" />
+    <param type="integer" name="minimum_overlap" value="10"
+	   label="Minimum overlap in bp between forward and reverse reads"
+	   help="Default is 10 (-O)" />
+    <param type="integer" name="minimum_length" value="200"
+	   label="Minimum length in bp to keep sequence after overlapping"
+	   help="Default is 200 (-L)" />
+    <param type="integer" name="sliding_window_length" value="10"
+	   label="Minimum length in bp to retain a read after trimming"
+	   help="Supplied to Sickle; default is 10 (-l)" />
+    <param type="select" name="pipeline"
+	    label="Pipeline to use for analysis">
+      <option value="Vsearch" selected="true" >Vsearch</option>
+      <!--
+      Remove the QIIME and Uparse options for now
+      <option value="QIIME">QIIME</option>
+      <option value="Uparse">Uparse</option>
+      -->
+    </param>
+    <param type="select" name="reference_database"
+	   label="Reference database">
+      <option value="" selected="true">GreenGenes</option>
+      <option value="-S">Silva</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="metatable_mod"
+	  label="${tool.name}:${title} Metatable_mod.txt" />
+    <data format="tabular" name="read_counts_out"
+	  label="${tool.name} (${pipeline}):${title} read counts" />
+    <data format="biom" name="tax_otu_table_biom_file"
+	  label="${tool.name} (${pipeline}):${title} tax OTU table (biom format)" />
+    <data format="tabular" name="otus_tre_file"
+	  label="${tool.name} (${pipeline}):${title} otus.tre" />
+    <data format="html" name="phylum_genus_dist_barcharts_html"
+	  label="${tool.name} (${pipeline}):${title} phylum genus dist barcharts HTML" />
+    <data format="tabular" name="otus_count_file"
+	  label="${tool.name} (${pipeline}):${title} OTUs count file" />
+    <data format="tabular" name="table_summary_file"
+	  label="${tool.name} (${pipeline}):${title} table summary file" />
+    <data format="fasta" name="dereplicated_nonchimera_otus_fasta"
+	  label="${tool.name} (${pipeline}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />
+    <data format="html" name="fastqc_quality_boxplots_html"
+	  label="${tool.name} (${pipeline}):${title} FastQC per-base quality boxplots HTML" />
+    <data format="html" name="heatmap_otu_table_html"
+	  label="${tool.name} (${pipeline}):${title} heatmap OTU table HTML" />
+    <data format="html" name="beta_div_even_weighted_2d_plots"
+	  label="${tool.name} (${pipeline}):${title} beta diversity weighted 2D plots HTML" />
+    <data format="html" name="beta_div_even_unweighted_2d_plots"
+	  label="${tool.name} (${pipeline}):${title} beta diversity unweighted 2D plots HTML" />
+    <data format="html" name="alpha_div_rarefaction_plots"
+	  label="${tool.name} (${pipeline}):${title} alpha diversity rarefaction plots HTML" />
+    <data format="html" name="alpha_div_boxplots"
+	  label="${tool.name} (${pipeline}):${title} alpha diversity boxplots">
+      <filter>categories_file_in is not None</filter>
+    </data>
+    <data format="html" name="log_files"
+	  label="${tool.name} (${pipeline}):${title} log files" />
+  </outputs>
+  <tests>
+  </tests>
+  <help><![CDATA[
+
+What it does
+------------
+
+This pipeline has been designed for the analysis of 16S rRNA data from
+Illumina Miseq (Casava >= 1.8) paired-end reads.
+  
+Usage
+-----
+
+1. Preparation of the mapping file and format of unique sample id
+*****************************************************************
+
+Before using the amplicon analysis pipeline it would be necessary to
+follow the steps as below to avoid analysis failures and ensure samples
+are labelled appropriately. Sample names for the labelling are derived
+from the fastq files names that are generated from the sequencing. The
+labels will include everything between the beginning of the name and
+the sample number (from C11 to S19 in Fig. 1)
+
+.. image:: Pipeline_description_Fig1.png
+   :height: 46
+   :width: 382
+
+**Figure 1**
+
+If analysing 16S data from multiple runs: 
+
+The samples from different runs may have identical IDs. For example,
+when sequencing the same samples twice, by chance, these could be at
+the same position in both the runs. This would cause the fastq files
+to have exactly the same IDs (Fig. 2).
+
+.. image:: Pipeline_description_Fig2.png
+   :height: 100
+   :width: 463
+
+**Figure 2**
+
+In case of identical sample IDs the pipeline will fail to run and
+generate an error at the beginning of the analysis.
+
+To avoid having to change the file names, before uploading the files,
+ensure that the samples IDs are not repeated. 
+
+2. To upload the file
+*********************
+
+Click on **Get Data/Upload File** from the Galaxy tool panel on the
+left hand side.
+
+From the pop-up window, choose how to upload the file. The
+**Choose local file** option can be used for files up to 4Gb. Fastq files
+from Illumina MiSeq will rarely be bigger than 4Gb and this option is
+recommended.
+
+After choosing the files click **Start** to begin the upload. The window can
+now be closed and the files will be uploaded onto the Galaxy server. You
+will see the progress on the ``HISTORY`` panel on the right
+side of the screen. The colour will change from grey (queuing), to yellow
+(uploading) and finally green (uploaded).
+
+Once all the files are uploaded, click on the operations on multiple
+datasets icon and select the fastq files that need to be analysed.
+Click on the tab **For all selected...** and on the option
+**Build List of Dataset pairs** (Fig. 3).
+
+.. image:: Pipeline_description_Fig3.png
+   :height: 247
+   :width: 586
+
+**Figure 3**
+
+Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``.
+The fastq files forward R1 and reverse R2 should now appear in the
+corresponding columns.
+
+Select **Autopair**. This creates a collection of paired fastq files for
+the forward and reverse reads for each sample. The name of the pairs will
+be the ones used by the pipeline. You are free to change the names at this
+point as long as they are the same used in the Metatable file
+(see section 3).
+
+Name the collection and click on **create list**. This reduces the time
+required to input the forward and reverse reads for each individual sample.
+
+3. Create the Metatable files
+*****************************
+
+Metatable.txt
+~~~~~~~~~~~~~
+
+Click on the list of pairs you just created to see the name of the single
+pairs. The name of the pairs will be the ones used by the pipeline,
+therefore, these are the names that need to be used in the Metatable file.
+
+The Metatable file has to be in QIIME format. You can find a description
+of it on QIIME website http://qiime.org/documentation/file_formats.html
+
+EXAMPLE::
+
+    #SampleID    BarcodeSequence    LinkerPrimerSequence    Disease    Gender    Description
+    Mock-RUN1    TAAGGCGAGCGTAAGA                           PsA        Male      Control
+    Mock-RUN2    CGTACTAGGCGTAAGA                           PsA        Male      Control
+    Mock-RUN3    AGGCAGAAGCGTAAGA                           PsC        Female    Control
+
+Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
+deleted. The header is very important. ``#SampleID``, ``Barcode``,
+``LinkerPrimerSequence`` and ``Description`` are mandatory. Between
+``LinkerPrimerSequence`` and ``Description`` you can add as many columns
+as you want. For every column a PCoA plot will be created (see
+**Results** section). You can create this file in Excel and it will have
+to be saved as ``Text(Tab delimited)``.
+
+During the analysis the Metatable.txt will be checked to ensure that the
+file has the correct format. If necessary, this will be modified and will
+be available as Metatable_corrected.txt in the history panel. If you are
+going to use the metatable file for any other statistical analyses,
+remember to use the ``Metatable_mod.txt`` one, otherwise the sample
+names might not match!
+
+Categories.txt (optional) 
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This file is required if you want to get box plots for comparison of
+alpha diversity indices (see **Results** section). The file is a list
+(without header and IN ONE COLUMN) of categories present in the
+Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
+ONES USED IN THE METATABLE.TXT. You can create this file in Excel and
+will have to be saved as ``Text(Tab delimited)``.
+
+EXAMPLE::
+
+    Disease
+    Gender
+
+Metatable and categories files can be uploaded using Get Data as done
+with the fatsq files.
+
+4. Analysis
+***********
+
+Under **Amplicon_Analysis_Pipeline**
+
+ * **Title** Name to distinguish between the runs. It will be shown at
+   the beginning of each output file name.
+
+ * **Input Metatable.txt file** Select the Metatable.txt file related to
+   this analysis
+
+ * **Input Categories.txt file (Optional)** Select the Categories.txt file
+   related to this analysis
+
+ * **Input FASTQ type** select *Dataset pairs in a collection* and, then,
+   the collection of pairs you created earlier.
+
+ * **Forward/Reverse PCR primer sequence** if the PCR primer sequences
+   have not been removed from the MiSeq during the fastq creation, they
+   have to be removed before the analysis. Insert the PCR primer sequence
+   in the corresponding field. DO NOT include any barcode or adapter
+   sequence. If the PCR primers have been already trimmed by the MiSeq,
+   and you include the sequence in this field, this would lead to an error.
+   Only include the sequences if still present in the fastq files.
+
+ * **Threshold quality below which reads will be trimmed** Choose the
+   Phred score used by Sickle to trim the reads at the 3’ end.
+
+ * **Minimum length to retain a read after trimming** If the read length
+   after trimming is shorter than a user defined length, the read, along
+   with the corresponding read pair, will be discarded.
+
+ * **Minimum overlap in bp between forward and reverse reads** Choose the
+   minimum basepair overlap used by Pandaseq to assemble the reads.
+   Default is 10.
+
+ * **Minimum length in bp to keep a sequence after overlapping** Choose the
+   minimum sequence length used by Pandaseq to keep a sequence after the
+   overlapping. This depends on the expected amplicon length. Default is
+   380 (used for V3-V4 16S sequencing; expected length ~440bp)
+
+ * **Pipeline to use for analysis** Choose the pipeline to use for OTU
+   clustering and chimera removal. The Galaxy tool currently supports
+   ``Vsearch`` only. ``Uparse`` and ``QIIME`` are planned to be added
+   shortly (the tools are already available for the stand-alone pipeline).
+
+ * **Reference database** Choose between ``GreenGenes`` and ``Silva``
+   databases for taxa assignment.
+
+Click on **Execute** to start the analysis.
+
+5. Results
+**********
+
+Results are entirely generated using QIIME scripts. The results will 
+appear in the History panel when the analysis is completed
+
+ * **Vsearch_tax_OTU_table (biom format)** The OTU table in BIOM format
+   (http://biom-format.org/)
+
+ * **Vsearch_OTUs.tree** Phylogenetic tree constructed using
+   ``make_phylogeny.py`` (fasttree) QIIME script
+   (http://qiime.org/scripts/make_phylogeny.html)
+
+ * **Vsearch_phylum_genus_dist_barcharts_HTML** HTML file with bar
+   charts at Phylum, Genus and Species level
+   (http://qiime.org/scripts/summarize_taxa.html and
+   http://qiime.org/scripts/plot_taxa_summary.html)
+
+ * **Vsearch_OTUs_count_file** Summary of OTU counts per sample
+   (http://biom-format.org/documentation/summarizing_biom_tables.html)
+
+ * **Vsearch_table_summary_file** Summary of sequences counts per sample
+   (http://biom-format.org/documentation/summarizing_biom_tables.html)
+
+ * **Vsearch_multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta**
+   Fasta file with OTU sequences
+
+ * **Vsearch_heatmap_OTU_table_HTML** Interactive OTU heatmap
+   (http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )
+
+ * **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
+   format using weighted Unifrac distance measure. Samples are grouped
+   by the column names present in the Metatable file. The samples are
+   firstly rarefied to the minimum sequencing depth
+   (http://qiime.org/scripts/beta_diversity_through_plots.html )
+
+ * **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML
+   format using Unweighted Unifrac distance measure. Samples are grouped
+   by the column names present in the Metatable file. The samples are
+   firstly rarefied to the minimum sequencing depth
+   (http://qiime.org/scripts/beta_diversity_through_plots.html )
+
+Code availability
+-----------------
+
+**Code is available at** https://github.com/MTutino/Amplicon_analysis
+
+Credits
+-------
+
+Pipeline author: Mauro Tutino
+
+Galaxy tool: Peter Briggs
+
+	]]></help>
+  <citations>
+    <citation type="bibtex">
+      @misc{githubAmplicon_analysis,
+      author = {Tutino, Mauro},
+      year = {2017},
+      title = {Amplicon Analysis Pipeline},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      url = {https://github.com/MTutino/Amplicon_analysis},
+}</citation>
+  </citations>
+</tool>