view amplicon_analysis_pipeline.xml @ 1:1c1902e12caf draft

Updated to version 1.2.1.0
author pjbriggs
date Wed, 25 Apr 2018 03:54:00 -0400
parents 47ec9c6f44b8
children 43d6f81bc667
line wrap: on
line source

<tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.2.1.0">
  <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
  <requirements>
    <!-- Dependencies declared for this tool: the pipeline package itself
         followed by the external programs listed alongside it. Each entry
         pins an explicit version except the last one. -->
    <requirement type="package" version="1.2.1">amplicon_analysis_pipeline</requirement>
    <requirement type="package" version="1.11">cutadapt</requirement>
    <requirement type="package" version="1.33">sickle</requirement>
    <requirement type="package" version="27-08-2013">bioawk</requirement>
    <requirement type="package" version="2.8.1">pandaseq</requirement>
    <requirement type="package" version="3.5.0">spades</requirement>
    <requirement type="package" version="0.11.3">fastqc</requirement>
    <requirement type="package" version="1.8.0">qiime</requirement>
    <requirement type="package" version="2.2.26">blast</requirement>
    <requirement type="package" version="0.2.4">fasta-splitter</requirement>
    <requirement type="package" version="2.2">rdp-classifier</requirement>
    <requirement type="package" version="3.2.0">R</requirement>
    <requirement type="package" version="1.1.3">vsearch</requirement>
    <requirement type="package" version="2010-04-29">microbiomeutil</requirement>
    <!-- NOTE(review): fasta_number is the only requirement without a
         version attribute; confirm whether a version should be pinned. -->
    <requirement type="package">fasta_number</requirement>
  </requirements>
  <stdio>
    <!-- Treat any exit status of 1 or above as a job failure -->
    <exit_code range="1:" level="fatal" />
  </stdio>
  <command><![CDATA[
  ## Map the reference database flag onto the name used in the
  ## RESULTS/<pipeline>_<database> output directory (an empty flag
  ## selects GreenGenes, i.e. "gg")
  #if str( $reference_database ) == "-S"
    #set reference_database_name = "silva"
  #else if str( $reference_database ) == "-H"
    #set reference_database_name = "homd"
  #else
    #set reference_database_name = "gg"
  #end if

  ## Run the amplicon analysis pipeline wrapper
  python "$__tool_directory__/amplicon_analysis_pipeline.py"
  ## Set options (each one is only passed when a value was supplied)
  #if str( $forward_pcr_primer ) != ""
  -g "$forward_pcr_primer"
  #end if
  #if str( $reverse_pcr_primer ) != ""
  -G "$reverse_pcr_primer"
  #end if
  #if str( $trimming_threshold ) != ""
  -q $trimming_threshold
  #end if
  #if str( $sliding_window_length ) != ""
  -l $sliding_window_length
  #end if
  #if str( $minimum_overlap ) != ""
  -O $minimum_overlap
  #end if
  #if str( $minimum_length ) != ""
  -L $minimum_length
  #end if
  -P $pipeline
  ## The reference data location is read from the job environment at
  ## run time, hence the escaped dollar sign
  -r \$AMPLICON_ANALYSIS_REF_DATA_PATH
  #if str( $reference_database ) != ""
    ${reference_database}
  #end if
  #if str( $categories_file_in ) != 'None'
    -c "${categories_file_in}"
  #end if
  ## Input files
  "${metatable_file_in}"
  ## FASTQ pairs: each sample is passed as "name forward reverse".
  ## Elements of the paired collection expose the reads as
  ## .forward/.reverse, whereas the parameters of the "fastq_pairs"
  ## repeat are named fastq_r1/fastq_r2, so the two input modes need
  ## their own loops.
  #if str( $input_type.pairs_or_collection ) == "collection"
    #for $fq_pair in $input_type.fastq_collection
      "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
    #end for
  #else
    #for $fq_pair in $input_type.fastq_pairs
      "${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}"
    #end for
  #end if
  &&

  ## Collect outputs
  cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
  cp ${pipeline}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
  cp ${pipeline}_OTU_tables/otus.tre "${otus_tre_file}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
  cp Multiplexed_files/${pipeline}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
  cp QUALITY_CONTROL/Reads_count.txt "${read_counts_out}" &&
  cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&

  ## HTML outputs
  ## For each HTML dataset the supporting files go into the dataset's
  ## files_path directory; "mkdir -p" is used so that a directory which
  ## already exists does not abort the "&&" chain.

  ## OTU table
  mkdir -p "${heatmap_otu_table_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/Heatmap/js "${heatmap_otu_table_html.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/Heatmap/otu_table.html "${heatmap_otu_table_html}" &&

  ## Phylum genus barcharts
  mkdir -p "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/charts "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/raw_data "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&

  ## Beta diversity weighted 2d plots
  mkdir -p "${beta_div_even_weighted_2d_plots.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/* "${beta_div_even_weighted_2d_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&

  ## Beta diversity unweighted 2d plots
  mkdir -p "${beta_div_even_unweighted_2d_plots.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* "${beta_div_even_unweighted_2d_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&

  ## Alpha diversity rarefaction plots
  mkdir -p "${alpha_div_rarefaction_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html "${alpha_div_rarefaction_plots}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots "${alpha_div_rarefaction_plots.files_path}" &&

  ## Categories data (only collected when a Categories file was supplied)
  #if str( $categories_file_in ) != 'None'
    ## Alpha diversity boxplots
    mkdir -p "${alpha_div_boxplots.files_path}" &&
    cp alpha_diversity_boxplots.html "${alpha_div_boxplots}" &&
    cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf "${alpha_div_boxplots.files_path}" &&
  #end if

  ## Pipeline outputs (log files etc)
  mkdir -p "${log_files.files_path}" &&
  cp Amplicon_analysis_pipeline.log "${log_files.files_path}" &&
  cp pipeline.log "${log_files.files_path}" &&
  cp Pipeline_outputs.txt "${log_files.files_path}" &&
  cp Metatable_log/Metatable.html "${log_files.files_path}" &&
  cp pipeline_outputs.html "${log_files}"
  ]]></command>
  <inputs>
    <!-- Free-text tag that is inserted into the output dataset labels -->
    <param name="title"
           type="text"
           value="test"
           size="25"
           label="Title"
           help="Optional text that will be added to the output dataset names" />

    <!-- Sample metadata, plus the optional list of categories -->
    <param name="metatable_file_in"
           type="data"
           format="tabular"
           label="Input Metatable.txt file" />
    <param name="categories_file_in"
           type="data"
           format="txt"
           optional="true"
           label="Input Categories.txt file"
           help="(optional)" />

    <!-- FASTQ input: either a list:paired collection or explicit R1/R2 pairs -->
    <conditional name="input_type">
      <param name="pairs_or_collection"
             type="select"
             label="Input FASTQ type">
        <option value="pairs_of_files">Pairs of datasets</option>
        <option value="collection" selected="true">Dataset pairs in a collection</option>
      </param>
      <when value="collection">
        <param name="fastq_collection"
               type="data_collection"
               collection_type="list:paired"
               format="fastqsanger,fastq"
               label="Collection of FASTQ forward and reverse (R1/R2) pairs"
               help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
      </when>
      <when value="pairs_of_files">
        <repeat name="fastq_pairs" title="Input fastq pairs" min="1">
          <param name="name"
                 type="text"
                 value=""
                 label="Final name for FASTQ pair" />
          <param name="fastq_r1"
                 type="data"
                 format="fastqsanger,fastq"
                 label="FASTQ with forward reads (R1)" />
          <param name="fastq_r2"
                 type="data"
                 format="fastqsanger,fastq"
                 label="FASTQ with reverse reads (R2)" />
        </repeat>
      </when>
    </conditional>

    <!-- Optional primer sequences (passed as -g / -G when non-empty) -->
    <param name="forward_pcr_primer"
           type="text"
           value=""
           label="Forward PCR primer sequence"
           help="Optional; must not include barcode or adapter sequence (-g)" />
    <param name="reverse_pcr_primer"
           type="text"
           value=""
           label="Reverse PCR primer sequence"
           help="Optional; must not include barcode or adapter sequence (-G)" />

    <!-- Numeric thresholds forwarded to the pipeline (-q, -O, -L, -l) -->
    <param name="trimming_threshold"
           type="integer"
           value="20"
           label="Threshold quality below which read will be trimmed"
           help="Phred score; default is 20 (-q)" />
    <param name="minimum_overlap"
           type="integer"
           value="10"
           label="Minimum overlap in bp between forward and reverse reads"
           help="Default is 10 (-O)" />
    <param name="minimum_length"
           type="integer"
           value="200"
           label="Minimum length in bp to keep sequence after overlapping"
           help="Default is 200 (-L)" />
    <param name="sliding_window_length"
           type="integer"
           value="10"
           label="Minimum length in bp to retain a read after trimming"
           help="Supplied to Sickle; default is 10 (-l)" />

    <!-- Pipeline selection (passed as -P) -->
    <param name="pipeline"
           type="select"
           label="Pipeline to use for analysis">
      <option value="Vsearch" selected="true">Vsearch</option>
      <!--
      Remove the QIIME and Uparse options for now
      <option value="QIIME">QIIME</option>
      <option value="Uparse">Uparse</option>
      -->
    </param>

    <!-- Each option value is the command line flag itself; the empty
         value (GreenGenes) adds no flag -->
    <param name="reference_database"
           type="select"
           label="Reference database">
      <option value="" selected="true">GreenGenes</option>
      <option value="-S">Silva</option>
      <option value="-H">Human Oral Microbiome Database (HOMD)</option>
    </param>
  </inputs>
  <outputs>
    <!-- Tabular, BIOM and FASTA datasets copied out of the pipeline's
         working directory -->
    <data name="metatable_mod"
          format="tabular"
          label="${tool.name}:${title} Metatable_mod.txt" />
    <data name="read_counts_out"
          format="tabular"
          label="${tool.name} (${pipeline}):${title} read counts" />
    <data name="tax_otu_table_biom_file"
          format="biom"
          label="${tool.name} (${pipeline}):${title} tax OTU table (biom format)" />
    <data name="otus_tre_file"
          format="tabular"
          label="${tool.name} (${pipeline}):${title} otus.tre" />
    <data name="phylum_genus_dist_barcharts_html"
          format="html"
          label="${tool.name} (${pipeline}):${title} phylum genus dist barcharts HTML" />
    <data name="otus_count_file"
          format="tabular"
          label="${tool.name} (${pipeline}):${title} OTUs count file" />
    <data name="table_summary_file"
          format="tabular"
          label="${tool.name} (${pipeline}):${title} table summary file" />
    <data name="dereplicated_nonchimera_otus_fasta"
          format="fasta"
          label="${tool.name} (${pipeline}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />

    <!-- HTML reports -->
    <data name="fastqc_quality_boxplots_html"
          format="html"
          label="${tool.name} (${pipeline}):${title} FastQC per-base quality boxplots HTML" />
    <data name="heatmap_otu_table_html"
          format="html"
          label="${tool.name} (${pipeline}):${title} heatmap OTU table HTML" />
    <data name="beta_div_even_weighted_2d_plots"
          format="html"
          label="${tool.name} (${pipeline}):${title} beta diversity weighted 2D plots HTML" />
    <data name="beta_div_even_unweighted_2d_plots"
          format="html"
          label="${tool.name} (${pipeline}):${title} beta diversity unweighted 2D plots HTML" />
    <data name="alpha_div_rarefaction_plots"
          format="html"
          label="${tool.name} (${pipeline}):${title} alpha diversity rarefaction plots HTML" />

    <!-- Only created when a Categories file was supplied -->
    <data name="alpha_div_boxplots"
          format="html"
          label="${tool.name} (${pipeline}):${title} alpha diversity boxplots">
      <filter>categories_file_in is not None</filter>
    </data>

    <!-- Pipeline log files, presented through an HTML index page -->
    <data name="log_files"
          format="html"
          label="${tool.name} (${pipeline}):${title} log files" />
  </outputs>
  <tests>
    <!-- No functional tests are defined for this tool yet -->
  </tests>
  <help><![CDATA[

What it does
------------

This pipeline has been designed for the analysis of 16S rRNA data from
Illumina Miseq (Casava >= 1.8) paired-end reads.
  
Usage
-----

1. Preparation of the mapping file and format of unique sample id
*****************************************************************

Before using the amplicon analysis pipeline, follow the steps below to
avoid analysis failures and to ensure that samples are labelled
appropriately. Sample names for the labelling are derived from the names
of the fastq files generated by the sequencing. The labels will include
everything between the beginning of the name and the sample number
(from C11 to S19 in Fig. 1).

.. image:: Pipeline_description_Fig1.png
   :height: 46
   :width: 382

**Figure 1**

If analysing 16S data from multiple runs: 

The samples from different runs may have identical IDs. For example,
when sequencing the same samples twice, by chance, these could be at
the same position in both the runs. This would cause the fastq files
to have exactly the same IDs (Fig. 2).

.. image:: Pipeline_description_Fig2.png
   :height: 100
   :width: 463

**Figure 2**

In case of identical sample IDs the pipeline will fail to run and
generate an error at the beginning of the analysis.

To avoid having to change the file names, before uploading the files,
ensure that the sample IDs are not repeated.

2. To upload the file
*********************

Click on **Get Data/Upload File** from the Galaxy tool panel on the
left hand side.

From the pop-up window, choose how to upload the file. The
**Choose local file** option can be used for files up to 4Gb. Fastq files
from Illumina MiSeq will rarely be bigger than 4Gb and this option is
recommended.

After choosing the files click **Start** to begin the upload. The window can
now be closed and the files will be uploaded onto the Galaxy server. You
will see the progress on the ``HISTORY`` panel on the right
side of the screen. The colour will change from grey (queuing), to yellow
(uploading) and finally green (uploaded).

Once all the files are uploaded, click on the operations on multiple
datasets icon and select the fastq files that need to be analysed.
Click on the tab **For all selected...** and on the option
**Build List of Dataset pairs** (Fig. 3).

.. image:: Pipeline_description_Fig3.png
   :height: 247
   :width: 586

**Figure 3**

Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``.
The fastq files forward R1 and reverse R2 should now appear in the
corresponding columns.

Select **Autopair**. This creates a collection of paired fastq files for
the forward and reverse reads for each sample. The name of the pairs will
be the ones used by the pipeline. You are free to change the names at this
point as long as they are the same used in the Metatable file
(see section 3).

Name the collection and click on **create list**. This reduces the time
required to input the forward and reverse reads for each individual sample.

3. Create the Metatable files
*****************************

Metatable.txt
~~~~~~~~~~~~~

Click on the list of pairs you just created to see the name of the single
pairs. The name of the pairs will be the ones used by the pipeline,
therefore, these are the names that need to be used in the Metatable file.

The Metatable file has to be in QIIME format. You can find a description
of it on QIIME website http://qiime.org/documentation/file_formats.html

EXAMPLE::

    #SampleID    BarcodeSequence    LinkerPrimerSequence    Disease    Gender    Description
    Mock-RUN1    TAAGGCGAGCGTAAGA                           PsA        Male      Control
    Mock-RUN2    CGTACTAGGCGTAAGA                           PsA        Male      Control
    Mock-RUN3    AGGCAGAAGCGTAAGA                           PsC        Female    Control

Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
deleted. The header is very important. ``#SampleID``, ``BarcodeSequence``,
``LinkerPrimerSequence`` and ``Description`` are mandatory. Between
``LinkerPrimerSequence`` and ``Description`` you can add as many columns
as you want. For every column a PCoA plot will be created (see
**Results** section). You can create this file in Excel and it will have
to be saved as ``Text(Tab delimited)``.

During the analysis the Metatable.txt will be checked to ensure that the
file has the correct format. If necessary, it will be modified; the
checked version will be available as ``Metatable_mod.txt`` in the history
panel. If you are going to use the metatable file for any other
statistical analyses, remember to use the ``Metatable_mod.txt`` one,
otherwise the sample names might not match!

Categories.txt (optional) 
~~~~~~~~~~~~~~~~~~~~~~~~~

This file is required if you want to get box plots for comparison of
alpha diversity indices (see **Results** section). The file is a list
(without header and IN ONE COLUMN) of categories present in the
Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
ONES USED IN THE METATABLE.TXT. You can create this file in Excel and it
will have to be saved as ``Text(Tab delimited)``.

EXAMPLE::

    Disease
    Gender

Metatable and categories files can be uploaded using Get Data as done
with the fastq files.

4. Analysis
***********

Under **Amplicon_Analysis_Pipeline**

 * **Title** Name to distinguish between the runs. It will be shown at
   the beginning of each output file name.

 * **Input Metatable.txt file** Select the Metatable.txt file related to
   this analysis

 * **Input Categories.txt file (Optional)** Select the Categories.txt file
   related to this analysis

 * **Input FASTQ type** select *Dataset pairs in a collection* and, then,
   the collection of pairs you created earlier.

 * **Forward/Reverse PCR primer sequence** if the PCR primer sequences
   have not been removed from the MiSeq during the fastq creation, they
   have to be removed before the analysis. Insert the PCR primer sequence
   in the corresponding field. DO NOT include any barcode or adapter
   sequence. If the PCR primers have been already trimmed by the MiSeq,
   and you include the sequence in this field, this would lead to an error.
   Only include the sequences if still present in the fastq files.

 * **Threshold quality below which reads will be trimmed** Choose the
   Phred score used by Sickle to trim the reads at the 3’ end.

 * **Minimum length to retain a read after trimming** If the read length
   after trimming is shorter than a user defined length, the read, along
   with the corresponding read pair, will be discarded.

 * **Minimum overlap in bp between forward and reverse reads** Choose the
   minimum basepair overlap used by Pandaseq to assemble the reads.
   Default is 10.

 * **Minimum length in bp to keep a sequence after overlapping** Choose the
   minimum sequence length used by Pandaseq to keep a sequence after the
   overlapping. This depends on the expected amplicon length. The default
   in this Galaxy tool is 200; a value of 380 is used for V3-V4 16S
   sequencing (expected length ~440bp)

 * **Pipeline to use for analysis** Choose the pipeline to use for OTU
   clustering and chimera removal. The Galaxy tool currently supports
   ``Vsearch`` only. ``Uparse`` and ``QIIME`` are planned to be added
   shortly (the tools are already available for the stand-alone pipeline).

 * **Reference database** Choose between the ``GreenGenes``, ``Silva`` and
   ``Human Oral Microbiome Database (HOMD)`` databases for taxa assignment.

Click on **Execute** to start the analysis.

5. Results
**********

Results are entirely generated using QIIME scripts. The results will 
appear in the History panel when the analysis is completed

 * **Vsearch_tax_OTU_table (biom format)** The OTU table in BIOM format
   (http://biom-format.org/)

 * **Vsearch_OTUs.tree** Phylogenetic tree constructed using
   ``make_phylogeny.py`` (fasttree) QIIME script
   (http://qiime.org/scripts/make_phylogeny.html)

 * **Vsearch_phylum_genus_dist_barcharts_HTML** HTML file with bar
   charts at Phylum, Genus and Species level
   (http://qiime.org/scripts/summarize_taxa.html and
   http://qiime.org/scripts/plot_taxa_summary.html)

 * **Vsearch_OTUs_count_file** Summary of OTU counts per sample
   (http://biom-format.org/documentation/summarizing_biom_tables.html)

 * **Vsearch_table_summary_file** Summary of sequences counts per sample
   (http://biom-format.org/documentation/summarizing_biom_tables.html)

 * **Vsearch_multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta**
   Fasta file with OTU sequences

 * **Vsearch_heatmap_OTU_table_HTML** Interactive OTU heatmap
   (http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )

 * **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
   format using weighted Unifrac distance measure. Samples are grouped
   by the column names present in the Metatable file. The samples are
   firstly rarefied to the minimum sequencing depth
   (http://qiime.org/scripts/beta_diversity_through_plots.html )

 * **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML
   format using Unweighted Unifrac distance measure. Samples are grouped
   by the column names present in the Metatable file. The samples are
   firstly rarefied to the minimum sequencing depth
   (http://qiime.org/scripts/beta_diversity_through_plots.html )

Code availability
-----------------

**Code is available at** https://github.com/MTutino/Amplicon_analysis

Credits
-------

Pipeline author: Mauro Tutino

Galaxy tool: Peter Briggs

	]]></help>
  <citations>
    <!-- BibTeX entry citing the pipeline's GitHub repository -->
    <citation type="bibtex">
      @misc{githubAmplicon_analysis,
      author = {Tutino, Mauro},
      year = {2017},
      title = {Amplicon Analysis Pipeline},
      publisher = {GitHub},
      journal = {GitHub repository},
      url = {https://github.com/MTutino/Amplicon_analysis},
}</citation>
  </citations>
</tool>