Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
view amplicon_analysis_pipeline.xml @ 1:1c1902e12caf draft
Updated to version 1.2.1.0
author | pjbriggs |
---|---|
date | Wed, 25 Apr 2018 03:54:00 -0400 |
parents | 47ec9c6f44b8 |
children | 43d6f81bc667 |
line wrap: on
line source
<tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.2.1.0">
  <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
  <!-- Packages that must be resolvable in the job environment -->
  <requirements>
    <requirement type="package" version="1.2.1">amplicon_analysis_pipeline</requirement>
    <requirement type="package" version="1.11">cutadapt</requirement>
    <requirement type="package" version="1.33">sickle</requirement>
    <requirement type="package" version="27-08-2013">bioawk</requirement>
    <requirement type="package" version="2.8.1">pandaseq</requirement>
    <requirement type="package" version="3.5.0">spades</requirement>
    <requirement type="package" version="0.11.3">fastqc</requirement>
    <requirement type="package" version="1.8.0">qiime</requirement>
    <requirement type="package" version="2.2.26">blast</requirement>
    <requirement type="package" version="0.2.4">fasta-splitter</requirement>
    <requirement type="package" version="2.2">rdp-classifier</requirement>
    <requirement type="package" version="3.2.0">R</requirement>
    <requirement type="package" version="1.1.3">vsearch</requirement>
    <requirement type="package" version="2010-04-29">microbiomeutil</requirement>
    <requirement type="package">fasta_number</requirement>
  </requirements>
  <!-- This rule matches every exit code of 1 or above -->
  <stdio>
    <exit_code range="1:" />
  </stdio>
  <!--
    Cheetah template for the job command line. Directives (#if, #set,
    #for) and "##" comments must each sit on their own line; the shell
    steps that collect the outputs are chained with "&&" so the first
    failing step stops the chain.
  -->
  <command><![CDATA[
  ## Set the reference database name
  ## (used below to locate the RESULTS/<pipeline>_<database> directory)
  #if $reference_database == "-S"
    #set reference_database_name = "silva"
  #else if $reference_database == "-H"
    #set reference_database_name = "homd"
  #else
    #set reference_database_name = "gg"
  #end if
  ## Run the amplicon analysis pipeline wrapper
  python "$__tool_directory__/amplicon_analysis_pipeline.py"
  ## Set options
  #if str( $forward_pcr_primer ) != ""
  -g "$forward_pcr_primer"
  #end if
  #if str( $reverse_pcr_primer ) != ""
  -G "$reverse_pcr_primer"
  #end if
  #if str( $trimming_threshold ) != ""
  -q $trimming_threshold
  #end if
  #if str( $sliding_window_length ) != ""
  -l $sliding_window_length
  #end if
  #if str( $minimum_overlap ) != ""
  -O $minimum_overlap
  #end if
  #if str( $minimum_length ) != ""
  -L $minimum_length
  #end if
  -P $pipeline
  -r \$AMPLICON_ANALYSIS_REF_DATA_PATH
  #if str( $reference_database ) != ""
    ${reference_database}
  #end if
  #if str($categories_file_in) != 'None'
  -c "${categories_file_in}"
  #end if
  ## Input files
  "${metatable_file_in}"
  ## FASTQ pairs
  ## Elements of the list:paired collection are read via .forward/.reverse,
  ## whereas entries of the "fastq_pairs" repeat carry the parameters
  ## declared in <inputs> (fastq_r1/fastq_r2), so each case needs its own loop
  #if str($input_type.pairs_or_collection) == "collection"
    #for $fq_pair in $input_type.fastq_collection
     "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
    #end for
  #else
    #for $fq_pair in $input_type.fastq_pairs
     "${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}"
    #end for
  #end if
  &&
  ## Collect outputs
  cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
  cp ${pipeline}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
  cp ${pipeline}_OTU_tables/otus.tre "${otus_tre_file}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
  cp Multiplexed_files/${pipeline}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
  cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" &&
  cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&
  ## HTML outputs
  ## "mkdir -p" is used throughout so that an already existing files_path
  ## directory does not break the "&&" chain
  ## OTU table
  mkdir -p "${heatmap_otu_table_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/Heatmap/js "${heatmap_otu_table_html.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/Heatmap/otu_table.html "${heatmap_otu_table_html}" &&
  ## Phylum genus barcharts
  mkdir -p "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/charts "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/raw_data "${phylum_genus_dist_barcharts_html.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&
  ## Beta diversity weighted 2d plots
  mkdir -p "${beta_div_even_weighted_2d_plots.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/* "${beta_div_even_weighted_2d_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&
  ## Beta diversity unweighted 2d plots
  mkdir -p "${beta_div_even_unweighted_2d_plots.files_path}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* "${beta_div_even_unweighted_2d_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&
  ## Alpha diversity rarefaction plots
  mkdir -p "${alpha_div_rarefaction_plots.files_path}" &&
  cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html "${alpha_div_rarefaction_plots}" &&
  cp -r RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots "${alpha_div_rarefaction_plots.files_path}" &&
  ## Categories data
  #if str($categories_file_in) != 'None'
  ## Alpha diversity boxplots
  mkdir -p "${alpha_div_boxplots.files_path}" &&
  cp alpha_diversity_boxplots.html "$alpha_div_boxplots" &&
  cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf "${alpha_div_boxplots.files_path}" &&
  #end if
  ## Pipeline outputs (log files etc)
  mkdir -p "${log_files.files_path}" &&
  cp Amplicon_analysis_pipeline.log "${log_files.files_path}" &&
  cp pipeline.log "${log_files.files_path}" &&
  cp Pipeline_outputs.txt "${log_files.files_path}" &&
  cp Metatable_log/Metatable.html "${log_files.files_path}" &&
  cp pipeline_outputs.html "$log_files"
  ]]></command>
  <inputs>
    <!-- "title" is not passed to the pipeline; it only appears in the output labels -->
    <param name="title" type="text" value="test" size="25"
           label="Title"
           help="Optional text that will be added to the output dataset names" />
    <param type="data" name="metatable_file_in" format="tabular"
           label="Input Metatable.txt file" />
    <param type="data" name="categories_file_in" format="txt"
           label="Input Categories.txt file" optional="true"
           help="(optional)" />
    <!-- FASTQ input: either a list:paired collection or explicitly named R1/R2 pairs -->
    <conditional name="input_type">
      <param name="pairs_or_collection" type="select"
             label="Input FASTQ type">
        <option value="pairs_of_files">Pairs of datasets</option>
        <option value="collection" selected="true">Dataset pairs in a collection</option>
      </param>
      <when value="collection">
        <param name="fastq_collection" type="data_collection"
               format="fastqsanger,fastq" collection_type="list:paired"
               label="Collection of FASTQ forward and reverse (R1/R2) pairs"
               help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
      </when>
      <when value="pairs_of_files">
        <repeat name="fastq_pairs" title="Input fastq pairs" min="1">
          <param type="text" name="name" value=""
                 label="Final name for FASTQ pair" />
          <param type="data" name="fastq_r1" format="fastqsanger,fastq"
                 label="FASTQ with forward reads (R1)" />
          <param type="data" name="fastq_r2" format="fastqsanger,fastq"
                 label="FASTQ with reverse reads (R2)" />
        </repeat>
      </when>
    </conditional>
    <param type="text" name="forward_pcr_primer" value=""
           label="Forward PCR primer sequence"
           help="Optional; must not include barcode or adapter sequence (-g)" />
    <param type="text" name="reverse_pcr_primer" value=""
           label="Reverse PCR primer sequence"
           help="Optional; must not include barcode or adapter sequence (-G)" />
    <param type="integer" name="trimming_threshold" value="20"
           label="Threshold quality below which read will be trimmed"
           help="Phred score; default is 20 (-q)" />
    <param type="integer" name="minimum_overlap" value="10"
           label="Minimum overlap in bp between forward and reverse reads"
           help="Default is 10 (-O)" />
    <param type="integer" name="minimum_length" value="200"
           label="Minimum length in bp to keep sequence after overlapping"
           help="Default is 200 (-L)" />
    <param type="integer" name="sliding_window_length" value="10"
           label="Minimum length in bp to retain a read after trimming"
           help="Supplied to Sickle; default is 10 (-l)" />
    <param type="select" name="pipeline"
           label="Pipeline to use for analysis">
      <option value="Vsearch" selected="true" >Vsearch</option>
      <!--
      Remove the QIIME and Uparse options for now
      <option value="QIIME">QIIME</option>
      <option value="Uparse">Uparse</option>
      -->
    </param>
    <!-- The option value is the literal command line flag; GreenGenes passes no flag -->
    <param type="select" name="reference_database"
           label="Reference database">
      <option value="" selected="true">GreenGenes</option>
      <option value="-S">Silva</option>
      <option value="-H">Human Oral Microbiome Database (HOMD)</option>
    </param>
  </inputs>
  <outputs>
    <data format="tabular" name="metatable_mod"
          label="${tool.name}:${title} Metatable_mod.txt" />
    <data format="tabular" name="read_counts_out"
          label="${tool.name} (${pipeline}):${title} read counts" />
    <data format="biom" name="tax_otu_table_biom_file"
          label="${tool.name} (${pipeline}):${title} tax OTU table (biom format)" />
    <data format="tabular" name="otus_tre_file"
          label="${tool.name} (${pipeline}):${title} otus.tre" />
    <data format="html" name="phylum_genus_dist_barcharts_html"
          label="${tool.name} (${pipeline}):${title} phylum genus dist barcharts HTML" />
    <data format="tabular" name="otus_count_file"
          label="${tool.name} (${pipeline}):${title} OTUs count file" />
    <data format="tabular" name="table_summary_file"
          label="${tool.name} (${pipeline}):${title} table summary file" />
    <data format="fasta" name="dereplicated_nonchimera_otus_fasta"
          label="${tool.name} (${pipeline}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />
    <data format="html" name="fastqc_quality_boxplots_html"
          label="${tool.name} (${pipeline}):${title} FastQC per-base quality boxplots HTML" />
    <data format="html" name="heatmap_otu_table_html"
          label="${tool.name} (${pipeline}):${title} heatmap OTU table HTML" />
    <data format="html" name="beta_div_even_weighted_2d_plots"
          label="${tool.name} (${pipeline}):${title} beta diversity weighted 2D plots HTML" />
    <data format="html" name="beta_div_even_unweighted_2d_plots"
          label="${tool.name} (${pipeline}):${title} beta diversity unweighted 2D plots HTML" />
    <data format="html" name="alpha_div_rarefaction_plots"
          label="${tool.name} (${pipeline}):${title} alpha diversity rarefaction plots HTML" />
    <!-- Filtered on the optional categories file, matching the "Categories data" step of the command -->
    <data format="html" name="alpha_div_boxplots"
          label="${tool.name} (${pipeline}):${title} alpha diversity boxplots">
      <filter>categories_file_in is not None</filter>
    </data>
    <data format="html" name="log_files"
          label="${tool.name} (${pipeline}):${title} log files" />
  </outputs>
  <tests>
  </tests>
  <help><![CDATA[

What it does
------------

This pipeline has been designed for the analysis of 16S rRNA data from
Illumina Miseq (Casava >= 1.8) paired-end reads.

Usage
-----

1. Preparation of the mapping file and format of unique sample id
*****************************************************************

Before using the amplicon analysis pipeline it would be necessary to
follow the steps as below to avoid analysis failures and ensure samples
are labelled appropriately. Sample names for the labelling are derived
from the fastq files names that are generated from the sequencing. The
labels will include everything between the beginning of the name and
the sample number (from C11 to S19 in Fig. 1)

.. image:: Pipeline_description_Fig1.png
   :height: 46
   :width: 382

**Figure 1**

If analysing 16S data from multiple runs:

The samples from different runs may have identical IDs. For example,
when sequencing the same samples twice, by chance, these could be at
the same position in both the runs. This would cause the fastq files
to have exactly the same IDs (Fig. 2).

.. image:: Pipeline_description_Fig2.png
   :height: 100
   :width: 463

**Figure 2**

In case of identical sample IDs the pipeline will fail to run and
generate an error at the beginning of the analysis.

To avoid having to change the file names, before uploading the files,
ensure that the samples IDs are not repeated.

2. To upload the file
*********************

Click on **Get Data/Upload File** from the Galaxy tool panel on the
left hand side.

From the pop-up window, choose how to upload the file. The
**Choose local file** option can be used for files up to 4Gb. Fastq
files from Illumina MiSeq will rarely be bigger than 4Gb and this
option is recommended.

After choosing the files click **Start** to begin the upload. The
window can now be closed and the files will be uploaded onto the Galaxy
server. You will see the progress on the ``HISTORY`` panel on the right
side of the screen. The colour will change from grey (queuing), to
yellow (uploading) and finally green (uploaded).

Once all the files are uploaded, click on the operations on multiple
datasets icon and select the fastq files that need to be analysed.
Click on the tab **For all selected...** and on the option
**Build List of Dataset pairs** (Fig. 3).

.. image:: Pipeline_description_Fig3.png
   :height: 247
   :width: 586

**Figure 3**

Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and
``_R2``. The fastq files forward R1 and reverse R2 should now appear in
the corresponding columns.

Select **Autopair**. This creates a collection of paired fastq files
for the forward and reverse reads for each sample. The name of the
pairs will be the ones used by the pipeline. You are free to change the
names at this point as long as they are the same used in the Metatable
file (see section 3).

Name the collection and click on **create list**. This reduces the time
required to input the forward and reverse reads for each individual
sample.

3. Create the Metatable files
*****************************

Metatable.txt
~~~~~~~~~~~~~

Click on the list of pairs you just created to see the name of the
single pairs. The name of the pairs will be the ones used by the
pipeline, therefore, these are the names that need to be used in the
Metatable file.

The Metatable file has to be in QIIME format. You can find a
description of it on QIIME website
http://qiime.org/documentation/file_formats.html

EXAMPLE::

    #SampleID   BarcodeSequence    LinkerPrimerSequence   Disease   Gender   Description
    Mock-RUN1   TAAGGCGAGCGTAAGA                          PsA       Male     Control
    Mock-RUN2   CGTACTAGGCGTAAGA                          PsA       Male     Control
    Mock-RUN3   AGGCAGAAGCGTAAGA                          PsC       Female   Control

Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
deleted. The header is very important. ``#SampleID``,
``BarcodeSequence``, ``LinkerPrimerSequence`` and ``Description`` are
mandatory. Between ``LinkerPrimerSequence`` and ``Description`` you can
add as many columns as you want. For every column a PCoA plot will be
created (see **Results** section). You can create this file in Excel
and it will have to be saved as ``Text(Tab delimited)``.

During the analysis the Metatable.txt will be checked to ensure that
the file has the correct format. If necessary, this will be modified
and will be available as ``Metatable_mod.txt`` in the history panel. If
you are going to use the metatable file for any other statistical
analyses, remember to use the ``Metatable_mod.txt`` one, otherwise the
sample names might not match!

Categories.txt (optional)
~~~~~~~~~~~~~~~~~~~~~~~~~

This file is required if you want to get box plots for comparison of
alpha diversity indices (see **Results** section). The file is a list
(without header and IN ONE COLUMN) of categories present in the
Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
ONES USED IN THE METATABLE.TXT. You can create this file in Excel and
will have to be saved as ``Text(Tab delimited)``.

EXAMPLE::

    Disease
    Gender

Metatable and categories files can be uploaded using Get Data as done
with the fastq files.

4. Analysis
***********

Under **Amplicon_Analysis_Pipeline**

* **Title** Name to distinguish between the runs. It will be shown at
  the beginning of each output file name.

* **Input Metatable.txt file** Select the Metatable.txt file related to
  this analysis

* **Input Categories.txt file (Optional)** Select the Categories.txt
  file related to this analysis

* **Input FASTQ type** select *Dataset pairs in a collection* and,
  then, the collection of pairs you created earlier.

* **Forward/Reverse PCR primer sequence** if the PCR primer sequences
  have not been removed from the MiSeq during the fastq creation, they
  have to be removed before the analysis. Insert the PCR primer
  sequence in the corresponding field. DO NOT include any barcode or
  adapter sequence. If the PCR primers have been already trimmed by the
  MiSeq, and you include the sequence in this field, this would lead to
  an error. Only include the sequences if still present in the fastq
  files.

* **Threshold quality below which reads will be trimmed** Choose the
  Phred score used by Sickle to trim the reads at the 3’ end.

* **Minimum length to retain a read after trimming** If the read length
  after trimming is shorter than a user defined length, the read, along
  with the corresponding read pair, will be discarded.

* **Minimum overlap in bp between forward and reverse reads** Choose
  the minimum basepair overlap used by Pandaseq to assemble the reads.
  Default is 10.

* **Minimum length in bp to keep a sequence after overlapping** Choose
  the minimum sequence length used by Pandaseq to keep a sequence after
  the overlapping. This depends on the expected amplicon length.
  Default in this tool is 200; 380 is the value used for V3-V4 16S
  sequencing (expected length ~440bp)

* **Pipeline to use for analysis** Choose the pipeline to use for OTU
  clustering and chimera removal. The Galaxy tool currently supports
  ``Vsearch`` only. ``Uparse`` and ``QIIME`` are planned to be added
  shortly (the tools are already available for the stand-alone
  pipeline).

* **Reference database** Choose between ``GreenGenes``, ``Silva`` and
  ``Human Oral Microbiome Database (HOMD)`` databases for taxa
  assignment.

Click on **Execute** to start the analysis.

5. Results
**********

Results are entirely generated using QIIME scripts. The results will
appear in the History panel when the analysis is completed

* **Vsearch_tax_OTU_table (biom format)** The OTU table in BIOM format
  (http://biom-format.org/)

* **Vsearch_OTUs.tree** Phylogenetic tree constructed using
  ``make_phylogeny.py`` (fasttree) QIIME script
  (http://qiime.org/scripts/make_phylogeny.html)

* **Vsearch_phylum_genus_dist_barcharts_HTML** HTML file with bar
  charts at Phylum, Genus and Species level
  (http://qiime.org/scripts/summarize_taxa.html and
  http://qiime.org/scripts/plot_taxa_summary.html)

* **Vsearch_OTUs_count_file** Summary of OTU counts per sample
  (http://biom-format.org/documentation/summarizing_biom_tables.html)

* **Vsearch_table_summary_file** Summary of sequences counts per sample
  (http://biom-format.org/documentation/summarizing_biom_tables.html)

* **Vsearch_multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta**
  Fasta file with OTU sequences

* **Vsearch_heatmap_OTU_table_HTML** Interactive OTU heatmap
  (http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )

* **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
  format using weighted Unifrac distance measure. Samples are grouped
  by the column names present in the Metatable file. The samples are
  firstly rarefied to the minimum sequencing depth
  (http://qiime.org/scripts/beta_diversity_through_plots.html )

* **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in
  HTML format using Unweighted Unifrac distance measure. Samples are
  grouped by the column names present in the Metatable file. The
  samples are firstly rarefied to the minimum sequencing depth
  (http://qiime.org/scripts/beta_diversity_through_plots.html )

Code availability
-----------------

**Code is available at** https://github.com/MTutino/Amplicon_analysis

Credits
-------

Pipeline author: Mauro Tutino

Galaxy tool: Peter Briggs

  ]]></help>
  <citations>
    <citation type="bibtex">
      @misc{githubAmplicon_analysis,
      author = {Tutino, Mauro},
      year = {2017},
      title = {Amplicon Analysis Pipeline},
      publisher = {GitHub},
      journal = {GitHub repository},
      url = {https://github.com/MTutino/Amplicon_analysis},
    }</citation>
  </citations>
</tool>