Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
diff amplicon_analysis_pipeline.xml @ 0:47ec9c6f44b8 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author | pjbriggs |
---|---|
date | Thu, 09 Nov 2017 10:13:29 -0500 |
parents | |
children | 1c1902e12caf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/amplicon_analysis_pipeline.xml Thu Nov 09 10:13:29 2017 -0500 @@ -0,0 +1,485 @@ +<tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.0.6"> + <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description> + <requirements> + <requirement type="package" version="1.1">amplicon_analysis_pipeline</requirement> + <requirement type="package" version="1.11">cutadapt</requirement> + <requirement type="package" version="1.33">sickle</requirement> + <requirement type="package" version="27-08-2013">bioawk</requirement> + <requirement type="package" version="2.8.1">pandaseq</requirement> + <requirement type="package" version="3.5.0">spades</requirement> + <requirement type="package" version="0.11.3">fastqc</requirement> + <requirement type="package" version="1.8.0">qiime</requirement> + <requirement type="package" version="2.2.26">blast</requirement> + <requirement type="package" version="0.2.4">fasta-splitter</requirement> + <requirement type="package" version="2.2">rdp-classifier</requirement> + <requirement type="package" version="3.2.0">R</requirement> + <requirement type="package" version="1.1.3">vsearch</requirement> + <requirement type="package" version="2010-04-29">microbiomeutil</requirement> + <requirement type="package">fasta_number</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + ## Set the reference database name used in the RESULTS/ paths below (an empty value selects GreenGenes, "-S" selects Silva) + #if str( $reference_database ) == "" + #set reference_database_name = "gg" + #else + #set reference_database_name = "silva" + #end if + + ## Run the amplicon analysis pipeline wrapper + python $__tool_directory__/amplicon_analysis_pipeline.py + ## Set options + #if str( $forward_pcr_primer ) != "" + -g "$forward_pcr_primer" + #end if + #if str( $reverse_pcr_primer ) != "" + -G "$reverse_pcr_primer" + #end if + #if str( $trimming_threshold ) != "" + -q $trimming_threshold + #end if + #if str( 
$sliding_window_length ) != "" + -l $sliding_window_length + #end if + #if str( $minimum_overlap ) != "" + -O $minimum_overlap + #end if + #if str( $minimum_length ) != "" + -L $minimum_length + #end if + -P $pipeline + -r \$AMPLICON_ANALYSIS_REF_DATA_PATH + #if str( $reference_database ) != "" + "${reference_database}" + #end if + #if str($categories_file_in) != 'None' + -c "${categories_file_in}" + #end if + ## Input files + "${metatable_file_in}" + ## FASTQ pairs: collection elements expose forward/reverse, repeat entries expose the fastq_r1/fastq_r2 params + #if str($input_type.pairs_or_collection) == "collection" + #for $fq_pair in $input_type.fastq_collection + "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}" + #end for + #else + #for $fq_pair in $input_type.fastq_pairs + "${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}" + #end for + #end if + && + + ## Collect outputs + cp Metatable_log/Metatable_mod.txt "${metatable_mod}" && + cp ${pipeline}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" && + cp ${pipeline}_OTU_tables/otus.tre "${otus_tre_file}" && + cp RESULTS/${pipeline}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" && + cp RESULTS/${pipeline}_${reference_database_name}/table_summary.txt "${table_summary_file}" && + cp Multiplexed_files/${pipeline}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" && + cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" && + cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" && + + ## HTML outputs (mkdir -p so an existing extra-files directory does not break the && chain) + + ## OTU table + mkdir -p "${heatmap_otu_table_html.files_path}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/Heatmap/js "${heatmap_otu_table_html.files_path}" && + cp RESULTS/${pipeline}_${reference_database_name}/Heatmap/otu_table.html "${heatmap_otu_table_html}" && + + ## Phylum genus barcharts + mkdir -p "${phylum_genus_dist_barcharts_html.files_path}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/charts 
"${phylum_genus_dist_barcharts_html.files_path}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/raw_data "${phylum_genus_dist_barcharts_html.files_path}" && + cp RESULTS/${pipeline}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" && + + ## Beta diversity weighted 2d plots + mkdir -p "${beta_div_even_weighted_2d_plots.files_path}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/* "${beta_div_even_weighted_2d_plots.files_path}" && + cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" && + + ## Beta diversity unweighted 2d plots + mkdir -p "${beta_div_even_unweighted_2d_plots.files_path}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* "${beta_div_even_unweighted_2d_plots.files_path}" && + cp RESULTS/${pipeline}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" && + + ## Alpha diversity rarefaction plots + mkdir -p "${alpha_div_rarefaction_plots.files_path}" && + cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html "${alpha_div_rarefaction_plots}" && + cp -r RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots "${alpha_div_rarefaction_plots.files_path}" && + + ## Categories data + #if str($categories_file_in) != 'None' + ## Alpha diversity boxplots + mkdir -p "${alpha_div_boxplots.files_path}" && + cp alpha_diversity_boxplots.html "$alpha_div_boxplots" && + cp RESULTS/${pipeline}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf "${alpha_div_boxplots.files_path}" && + #end if + + ## Pipeline outputs (log files etc) + mkdir -p "${log_files.files_path}" && + cp Amplicon_analysis_pipeline.log "${log_files.files_path}" && + cp pipeline.log 
"${log_files.files_path}" && + cp Pipeline_outputs.txt "${log_files.files_path}" && + cp Metatable_log/Metatable.html "${log_files.files_path}" && + cp pipeline_outputs.html "$log_files" + ]]></command> + <inputs> + <param name="title" type="text" value="test" size="25" + label="Title" help="Optional text that will be added to the output dataset names" /> + <param type="data" name="metatable_file_in" format="tabular" + label="Input Metatable.txt file" /> + <param type="data" name="categories_file_in" format="txt" + label="Input Categories.txt file" optional="true" + help="(optional)" /> + <conditional name="input_type"> + <param name="pairs_or_collection" type="select" + label="Input FASTQ type"> + <option value="pairs_of_files">Pairs of datasets</option> + <option value="collection" selected="true">Dataset pairs in a collection</option> + </param> + <when value="collection"> + <param name="fastq_collection" type="data_collection" + format="fastqsanger,fastq" collection_type="list:paired" + label="Collection of FASTQ forward and reverse (R1/R2) pairs" + help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " /> + </when> + <when value="pairs_of_files"> + <repeat name="fastq_pairs" title="Input fastq pairs" min="1"> + <param type="text" name="name" value="" + label="Final name for FASTQ pair" /> + <param type="data" name="fastq_r1" format="fastqsanger,fastq" + label="FASTQ with forward reads (R1)" /> + <param type="data" name="fastq_r2" format="fastqsanger,fastq" + label="FASTQ with reverse reads (R2)" /> + </repeat> + </when> + </conditional> + <param type="text" name="forward_pcr_primer" value="" + label="Forward PCR primer sequence" + help="Optional; must not include barcode or adapter sequence (-g)" /> + <param type="text" name="reverse_pcr_primer" value="" + label="Reverse PCR primer sequence" + help="Optional; must not include barcode or adapter sequence (-G)" /> + <param type="integer" 
name="trimming_threshold" value="20" + label="Threshold quality below which read will be trimmed" + help="Phred score; default is 20 (-q)" /> + <param type="integer" name="minimum_overlap" value="10" + label="Minimum overlap in bp between forward and reverse reads" + help="Default is 10 (-O)" /> + <param type="integer" name="minimum_length" value="200" + label="Minimum length in bp to keep sequence after overlapping" + help="Default is 200 (-L)" /> + <param type="integer" name="sliding_window_length" value="10" + label="Minimum length in bp to retain a read after trimming" + help="Supplied to Sickle; default is 10 (-l)" /> + <param type="select" name="pipeline" + label="Pipeline to use for analysis"> + <option value="Vsearch" selected="true" >Vsearch</option> + <!-- + Remove the QIIME and Uparse options for now + <option value="QIIME">QIIME</option> + <option value="Uparse">Uparse</option> + --> + </param> + <param type="select" name="reference_database" + label="Reference database"> + <option value="" selected="true">GreenGenes</option> + <option value="-S">Silva</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="metatable_mod" + label="${tool.name}:${title} Metatable_mod.txt" /> + <data format="tabular" name="read_counts_out" + label="${tool.name} (${pipeline}):${title} read counts" /> + <data format="biom" name="tax_otu_table_biom_file" + label="${tool.name} (${pipeline}):${title} tax OTU table (biom format)" /> + <data format="tabular" name="otus_tre_file" + label="${tool.name} (${pipeline}):${title} otus.tre" /> + <data format="html" name="phylum_genus_dist_barcharts_html" + label="${tool.name} (${pipeline}):${title} phylum genus dist barcharts HTML" /> + <data format="tabular" name="otus_count_file" + label="${tool.name} (${pipeline}):${title} OTUs count file" /> + <data format="tabular" name="table_summary_file" + label="${tool.name} (${pipeline}):${title} table summary file" /> + <data format="fasta" 
name="dereplicated_nonchimera_otus_fasta" + label="${tool.name} (${pipeline}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" /> + <data format="html" name="fastqc_quality_boxplots_html" + label="${tool.name} (${pipeline}):${title} FastQC per-base quality boxplots HTML" /> + <data format="html" name="heatmap_otu_table_html" + label="${tool.name} (${pipeline}):${title} heatmap OTU table HTML" /> + <data format="html" name="beta_div_even_weighted_2d_plots" + label="${tool.name} (${pipeline}):${title} beta diversity weighted 2D plots HTML" /> + <data format="html" name="beta_div_even_unweighted_2d_plots" + label="${tool.name} (${pipeline}):${title} beta diversity unweighted 2D plots HTML" /> + <data format="html" name="alpha_div_rarefaction_plots" + label="${tool.name} (${pipeline}):${title} alpha diversity rarefaction plots HTML" /> + <data format="html" name="alpha_div_boxplots" + label="${tool.name} (${pipeline}):${title} alpha diversity boxplots"> + <filter>categories_file_in is not None</filter> + </data> + <data format="html" name="log_files" + label="${tool.name} (${pipeline}):${title} log files" /> + </outputs> + <tests> + </tests> + <help><![CDATA[ + +What it does +------------ + +This pipeline has been designed for the analysis of 16S rRNA data from +Illumina Miseq (Casava >= 1.8) paired-end reads. + +Usage +----- + +1. Preparation of the mapping file and format of unique sample id +***************************************************************** + +Before using the amplicon analysis pipeline it would be necessary to +follow the steps as below to avoid analysis failures and ensure samples +are labelled appropriately. Sample names for the labelling are derived +from the fastq files names that are generated from the sequencing. The +labels will include everything between the beginning of the name and +the sample number (from C11 to S19 in Fig. 1) + +.. 
image:: Pipeline_description_Fig1.png + :height: 46 + :width: 382 + +**Figure 1** + +If analysing 16S data from multiple runs: + +The samples from different runs may have identical IDs. For example, +when sequencing the same samples twice, by chance, these could be at +the same position in both the runs. This would cause the fastq files +to have exactly the same IDs (Fig. 2). + +.. image:: Pipeline_description_Fig2.png + :height: 100 + :width: 463 + +**Figure 2** + +In case of identical sample IDs the pipeline will fail to run and +generate an error at the beginning of the analysis. + +To avoid having to change the file names, before uploading the files, +ensure that the samples IDs are not repeated. + +2. To upload the file +********************* + +Click on **Get Data/Upload File** from the Galaxy tool panel on the +left hand side. + +From the pop-up window, choose how to upload the file. The +**Choose local file** option can be used for files up to 4Gb. Fastq files +from Illumina MiSeq will rarely be bigger than 4Gb and this option is +recommended. + +After choosing the files click **Start** to begin the upload. The window can +now be closed and the files will be uploaded onto the Galaxy server. You +will see the progress on the ``HISTORY`` panel on the right +side of the screen. The colour will change from grey (queuing), to yellow +(uploading) and finally green (uploaded). + +Once all the files are uploaded, click on the operations on multiple +datasets icon and select the fastq files that need to be analysed. +Click on the tab **For all selected...** and on the option +**Build List of Dataset pairs** (Fig. 3). + +.. image:: Pipeline_description_Fig3.png + :height: 247 + :width: 586 + +**Figure 3** + +Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``. +The fastq files forward R1 and reverse R2 should now appear in the +corresponding columns. + +Select **Autopair**. 
This creates a collection of paired fastq files for +the forward and reverse reads for each sample. The name of the pairs will +be the ones used by the pipeline. You are free to change the names at this +point as long as they are the same used in the Metatable file +(see section 3). + +Name the collection and click on **create list**. This reduces the time +required to input the forward and reverse reads for each individual sample. + +3. Create the Metatable files +***************************** + +Metatable.txt +~~~~~~~~~~~~~ + +Click on the list of pairs you just created to see the name of the single +pairs. The name of the pairs will be the ones used by the pipeline, +therefore, these are the names that need to be used in the Metatable file. + +The Metatable file has to be in QIIME format. You can find a description +of it on QIIME website http://qiime.org/documentation/file_formats.html + +EXAMPLE:: + + #SampleID BarcodeSequence LinkerPrimerSequence Disease Gender Description + Mock-RUN1 TAAGGCGAGCGTAAGA PsA Male Control + Mock-RUN2 CGTACTAGGCGTAAGA PsA Male Control + Mock-RUN3 AGGCAGAAGCGTAAGA PsC Female Control + +Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be +deleted. The header is very important. ``#SampleID``, ``BarcodeSequence``, +``LinkerPrimerSequence`` and ``Description`` are mandatory. Between +``LinkerPrimerSequence`` and ``Description`` you can add as many columns +as you want. For every column a PCoA plot will be created (see +**Results** section). You can create this file in Excel and it will have +to be saved as ``Text(Tab delimited)``. + +During the analysis the Metatable.txt will be checked to ensure that the +file has the correct format. If necessary, this will be modified and will +be available as ``Metatable_mod.txt`` in the history panel. If you are +going to use the metatable file for any other statistical analyses, +remember to use the ``Metatable_mod.txt`` one, otherwise the sample +names might not match! 
+ +Categories.txt (optional) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This file is required if you want to get box plots for comparison of +alpha diversity indices (see **Results** section). The file is a list +(without header and IN ONE COLUMN) of categories present in the +Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE +ONES USED IN THE METATABLE.TXT. You can create this file in Excel and +it will have to be saved as ``Text(Tab delimited)``. + +EXAMPLE:: + + Disease + Gender + +Metatable and categories files can be uploaded using Get Data as done +with the fastq files. + +4. Analysis +*********** + +Under **Amplicon_Analysis_Pipeline** + + * **Title** Name to distinguish between the runs. It will be shown at + the beginning of each output file name. + + * **Input Metatable.txt file** Select the Metatable.txt file related to + this analysis + + * **Input Categories.txt file (Optional)** Select the Categories.txt file + related to this analysis + + * **Input FASTQ type** select *Dataset pairs in a collection* and, then, + the collection of pairs you created earlier. + + * **Forward/Reverse PCR primer sequence** if the PCR primer sequences + have not been removed from the MiSeq during the fastq creation, they + have to be removed before the analysis. Insert the PCR primer sequence + in the corresponding field. DO NOT include any barcode or adapter + sequence. If the PCR primers have been already trimmed by the MiSeq, + and you include the sequence in this field, this would lead to an error. + Only include the sequences if still present in the fastq files. + + * **Threshold quality below which reads will be trimmed** Choose the + Phred score used by Sickle to trim the reads at the 3’ end. + + * **Minimum length to retain a read after trimming** If the read length + after trimming is shorter than a user defined length, the read, along + with the corresponding read pair, will be discarded. 
+ + * **Minimum overlap in bp between forward and reverse reads** Choose the + minimum basepair overlap used by Pandaseq to assemble the reads. + Default is 10. + + * **Minimum length in bp to keep a sequence after overlapping** Choose the + minimum sequence length used by Pandaseq to keep a sequence after the + overlapping. This depends on the expected amplicon length. The default in + this form is 200; 380 is used for V3-V4 16S sequencing (expected length ~440bp) + + * **Pipeline to use for analysis** Choose the pipeline to use for OTU + clustering and chimera removal. The Galaxy tool currently supports + ``Vsearch`` only. ``Uparse`` and ``QIIME`` are planned to be added + shortly (the tools are already available for the stand-alone pipeline). + + * **Reference database** Choose between ``GreenGenes`` and ``Silva`` + databases for taxa assignment. + +Click on **Execute** to start the analysis. + +5. Results +********** + +Results are entirely generated using QIIME scripts. The results will +appear in the History panel when the analysis is completed. + + * **Vsearch_tax_OTU_table (biom format)** The OTU table in BIOM format + (http://biom-format.org/) + + * **Vsearch_OTUs.tree** Phylogenetic tree constructed using + ``make_phylogeny.py`` (fasttree) QIIME script + (http://qiime.org/scripts/make_phylogeny.html) + + * **Vsearch_phylum_genus_dist_barcharts_HTML** HTML file with bar + charts at Phylum, Genus and Species level + (http://qiime.org/scripts/summarize_taxa.html and + http://qiime.org/scripts/plot_taxa_summary.html) + + * **Vsearch_OTUs_count_file** Summary of OTU counts per sample + (http://biom-format.org/documentation/summarizing_biom_tables.html) + + * **Vsearch_table_summary_file** Summary of sequences counts per sample + (http://biom-format.org/documentation/summarizing_biom_tables.html) + + * **Vsearch_multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta** + Fasta file with OTU sequences + + * **Vsearch_heatmap_OTU_table_HTML** Interactive OTU heatmap + 
(http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html ) + + * **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML + format using weighted Unifrac distance measure. Samples are grouped + by the column names present in the Metatable file. The samples are + firstly rarefied to the minimum sequencing depth + (http://qiime.org/scripts/beta_diversity_through_plots.html ) + + * **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML + format using Unweighted Unifrac distance measure. Samples are grouped + by the column names present in the Metatable file. The samples are + firstly rarefied to the minimum sequencing depth + (http://qiime.org/scripts/beta_diversity_through_plots.html ) + +Code availability +----------------- + +**Code is available at** https://github.com/MTutino/Amplicon_analysis + +Credits +------- + +Pipeline author: Mauro Tutino + +Galaxy tool: Peter Briggs + + ]]></help> + <citations> + <citation type="bibtex"> + @misc{githubAmplicon_analysis, + author = {Tutino, Mauro}, + year = {2017}, + title = {Amplicon Analysis Pipeline}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/MTutino/Amplicon_analysis}, +}</citation> + </citations> +</tool>