Mercurial > repos > frogs > frogs_2_0_0

diff preprocess.xml @ 0:76c750c5f0d1 draft default tip
planemo upload for repository https://github.com/oinizan/FROGS-wrappers commit 0b900a51e220ce6f17c1e76292c06a5f4d934055-dirty
author: frogs
date: Thu, 25 Oct 2018 05:01:13 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocess.xml	Thu Oct 25 05:01:13 2018 -0400
@@ -0,0 +1,389 @@
+<?xml version="1.0"?>
+<!--
+# Copyright (C) 2015 INRA
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<tool id="FROGS_preprocess" name="FROGS Pre-process" version="2.0.1">
+        <description>Step 1 in metagenomics analysis: denoising and dereplication.</description>
+ 	<requirements>
+                <requirement type="package" version="2.0.1">frogs</requirement>
+        </requirements>
+        <stdio>
+                <exit_code range="1:" />
+                <exit_code range=":-1" />
+        </stdio>
+	<command>
+		preprocess.py $sequencer_type.sequencer_selected
+		--output-dereplicated $dereplicated_file --output-count $count_file --summary $summary_file 
+		--nb-cpus $nb_cpus
+		#if $sequencer_type.sequencer_selected == "illumina"
+		    --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size 
+		    #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard"
+		        --five-prim-primer $sequencer_type.sequencing_protocol.five_prim_primer --three-prim-primer $sequencer_type.sequencing_protocol.three_prim_primer  
+		    #else
+		        --without-primers
+		    #end if
+		#else
+		    --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size 
+		    --five-prim-primer $sequencer_type.five_prim_primer --three-prim-primer $sequencer_type.three_prim_primer 
+		#end if
+		
+		#if $sequencer_type.input_type.input_type_selected == "archive" 
+		    --input-archive $sequencer_type.input_type.archive_file
+		    #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_contiged"
+			    --already-contiged
+		    #elif $sequencer_type.sequencer_selected == "illumina"
+	            --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size 
+	            --expected-amplicon-size $sequencer_type.input_type.archive_type.expected_amplicon_size 
+	            --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate
+		    #end if
+		#else
+		    #set $sep = ' '
+		    #if $sequencer_type.sequencer_selected == "illumina"
+	            --samples-names  
+	            #for $current in $sequencer_type.input_type.files_by_samples_type.samples 
+	                $sep"${current.name.strip()}"
+	            #end for
+		        --input-R1 
+		        #for $current in $sequencer_type.input_type.files_by_samples_type.samples 
+		            $sep${current.R1_file}
+		        #end for
+		        #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_contiged"
+		            --already-contiged
+		        #else
+		            --input-R2
+		            #for $current in $sequencer_type.input_type.files_by_samples_type.samples 
+		                $sep${current.R2_file}
+		            #end for
+		            --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size 
+		            --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size 
+		            --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate
+		        #end if
+		    #else
+		        --input-R1 
+		        #for $current in $sequencer_type.input_type.samples 
+		            $sep${current.R1_file}
+		        #end for
+		        --samples-names  
+		        #for $current in $sequencer_type.input_type.samples 
+		            $sep"${current.name.strip()}"
+		        #end for
+		    #end if
+		#end if
+	</command>
+	<inputs>
+		<param name="nb_cpus" type="hidden" label="CPU number" help="The maximum number of CPUs used." value="1" />
+		<conditional name="sequencer_type">
+			<param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences.">
+				<option value="illumina" selected="true">Illumina</option>
+				<option value="454">454</option>
+			</param>
+			<when value="illumina">
+				<!-- Samples -->
+				<conditional name="input_type">
+					<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample.">
+						<option value="files_by_samples" selected="true">Files by samples</option>
+						<option value="archive">Archive</option>
+					</param>
+					<when value="archive">
+						<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" />
+						<conditional name="archive_type">
+							<param name="archive_type_selected" type="select" label="Reads already contiged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair.">
+								<option value="paired" selected="true">No</option>
+								<option value="already_contiged">Yes</option>
+							</param>
+							<when value="paired">
+								<!-- Reads size -->
+								<param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
+								<param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
+								<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
+								<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" />
+							</when>
+							<when value="already_contiged"></when>
+						</conditional>
+					</when>
+					<when value="files_by_samples">
+						<conditional name="files_by_samples_type">
+							<param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair.">
+								<option value="paired" selected="true">No</option>
+								<option value="already_contiged">Yes</option>
+							</param>
+							<when value="paired">
+								<!-- Samples -->
+								<repeat name="samples" title="Samples" min="1">
+									<param name="name" type="text" label="Name" help="The sample name." optional="false">
+										<validator type="empty_field" message="This parameter is required." />
+									</param>
+									<param format="fastq" name="R1_file" type="data" label="Reads 1" help="R1 FASTQ file of paired-end reads." />
+									<param format="fastq" name="R2_file" type="data" label="reads 2" help="R2 FASTQ file of paired-end reads." />
+								</repeat>
+								<!-- Reads size -->
+								<param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" />
+								<param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" />
+								<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
+								<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" />
+							</when>
+							<when value="already_contiged">
+								<repeat name="samples" title="Samples" min="1">
+									<param name="name" type="text" label="Name" help="The sample name." optional="false">
+										<validator type="empty_field" message="This parameter is required." />
+									</param>
+									<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of merged reads." />
+								</repeat>
+							</when>
+						</conditional>
+					</when>
+				</conditional>
+				<!-- Amplicons -->
+				<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons." value="" optional="false" />
+				<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons." value="" optional="false" />
+				<!-- Primers -->
+				<conditional name="sequencing_protocol">
+					<param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers.">
+						<option value="standard" selected="true">Illumina standard</option>
+						<option value="without_primers">Custom protocol (Kozich et al. 2013)</option>
+					</param>
+					<when value="standard">
+						<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
+							<validator type="empty_field" message="This parameter is required." />
+						</param>
+						<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
+							<validator type="empty_field" message="This parameter is required." />
+						</param>
+					</when>
+					<when value="without_primers"></when>
+				</conditional>
+			</when>
+			
+			<when value="454">
+				<!-- Samples -->
+				<conditional name="input_type">
+					<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample.">
+						<option value="files_by_samples" selected="true">One file by sample</option>
+						<option value="archive">Archive</option>
+					</param>
+					<when value="archive">
+						<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" />
+					</when>
+					<when value="files_by_samples">
+						<repeat name="samples" title="Samples" min="1">
+							<param name="name" type="text" label="Name" help="The sample name." optional="false" />
+							<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." />
+						</repeat>
+					</when>
+				</conditional>
+				<!-- Amplicons -->
+				<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" />
+				<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" />
+				<!-- Primers -->
+				<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
+					<validator type="empty_field" message="This parameter is required." />
+				</param>
+				<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
+					<validator type="empty_field" message="This parameter is required." />
+				</param>
+			</when>
+		</conditional>
+	</inputs>
+	<outputs>
+		<data format="fasta" name="dereplicated_file" label="${tool.name}: dereplicated.fasta" from_work_dir="dereplicated.fasta" />
+		<data format="tabular" name="count_file" label="${tool.name}: count.tsv" from_work_dir="count.tsv" />
+		<data format="html" name="summary_file" label="${tool.name}: report.html" from_work_dir="report.html" />
+	</outputs>
+        <tests>
+                <test>
+                        <conditional name="sequencer_type">
+				<param name="sequencer_selected" value="illumina"/>
+                                <conditional name="input_type">
+					<param name="input_type_selected" value="archive"/>
+					<param name="archive_file" value="test_dataset.tar.gz"/>
+					<conditional name="archive_type">
+						<param name="archive_type_selected" value="paired"/>
+						<param name="R1_size" value="250"/>
+						<param name="R2_size" value="250"/>
+						<param name="expected_amplicon_size" value="420"/>
+						<param name="mm_rate" value="0.15"/>
+					</conditional>
+				</conditional>
+				<param name="min_amplicon_size" value="380"/>
+				<param name="max_amplicon_size" value="460"/>
+				<conditional name="sequencing_protocol">
+					<param name="sequencing_protocol_selected" value="standard"/>
+					<param name="five_prim_primer" value="GGCGVACGGGTGAGTAA"/>
+					<param name="three_prim_primer" value="GTGCCAGCNGCNGCGG"/>
+				</conditional>
+                        </conditional>
+			<output name="dereplicated_file" file="references/01-prepro.fasta"/>
+			<output name="count_file" file="references/01-prepro.tsv"/>
+			<output name="summary_file" file="references/01-prepro.html" compare="sim_size" delta="0"/>
+                </test>
+        </tests>
+	<help>
+
+.. image:: static/images/FROGS_logo.png
+   :height: 144
+   :width: 110
+
+
+.. class:: infomark page-header h2
+
+What it does
+
+FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis.
+
+
+.. class:: infomark page-header h2
+
+Inputs/Outputs
+
+.. class:: h3
+
+Inputs
+
+Sample files added one after another or provide in an archive file (tar.gz).
+
+.. container:: row
+
+ .. container:: col-md-6
+
+  **Illumina inputs**
+
+   :Usage: For samples sequenced in paired-end. The amplicon length must be inferior to the length of the R1 plus R2 length. R1 and R2 are merged by the common region.
+   :Files: One R1 and R2 by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_)
+   :Example: splA_R1.fastq.gz,  splA_R2.fastq.gz,  splB_R1.fastq.gz,  splB_R2.fastq.gz
+
+  OR
+
+   :Usage: For samples sequenced in single-ends or when R1 and R2 reads are already merged.
+   :Files: One sequence file by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_).
+   :Example: splA.fastq.gz,  splB.fastq.gz
+
+ .. container:: col-md-6
+
+  **454 inputs**
+
+   :Files: One sequence file by sample (format `FASTQ &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_)
+   :Example: splA.fastq.gz,  splB.fastq.gz
+
+Remark: In an archive if you use R1 and R2 files they names must end with *_R1* and *_R2*. To upload an archive, see the "Upload archive" tool or if possible create symbolic link on your Galaxy account.
+
+.. class:: h3
+
+Outputs
+
+**Sequence file** (dereplicated.fasta):
+
+ Only one file with all samples sequences (format `FASTA &lt;https://en.wikipedia.org/wiki/FASTA_format&gt;`_). These sequences are dereplicated: strictly identical sequence are represented only one and the initial count is kept in count file.
+
+**Count file** (count.tsv):
+
+ This file contains the count of all unique sequences in each sample (format `TSV &lt;https://en.wikipedia.org/wiki/Tab-separated_values&gt;`_).
+
+**Summary file** (report.html):
+
+ This file reports the number of remaining sequences after each filter (format `HTML &lt;https://en.wikipedia.org/wiki/HTML&gt;`_).
+ 
+ .. image:: static/images/FROGS_preprocess_summary.png 
+     :height: 355
+     :width: 676 
+
+ It also presents the length distribution of the remaining sequences.
+ 
+ .. image:: static/images/FROGS_preprocess_lengthsSamples.png 
+     :height: 350
+     :width: 676 
+
+.. class:: infomark page-header h2
+
+How it works
+
+.. csv-table:: 
+   :header: "Steps", "Illumina", "454"
+   :widths: 5, 150, 150
+   :class: table table-striped
+
+   "1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlaped region (`FLASh &lt;http://ccb.jhu.edu/software/FLASH/&gt;`_). By default M is set to 10%", "/"
+   "2", "Filters merged sequences on their length which must be range between 'Minimum amplicon size' and 'Maximum amplicon size'", "/"
+   "3", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and then remove primers in the remaining sequence (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence and reverse complement the sequences on strand -  (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences"
+   "4", "Filters sequences on their length and with ambiguous nucleotides", "the tool removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleo-tides between two poor quality positions, i.e. with a Phred quality score lesser than 10" 
+   "5", "Dereplicates sequences", "Dereplicates sequences"
+
+
+.. class:: infomark page-header h2
+
+Advices/details on parameters
+
+.. class:: h3
+
+Primers parameters
+
+The (`Kozich et al. 2013 &lt;http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/&gt;`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers.
+
+In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation. 
+
+.. role:: alert-info
+
+Example:
+
+ 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3'
+ 
+ Value for parameter 5' primer: ATGCC
+ 
+ Value for parameter 3' primer: ATTTCAG
+
+.. class:: h3
+
+Amplicons sizes parameters
+
+ The two following images show two examples of perfect values fors sizes parameters.
+
+ .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal.png
+    :height: 415
+    :width: 676
+ 
+ .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal.png
+    :height: 415
+    :width: 676
+
+ Don't worry the "Expected amplicon size" does not need to be very accurate.
+
+.. class:: h3
+
+If the filter 'overlapped' reduce drasticaly the number of sequences:
+
+ In un-merged Illumina data, the reduction of dataset by the overlapped filter is classicaly inferior than 20%. A loss of more than 20% in all samples can highlight a quality problem.
+ 
+ If the overlap between R1 and R2 is superior to 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC &lt;http://www.bioinformatics.babraham.ac.uk/projects/fastqc/&gt;`_) you can try to cut the end of your sequences and relaunch the preprocess tool. 
+ You can either raise the mismatch percent in the overlapped region, but not too much!
+
+----
+
+**Contact**
+
+Contacts: frogs@inra.fr
+
+Repository: https://github.com/geraldinepascal/FROGS
+
+Please cite the FROGS Publication: *Frederic Escudie, Lucas Auer, Maria Bernard, Mahendra Mariadassou, Laurent Cauquil, Katia Vidal, Sarah Maman, Guillermina Hernandez-Raquet, Sylvie Combes, Geraldine Pascal; FROGS: Find, Rapidly, OTUs with Galaxy Solution, Bioinformatics, , btx791,* https://doi.org/10.1093/bioinformatics/btx791
+
+Depending on the help provided you can cite us in acknowledgements, references or both.
+	</help> 
+	<citations>
+		<citation type="doi">10.1093/bioinformatics/btx791</citation>
+		<citation type="doi">10.1128/AEM.01043-13</citation>
+		<citation type="doi">10.14806/ej.17.1.200</citation>
+		<citation type="doi">10.1093/bioinformatics/btr507</citation>
+	</citations>
+</tool>
author	frogs
date	Thu, 25 Oct 2018 05:01:13 -0400
parents
children