Mercurial > repos > frogs > frogs_2_0_0
diff preprocess.xml @ 0:76c750c5f0d1 draft default tip
planemo upload for repository https://github.com/oinizan/FROGS-wrappers commit 0b900a51e220ce6f17c1e76292c06a5f4d934055-dirty
author | frogs |
---|---|
date | Thu, 25 Oct 2018 05:01:13 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocess.xml Thu Oct 25 05:01:13 2018 -0400 @@ -0,0 +1,389 @@ +<?xml version="1.0"?> +<!-- +# Copyright (C) 2015 INRA +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +--> +<tool id="FROGS_preprocess" name="FROGS Pre-process" version="2.0.1"> + <description>Step 1 in metagenomics analysis: denoising and dereplication.</description> + <requirements> + <requirement type="package" version="2.0.1">frogs</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <command> + preprocess.py $sequencer_type.sequencer_selected + --output-dereplicated $dereplicated_file --output-count $count_file --summary $summary_file + --nb-cpus $nb_cpus + #if $sequencer_type.sequencer_selected == "illumina" + --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size + #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard" + --five-prim-primer $sequencer_type.sequencing_protocol.five_prim_primer --three-prim-primer $sequencer_type.sequencing_protocol.three_prim_primer + #else + --without-primers + #end if + #else + --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size + --five-prim-primer $sequencer_type.five_prim_primer --three-prim-primer $sequencer_type.three_prim_primer + #end if + + #if $sequencer_type.input_type.input_type_selected == "archive" + --input-archive $sequencer_type.input_type.archive_file + #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_contiged" + --already-contiged + #elif $sequencer_type.sequencer_selected == "illumina" + --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size + --expected-amplicon-size $sequencer_type.input_type.archive_type.expected_amplicon_size + --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate + #end if + #else + #set $sep = ' ' + #if $sequencer_type.sequencer_selected == "illumina" + --samples-names + #for $current in $sequencer_type.input_type.files_by_samples_type.samples + $sep"${current.name.strip()}" + #end for + --input-R1 + #for $current in $sequencer_type.input_type.files_by_samples_type.samples + $sep${current.R1_file} + #end for + #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_contiged" + --already-contiged + #else + --input-R2 + #for $current in $sequencer_type.input_type.files_by_samples_type.samples + $sep${current.R2_file} + #end for + --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size + --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size + --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate + #end if + #else + --input-R1 + #for $current in $sequencer_type.input_type.samples + $sep${current.R1_file} + #end for + --samples-names + #for $current in $sequencer_type.input_type.samples + $sep"${current.name.strip()}" + #end for + #end if + #end if + </command> + <inputs> + <param name="nb_cpus" type="hidden" label="CPU number" help="The maximum number of CPUs used." value="1" /> + <conditional name="sequencer_type"> + <param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences."> + <option value="illumina" selected="true">Illumina</option> + <option value="454">454</option> + </param> + <when value="illumina"> + <!-- Samples --> + <conditional name="input_type"> + <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample."> + <option value="files_by_samples" selected="true">Files by samples</option> + <option value="archive">Archive</option> + </param> + <when value="archive"> + <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" /> + <conditional name="archive_type"> + <param name="archive_type_selected" type="select" label="Reads already contiged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair."> + <option value="paired" selected="true">No</option> + <option value="already_contiged">Yes</option> + </param> + <when value="paired"> + <!-- Reads size --> + <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" /> + <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" /> + <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" /> + <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" /> + </when> + <when value="already_contiged"></when> + </conditional> + </when> + <when value="files_by_samples"> + <conditional name="files_by_samples_type"> + <param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair."> + <option value="paired" selected="true">No</option> + <option value="already_contiged">Yes</option> + </param> + <when value="paired"> + <!-- Samples --> + <repeat name="samples" title="Samples" min="1"> + <param name="name" type="text" label="Name" help="The sample name." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + <param format="fastq" name="R1_file" type="data" label="Reads 1" help="R1 FASTQ file of paired-end reads." /> + <param format="fastq" name="R2_file" type="data" label="reads 2" help="R2 FASTQ file of paired-end reads." /> + </repeat> + <!-- Reads size --> + <param name="R1_size" type="integer" label="Reads 1 size" help="The read1 size." value="" optional="false" /> + <param name="R2_size" type="integer" label="Reads 2 size" help="The read2 size." value="" optional="false" /> + <param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" /> + <param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" /> + </when> + <when value="already_contiged"> + <repeat name="samples" title="Samples" min="1"> + <param name="name" type="text" label="Name" help="The sample name." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + <param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of merged reads." /> + </repeat> + </when> + </conditional> + </when> + </conditional> + <!-- Amplicons --> + <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons." value="" optional="false" /> + <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons." value="" optional="false" /> + <!-- Primers --> + <conditional name="sequencing_protocol"> + <param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers."> + <option value="standard" selected="true">Illumina standard</option> + <option value="without_primers">Custom protocol (Kozich et al. 2013)</option> + </param> + <when value="standard"> + <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + </when> + <when value="without_primers"></when> + </conditional> + </when> + + <when value="454"> + <!-- Samples --> + <conditional name="input_type"> + <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample."> + <option value="files_by_samples" selected="true">One file by sample</option> + <option value="archive">Archive</option> + </param> + <when value="archive"> + <param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" /> + </when> + <when value="files_by_samples"> + <repeat name="samples" title="Samples" min="1"> + <param name="name" type="text" label="Name" help="The sample name." optional="false" /> + <param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." /> + </repeat> + </when> + </conditional> + <!-- Amplicons --> + <param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" /> + <param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" /> + <!-- Primers --> + <param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + <param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false"> + <validator type="empty_field" message="This parameter is required." /> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data format="fasta" name="dereplicated_file" label="${tool.name}: dereplicated.fasta" from_work_dir="dereplicated.fasta" /> + <data format="tabular" name="count_file" label="${tool.name}: count.tsv" from_work_dir="count.tsv" /> + <data format="html" name="summary_file" label="${tool.name}: report.html" from_work_dir="report.html" /> + </outputs> + <tests> + <test> + <conditional name="sequencer_type"> + <param name="sequencer_selected" value="illumina"/> + <conditional name="input_type"> + <param name="input_type_selected" value="archive"/> + <param name="archive_file" value="test_dataset.tar.gz"/> + <conditional name="archive_type"> + <param name="archive_type_selected" value="paired"/> + <param name="R1_size" value="250"/> + <param name="R2_size" value="250"/> + <param name="expected_amplicon_size" value="420"/> + <param name="mm_rate" value="0.15"/> + </conditional> + </conditional> + <param name="min_amplicon_size" value="380"/> + <param name="max_amplicon_size" value="460"/> + <conditional name="sequencing_protocol"> + <param name="sequencing_protocol_selected" value="standard"/> + <param name="five_prim_primer" value="GGCGVACGGGTGAGTAA"/> + <param name="three_prim_primer" value="GTGCCAGCNGCNGCGG"/> + </conditional> + </conditional> + <output name="dereplicated_file" file="references/01-prepro.fasta"/> + <output name="count_file" file="references/01-prepro.tsv"/> + <output name="summary_file" file="references/01-prepro.html" compare="sim_size" delta="0"/> + </test> + </tests> + <help> + +.. image:: static/images/FROGS_logo.png + :height: 144 + :width: 110 + + +.. class:: infomark page-header h2 + +What it does + +FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis. + + +.. class:: infomark page-header h2 + +Inputs/Outputs + +.. class:: h3 + +Inputs + +Sample files added one after another or provide in an archive file (tar.gz). + +.. container:: row + + .. container:: col-md-6 + + **Illumina inputs** + + :Usage: For samples sequenced in paired-end. The amplicon length must be inferior to the length of the R1 plus R2 length. R1 and R2 are merged by the common region. + :Files: One R1 and R2 by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_) + :Example: splA_R1.fastq.gz, splA_R2.fastq.gz, splB_R1.fastq.gz, splB_R2.fastq.gz + + OR + + :Usage: For samples sequenced in single-ends or when R1 and R2 reads are already merged. + :Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_). + :Example: splA.fastq.gz, splB.fastq.gz + + .. container:: col-md-6 + + **454 inputs** + + :Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_) + :Example: splA.fastq.gz, splB.fastq.gz + +Remark: In an archive if you use R1 and R2 files they names must end with *_R1* and *_R2*. To upload an archive, see the "Upload archive" tool or if possible create symbolic link on your Galaxy account. + +.. class:: h3 + +Outputs + +**Sequence file** (dereplicated.fasta): + + Only one file with all samples sequences (format `FASTA <https://en.wikipedia.org/wiki/FASTA_format>`_). These sequences are dereplicated: strictly identical sequence are represented only one and the initial count is kept in count file. + +**Count file** (count.tsv): + + This file contains the count of all unique sequences in each sample (format `TSV <https://en.wikipedia.org/wiki/Tab-separated_values>`_). + +**Summary file** (report.html): + + This file reports the number of remaining sequences after each filter (format `HTML <https://en.wikipedia.org/wiki/HTML>`_). + + .. image:: static/images/FROGS_preprocess_summary.png + :height: 355 + :width: 676 + + It also presents the length distribution of the remaining sequences. + + .. image:: static/images/FROGS_preprocess_lengthsSamples.png + :height: 350 + :width: 676 + +.. class:: infomark page-header h2 + +How it works + +.. csv-table:: + :header: "Steps", "Illumina", "454" + :widths: 5, 150, 150 + :class: table table-striped + + "1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlaped region (`FLASh <http://ccb.jhu.edu/software/FLASH/>`_). By default M is set to 10%", "/" + "2", "Filters merged sequences on their length which must be range between 'Minimum amplicon size' and 'Maximum amplicon size'", "/" + "3", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and then remove primers in the remaining sequence (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence and reverse complement the sequences on strand - (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences" + "4", "Filters sequences on their length and with ambiguous nucleotides", "the tool removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleo-tides between two poor quality positions, i.e. with a Phred quality score lesser than 10" + "5", "Dereplicates sequences", "Dereplicates sequences" + + +.. class:: infomark page-header h2 + +Advices/details on parameters + +.. class:: h3 + +Primers parameters + +The (`Kozich et al. 2013 <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/>`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers. + +In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation. + +.. role:: alert-info + +Example: + + 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3' + + Value for parameter 5' primer: ATGCC + + Value for parameter 3' primer: ATTTCAG + +.. class:: h3 + +Amplicons sizes parameters + + The two following images show two examples of perfect values fors sizes parameters. + + .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal.png + :height: 415 + :width: 676 + + .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal.png + :height: 415 + :width: 676 + + Don't worry the "Expected amplicon size" does not need to be very accurate. + +.. class:: h3 + +If the filter 'overlapped' reduce drasticaly the number of sequences: + + In un-merged Illumina data, the reduction of dataset by the overlapped filter is classicaly inferior than 20%. A loss of more than 20% in all samples can highlight a quality problem. + + If the overlap between R1 and R2 is superior to 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC <http://www.bioinformatics.babraham.ac.uk/projects/fastqc/>`_) you can try to cut the end of your sequences and relaunch the preprocess tool. + You can either raise the mismatch percent in the overlapped region, but not too much! + +---- + +**Contact** + +Contacts: frogs@inra.fr + +Repository: https://github.com/geraldinepascal/FROGS + +Please cite the FROGS Publication: *Frederic Escudie, Lucas Auer, Maria Bernard, Mahendra Mariadassou, Laurent Cauquil, Katia Vidal, Sarah Maman, Guillermina Hernandez-Raquet, Sylvie Combes, Geraldine Pascal; FROGS: Find, Rapidly, OTUs with Galaxy Solution, Bioinformatics, , btx791,* https://doi.org/10.1093/bioinformatics/btx791 + +Depending on the help provided you can cite us in acknowledgements, references or both. + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btx791</citation> + <citation type="doi">10.1128/AEM.01043-13</citation> + <citation type="doi">10.14806/ej.17.1.200</citation> + <citation type="doi">10.1093/bioinformatics/btr507</citation> + </citations> +</tool>