<?xml version="1.0" encoding="UTF-8"?>

<tool id="recentrifuge" name="Recentrifuge" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>
        Robust comparative analysis and contamination removal for metagenomics
    </description>
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro='edam'/>
    <expand macro='xrefs'/>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command detect_errors="aggressive"><![CDATA[
      #import re
      #*======================================
            Make a dir with all input files
      ======================================*#
        mkdir input_dir &&
        #for $input in $input_file
            #set input_identifier = re.sub('[^\s\w\-]', '_', str($input.element_identifier))
            #if $file_type.filetype == "centrifuge"
                ln -s '$input' 'input_dir/${input_identifier}.out' &&
            #else if $file_type.filetype == "clark"
                ln -s '$input' 'input_dir/${input_identifier}.csv' &&
            #else if $file_type.filetype == "kraken"
                ln -s '$input' 'input_dir/${input_identifier}.krk' &&
            #else
                ln -s '$input' 'input_dir/$input_identifier' &&
            #end if
        #end for
        #*======================================
                  database input
        ======================================*#
        rcf
        -n '$database.database_name.fields.path'
        #*======================================
                  Recentrifuge input file
        ======================================*#
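        #* Each input type below also sets the default scoring scheme,
           used later when no scoring is selected in the advanced options *#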

        #if $file_type.filetype == "centrifuge"
            -f input_dir
            #set $default_scoring = 'SHEL'
        #else if $file_type.filetype == "kraken"
            -k input_dir
            #set $default_scoring = 'KRAKEN'
        #else if $file_type.filetype == "clark"
            -r input_dir
            #set $default_scoring = 'SHEL'
        #else if $file_type.filetype == "lmat"
            -l input_dir
            #set $default_scoring = 'LMAT'
        #else if $file_type.filetype == "generic"
            -g input_dir
            --format '$file_type.format'
            #set $default_scoring = 'GENERIC'
        #end if

        #*======================================
                  Output options
        ======================================*#
        -e $output_option.extra
        -o output
        $output_option.nohtml
        #*======================================
                  Advanced options
        ======================================*#
        #if $advanced_option.controls
            --controls '$advanced_option.controls'
        #end if
        #if $advanced_option.scoring
            --scoring '$advanced_option.scoring'
        #else
            --scoring '$default_scoring'
        #end if
        #if $advanced_option.minscore_value
            --minscore '$advanced_option.minscore_value'
        #end if
        #if $advanced_option.mintaxa
            --mintaxa '$advanced_option.mintaxa'
        #end if
        #if $advanced_option.exclude_taxa_name
            --exclude '$advanced_option.exclude_taxa_name'
        #end if
        #if $advanced_option.include_taxa_name
            --include '$advanced_option.include_taxa_name'
        #end if
        $advanced_option.avoidcross
        #*======================================
                  More advanced options
        ======================================*#
        #if $more_advanced_option.ctrlminscore
            --ctrlminscore '$more_advanced_option.ctrlminscore'
        #end if
        #if $more_advanced_option.ctrlmintaxa
            --ctrlmintaxa '$more_advanced_option.ctrlmintaxa'
        #end if
        --summary $more_advanced_option.summary
        $more_advanced_option.takeoutroot
        $more_advanced_option.nokollapse
        $more_advanced_option.strain
        $more_advanced_option.sequential
        #*======================================
                  Log file output
        ======================================*#
        | tee "$logfile"
        ]]>
    </command>
    <inputs>
        <!-- INPUT FILES -->
        <param name="input_file" type="data" format="tabular" multiple="true" optional="false" label="Select taxonomic classification files (tabular format)"/>
        <conditional name="file_type">
            <param name="filetype" type="select" display="radio" label="Type of input file (Centrifuge, CLARK, Generic, Kraken, LMAT)" help="(-f, -r, -g, -k, -l)">
                <option value="centrifuge">Centrifuge</option>
                <option value="clark">CLARK</option>
                <option value="generic">Generic</option>
                <option value="lmat" >LMAT</option>
                <option value="kraken" >Kraken</option>
            </param>
            <when value="centrifuge"/>
            <when value="lmat"/>
            <when value="clark"/>
            <when value="kraken"/>
            <when value="generic">
                <param argument="--format" type="text" label="Format of the output files from a generic classifier"
                    help="String like 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0',
                    where the valid file TYPes are csv/tsv/ssv, and the remaining fields give the column number (starting at 1)
                    used for the assigned TaxID, the LENgth of the read, and the SCOre given to the assignment">
                </param>
            </when>
        </conditional>
    <!-- taxa databases -->
        <section name="database" title="Database type" expanded="true">
            <param name="database_name" type="select" label="Cached database with taxa IDs">
                <options from_data_table="ncbi_taxonomy">
                    <validator message="No NCBI database is available" type="no_options"/>
                </options>
            </param>
        </section>
    <!-- output name -->
        <section name="output_option" title="Output options">
            <param argument="--extra" type="select" label="Type of extra output to be generated" help="Default: CSV">
                <option value="CSV" selected="true" >CSV</option>
                <option value="DYNOMICS">DYNOMICS</option>
                <option value="FULL">FULL</option>
                <option value="TSV" >TSV</option>
            </param>
            <param argument="--nohtml" type="boolean" truevalue="--nohtml" falsevalue="" label="Suppress saving the HTML output file" help="Do not generate the HTML report"/>
            <param name="no_logfile" type="boolean" truevalue="True" falsevalue="False" label="Do not keep the log file"/>
        </section>
    <!-- ADVANCED OPTIONS -->
        <section name="advanced_option" title="Coarse tuning of algorithm parameters">
            <param name="controls" type="integer" optional="true" value="0" label="Number of initial samples to be treated as negative controls" help="Add control samples (default is 0)"/>
            <param name="scoring" type="select" optional="true" label="Type of scoring to be applied" help="Scoring algorithm; the valid choices depend on the input file type, see the help section of the tool">
                <option value="SHEL">SHEL</option>
                <option value="LENGTH">LENGTH</option>
                <option value="LOGLENGTH">LOGLENGTH</option>
                <option value="NORMA">NORMA</option>
                <option value="LMAT">LMAT (only for LMAT input)</option>
                <option value="CLARK_C">CLARK_C (only for CLARK input)</option>
                <option value="CLARK_G">CLARK_G (only for CLARK input)</option>
                <option value="KRAKEN">KRAKEN (only for Kraken input)</option>
                <option value="GENERIC">GENERIC (only for generic input)</option>
            </param>
            <param name="minscore_value" type="integer" optional="true" min="0" value="0" label="Minimum score/confidence of the classification of a read to pass the quality filter" help="All pass by default (--minscore)"/>
            <param name="mintaxa" type="integer" optional="true" min="0" value="0" label="Minimum taxa to avoid collapsing one level into the parent" help="If not specified a value will be automatically assigned (--mintaxa)"/>
            <param name="exclude_taxa_name" argument="--exclude" type="text" optional="true" label="NCBI taxid code to exclude a taxon and all underneath" help="Default: no exclude">
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="," />
                    </valid>
                </sanitizer>
                <validator type="regex">[A-Za-z0-9,]+</validator>
            </param>
            <param name="include_taxa_name" argument="--include" type="text" optional="true"  label="NCBI taxid code to include a taxon and all underneath" help="Default: all included">
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="," />
                    </valid>
                </sanitizer>
                <validator type="regex">[A-Za-z0-9,]+</validator>
            </param>
            <param argument="--avoidcross" type="boolean" truevalue="--avoidcross" falsevalue="" label="Avoid cross analysis" help="Default: no"/>
        </section>
    <!-- Detailed more fine parameters -->
        <section name="more_advanced_option" title="Fine tuning of algorithm parameters">
            <param argument="--ctrlminscore" type="integer" optional="true" value="0" label="Minimum score in control samples to pass the filter" help="Minimum score/confidence of the classification of a read in control samples to pass the quality filter; it defaults to minscore"/>
            <param argument="--ctrlmintaxa"  type="integer" optional="true" value="0" label="Minimum taxa to avoid collapsing one level into the parent" help="If not specified a value will be automatically assigned"/>
            <param argument="--summary" type="select" label="Add a summary of the analysis (default is Add)" help="Select 'Add' to add summary samples to the other samples, 'Only' to show only the summary samples, or 'Avoid' to produce no summaries at all">
                <option value="ADD" selected="true">Add</option>
                <option value="ONLY">Only</option>
                <option value="AVOID">Avoid</option>
            </param>
            <param argument="--takeoutroot" type="boolean" truevalue="--takeoutroot" falsevalue="" label="Remove counts directly assigned to the root level"/>
            <param argument="--nokollapse"  type="boolean" truevalue="--nokollapse" falsevalue=""  label="Show the cellular organisms taxon" />
            <param argument="--strain"      type="boolean" truevalue="--strain" falsevalue=""      label="Strain level instead of species as the resolution limit for the robust contamination removal algorithm; use with caution, this is an experimental feature"/>
            <param argument="--sequential"  type="boolean" truevalue="--sequential" falsevalue=""  label="Deactivate parallel processing"/>
        </section>
    </inputs>
  <!-- OUTPUT FILE, TYPE DEPENDING ON extra PARAMETER -->
    <outputs>
        <data name="html_report" format="html" from_work_dir="output.rcf.html" label="${tool.name} on ${on_string}: html report">
            <filter> output_option['nohtml'] == False</filter>
        </data>
        <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file">
            <filter> output_option['no_logfile'] == False </filter>
        </data>
        <data name="data_csv" format="tabular" from_work_dir="output.rcf.data.csv" label="${tool.name} on ${on_string}: data.csv">
            <filter> output_option['extra']  == 'CSV' </filter>
        </data>
        <data name="stat_csv" format="tabular" from_work_dir="output.rcf.stat.csv" label="${tool.name} on ${on_string}: stat.csv">
            <filter> output_option['extra']  == 'CSV' </filter>
        </data>
        <data name="data_tsv" format="tabular" from_work_dir="output.rcf.data.tsv" label="${tool.name} on ${on_string}: data tsv">
            <filter> output_option['extra'] == 'TSV' </filter>
        </data>
        <data name="stat_tsv" format="tabular" from_work_dir="output.rcf.stat.tsv" label="${tool.name} on ${on_string}: stat tsv">
            <filter> output_option['extra'] == 'TSV' </filter>
        </data>
        <data name="xls_report" format="xlsx" from_work_dir="output.rcf.xlsx" label="${tool.name} on ${on_string}: xlsx report">
            <filter> output_option['extra'] == 'FULL' or output_option['extra'] == 'DYNOMICS'</filter>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="4"> <!-- kraken input and CSV output TEST_1-->
            <section name="database">
                    <param name="database_name" value="test-db-2022"/>
            </section>
            <param name="input_file" value="kraken_test/kraken.out"/>
            <conditional name="file_type">
                <param name="filetype" value="kraken"/>
            </conditional>
            <section name="output_option">
                <param name="extra" value="CSV"/>
            </section>
            <section name="more_advanced_option">
                <param name="summary" value="AVOID"/>
            </section>
            <output name="data_csv" file="kraken_test/test1_csv.rcf.data.csv" lines_diff="2"/>
            <output name="stat_csv" file="kraken_test/test1_csv.rcf.stat.csv" lines_diff="2"/>
            <output name="html_report" file="kraken_test/test1_csv.rcf.html" lines_diff="2"/>
            <output name="logfile" file="kraken_test/test1_csv.log" lines_diff="12"/>
        </test>
        <test expect_num_outputs="3"> <!-- centrifuge input and full options with cached database TEST_2 -->
            <section name="database">
                <param name="database_name" value="test-db-2022"/>
            </section>
            <param name="input_file" value="centrifuge_test/centrifuge.out"/>
            <conditional name="file_type">
                <param name="filetype" value="centrifuge"/>
            </conditional>
            <section name="output_option">
                <param name="extra" value="CSV"/>
                <param name="nohtml" value="true"/>
            </section>
            <section name="advanced_option">
                <param name="controls" value="0"/>
                <param name="scoring" value="NORMA"/>
                <param name="minscore_value" value="1"/>
                <param name="avoidcross" value="true"/>
            </section>
            <section name="more_advanced_option">
                <param name="ctrlminscore" value="1"/>
                <param name="summary" value="AVOID"/>
            </section>
            <output name="stat_csv" file="centrifuge_test/test2_csv.rcf.stat.csv" lines_diff="2"/>
            <output name="data_csv" file="centrifuge_test/test2_csv.rcf.data.csv" lines_diff="2"/>
            <output name="logfile" file="centrifuge_test/test2_csv.log" lines_diff="12"/>
        </test>
        <test expect_num_outputs="3"> <!-- kraken input, cached DB, several options added TEST_3 -->
            <section name="database">
                <param name="database_name" value="test-db-2022"/>
            </section>
            <param name="input_file" value="kraken_test/kraken.out"/>
            <conditional name="file_type">
                <param name="filetype" value="kraken"/>
            </conditional>
            <section name="output_option" >
                <param name="extra" value="TSV"/>
                <param name="nohtml" value="true"/>
            </section>
            <section name="advanced_option">
                <param name="scoring" value="LOGLENGTH"/>
            </section>
            <section name="more_advanced_option">
                <param name="summary" value="ONLY"/>
                <param name="strain" value="true"/>
            </section>
            <output name="data_tsv" file="kraken_test/test3_rcf.data.tsv" lines_diff="2"/>
            <output name="stat_tsv" file="kraken_test/test3_rcf.stat.tsv" lines_diff="2"/>
            <output name="logfile" file="kraken_test/test3_tsv.log" lines_diff="12"/>
        </test>
        <test expect_num_outputs="3"> <!-- Test summarized output TEST_4 -->
            <section name="database">
                <param name="database_name" value="test-db-2022"/>
            </section>
            <param name="input_file" value="centrifuge_test/centrifuge.out,centrifuge_test/centrifuge_1.out,centrifuge_test/centrifuge_2.out"/>
            <conditional name="file_type">
                <param name="filetype" value="centrifuge"/>
            </conditional>
            <section name="output_option" >
                <param name="extra" value="TSV"/>
                <param name="no_logfile" value="True"/>
            </section>
            <section name="advanced_option">
                <param name="scoring" value="LENGTH"/>
            </section>
            <section name="more_advanced_option">
                <param name="summary" value="ADD"/>
                <param name="strain" value="true"/>
            </section>
            <output name="data_tsv" file="centrifuge_test/test4_rcf.data.tsv" lines_diff="2"/>
            <output name="stat_tsv" file="centrifuge_test/test4_rcf.stat.tsv" lines_diff="2"/>
            <output name="html_report" file="centrifuge_test/test4_tsv.html" lines_diff="12"/>
        </test>
    </tests>
  <help><![CDATA[
**What it does**

With Recentrifuge, researchers can interactively explore what organisms are in their samples and at which level of confidence, enabling robust comparative analysis of multiple samples in any metagenomic study.

  * Removes diverse contaminants, including crossovers, using a novel robust contamination removal algorithm.
  * Provides a confidence level for every result, since the calculated score propagates to all the downstream analysis and comparisons.
  * Unveils the generalities and specificities in the metagenomic samples, thanks to a new comparative analysis engine.


  Recentrifuge is especially useful when a more reliable detection of minority organisms is needed (e.g. in the case of low microbial biomass metagenomic studies)
  in clinical, environmental, or forensic analysis. Beyond the standard confidence levels, Recentrifuge implements others devoted to variable length reads,
  very convenient for complex datasets generated by nanopore sequencers.

**Input options**

Recentrifuge can process the classification output of several different taxonomic classifiers.
Input files can come from Centrifuge, Kraken, CLARK, or LMAT.
A generic mode is also available to accept files from other classifiers, but it requires a description of the file content.
If Generic is chosen, the format option needs a string like 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0',
where the valid file TYPes are csv/tsv/ssv, and the remaining fields give the column number (starting at 1)
used for the assigned TaxID, the LENgth of the read, and the SCOre given to the assignment.
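
For illustration only, a single line of such a generic csv file (made-up values matching 'TYP:csv,TID:1,LEN:3,SCO:6,UNC:0') could look like::

    562,read_0001,150,unused,unused,87.5

Here the assigned TaxID (562) is read from column 1, the read LENgth (150) from column 3, and the SCOre (87.5) from column 6; the other columns are ignored.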


**Database for Recentrifuge**

  Recentrifuge first needs the taxonomy database from NCBI (nodes.dmp and names.dmp).
  The database is selected among the NCBI taxonomy databases already installed (cached) on the Galaxy instance.


**Output options**

  1. Depending on the extra option selected, the extra output files can be in CSV, TSV, or XLSX format, combined in one or more files.
  2. By default an HTML report is generated to visualize the data; it can be removed using the nohtml option.


**Advanced options**

  1. Recentrifuge can treat the first samples in the data as negative controls, which are used by the robust contamination removal algorithm.
  2. Scoring selects the scoring scheme applied to the reads classified by the taxonomic tool (see the worked example after this list):
     SHEL (Single Hit Equivalent Length): a score value in base pairs roughly equivalent to a single hit to the database.
     KRAKEN: This scoring scheme is only available for this classifier. It divides the k-mer hit count of the top assignment by the total k-mers in the read and multiplies the result by 100 to give a percentage of coverage (the fraction of the read k-mers covered by k-mers belonging to the read's final assignment). This is the default scoring scheme for Kraken samples, and it supports mixing samples with different read lengths.
     LENGTH: The score of a read is its length (or the combined length of mate pairs).
     LOGLENGTH: Logarithm (base 10) of the length score.
     NORMA: This score is the normalized score SHEL / LENGTH in percentage, so it takes into account both the assignment quality and the length of the read. Very useful when both the score assignments and the lengths are variable among the reads.
     LMAT: This scoring scheme is only available for this classifier.
     CLARK_C: This scoring scheme is not available for other classifiers. It takes the confidence score as the score for a read, conf=h1/(h1+h2), or 1-conf=h2/(h1+h2) in case the majority of a read is not classified (1st assignment unclassified). See CLARK's README file for details on how h1 and h2 are calculated. If you use this scoring, you will probably want to filter to a minimum of 0.5 (-y 0.5) or beyond, as under 0.5 the assignments have very low confidence.
     CLARK_G: This scheme scores every read with its CLARK gamma score, so it is only available for this classifier.
  3. You can filter reads by quality using the minscore option (--minscore).
  4. You can include or exclude specific taxa using their NCBI taxid codes.
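
  A minimal worked example of the NORMA score (the numbers are made up for illustration)::

      SHEL   = 120   (single hit equivalent length, in base pairs)
      LENGTH = 150   (read length, in base pairs)
      NORMA  = SHEL / LENGTH * 100 = 120 / 150 * 100 = 80 (percent)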


**More advanced options**

  1. You can set a quality filter that applies specifically to the control samples (ctrlminscore).
  2. You can specify the minimum number of taxa to avoid collapsing one level into the parent (ctrlmintaxa).
  3. A summary option is available to add summary samples, show only the summary samples, or avoid summaries altogether.

  Other options are available and explained in the fine tuning panel of the tool.


rcf - Release 1.8.1 - Mar 2022

  Copyright (C) 2017–2022, Jose Manuel Martí Martínez

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU Affero General Public License as
  published by the Free Software Foundation, either version 3 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU Affero General Public License for more details.

  You should have received a copy of the GNU Affero General Public License
  along with this program.  If not, see <https://www.gnu.org/licenses/>.
  ]]></help>
    <expand macro="citations"/>
</tool>