Mercurial > repos > iuc > nonpareil

<tool id="nonpareil" name="Nonpareil" version="@WRAPPER_VERSION@.0">
    <description>to estimate average coverage and generate Nonpareil curves</description>
    <!-- @WRAPPER_VERSION@ is the single source for the version string: it prefixes the tool
         version declared on the tool element and pins the version of the nonpareil package. -->
    <macros>
        <token name="@WRAPPER_VERSION@">3.1.1</token>
    </macros>
    <requirements>
        <requirement type="package" version="@WRAPPER_VERSION@">nonpareil</requirement>
    </requirements>
    <!-- Command used to report the version of the nonpareil executable. -->
    <version_command>nonpareil -V</version_command>
    <!--
        Cheetah template that renders the nonpareil command line.

        * The -f flag is emitted only when the input dataset is of type fasta or fastq;
          the inputs section restricts the dataset to those two formats.
        * The -R and -t values are shell expansions of NONPAREIL_MAX_MEMORY and GALAXY_SLOTS,
          falling back to 1024 and 2 when those environment variables are unset. The
          backslash keeps the dollar sign from being interpreted by Cheetah.
        * The -l flag (log file) is emitted only when the log_test checkbox is set, matching
          the filter on the "log" output.
        * The boolean parameters (use_portion_in_output, use_rev_comp, n_as_mismatches)
          expand to their flag when checked and to an empty string otherwise.
        * The optional -S and -x values are emitted only when the user supplied a value.
    -->
    <command detect_errors="exit_code">
<![CDATA[
nonpareil
    -s '$input'
    -T '$algo'
    #if $input.is_of_type("fasta")
        -f 'fasta'
    #else if $input.is_of_type("fastq")
        -f 'fastq'
    #end if
    -d '$subsampling'
    -n '$subsample_per_point'
    -L '$min_overlapping'
    -X '$max_query_reads'
    -R \${NONPAREIL_MAX_MEMORY:-1024}
    -t \${GALAXY_SLOTS:-2}
    -b output
    -a '$all_data_output'
    -C '$mating_vector_output'
    #if $log_test
        -l '$log'
    #end if
    -o '$summary'
    $use_portion_in_output
    -m '$sampling.min_sampling_portion'
    -M '$sampling.max_sampling_portion'
    -i '$sampling.sampling_portion_interval'
    $mating.use_rev_comp
    $mating.n_as_mismatches
    #if str($mating.sim_thres) != ''
        -S '$mating.sim_thres'
    #end if
    -k '$mating.kmer_size'
    #if str($mating.proba) != ''
        -x '$mating.proba'
    #end if
    -r '$seed'
    ]]></command>
    <!-- User-facing parameters. Each "argument" attribute names the nonpareil flag the
         parameter is passed to in the command template. -->
    <inputs>
        <param name="input" type="data" format="fastq,fasta" label="Input sequences" help="FastQ format is recommended for kmer algorithm and Fasta for the alignment" argument="-s"/>
        <param name="algo" type="select" label="Nonpareil algorithm?" argument="-T">
            <option value="kmer">Kmer</option>
            <option value="alignment">Alignment</option>
        </param>
        <!-- The help sentence used to stop mid-word ("..., control"); it now names the
             parameters of the Sampling section that drive linear sampling. -->
        <param name="subsampling" type="float" value="0.7" min="0" label="Factor for subsampling" help="Subsample iteratively applying this factor to the number of reads, resulting in logarithmic subsampling. Use 0 to fall back to linear sampling, controlled by the minimum, maximum and interval of the sampling portion (-m, -M and -i)" argument="-d"/>
        <param name="subsample_per_point" type="integer" value="1024" min="0" label="Number of sub-samples to generate per point" help="If it is not a multiple of the number of threads, it is rounded to the next (upper) multiple" argument="-n"/>
        <param name="min_overlapping" type="integer" value="50" min="0" max="100" label="Minimum overlapping percentage of the aligned region on the largest sequence" help="The similarity is evaluated for the aligned region only" argument="-L"/>
        <param name="max_query_reads" type="integer" value="1000" min="0" label="Maximum number of reads to use as query" argument="-X"/>
        <param name="use_portion_in_output" type="boolean" truevalue="-F" falsevalue="" checked="false" label="Report the sampled portions as a fraction of the library instead of the number of reads?" argument="-F"/>
        <section name="sampling" title="Sampling" expanded="False">
            <param name="min_sampling_portion" type="float" value="0" min="0" label="Minimum value of sampling portion" argument="-m"/>
            <param name="max_sampling_portion" type="float" value="1" min="0" label="Maximum value of sampling portion" argument="-M"/>
            <param name="sampling_portion_interval" type="float" value="0.01" min="0" label="Interval between sampling portions" argument="-i"/>
        </section>
        <section name="mating" title="Mating" expanded="False">
            <!-- Note the inverted sense: checking this box passes -c, which disables the
                 reverse-complement comparison. -->
            <param name="use_rev_comp" type="boolean" truevalue="-c" falsevalue="" checked="false" label="Do not use reverse-complement?" help="This is useful for single stranded sequences data" argument="-c"/>
            <param name="n_as_mismatches" type="boolean" truevalue="-N" falsevalue="" checked="false" label="Treat Ns as mismatches?" help="By default, Ns (unknown nucleotides) match any nucleotide" argument="-N"/>
            <param name="sim_thres" type="float" optional="true" label="Similarity threshold" help="Reducing this option will increase sensitivity while increasing running time" argument="-S"/>
            <param name="kmer_size" type="integer" value="24" min="0" label="Kmer size" help="You can increase kmer size to increase sensitivity" argument="-k"/>
            <!-- A probability, so the form rejects values outside [0, 1] before the job runs. -->
            <param name="proba" type="float" optional="true" min="0" max="1" label="Probability of taking a sequence into account as query for the construction of the curve" help="Higher values reduce accuracy but increase speed. If set, it overrides the maximum number of reads to use as query" argument="-x"/>
        </section>
        <param name="seed" type="integer" value="1000" min="0" label="Random generator seed?" argument="-r"/>
        <!-- Controls both the -l flag in the command and the filter on the "log" output. -->
        <param name="log_test" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output log file?"/>
    </inputs>
    <!-- Output datasets. The column_names actions attach header metadata to the tabular
         outputs. -->
    <outputs>
        <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Redundancy summary">
            <actions>
                <action name="column_names" type="metadata" default="sequencing_effort,average_redundancy,std,quartile_1,median,quartile_3" />
            </actions>
        </data>
        <data format="tabular" name="all_data_output" label="${tool.name} on ${on_string}: Redundancy values for all the results">
            <actions>
                <!-- Three comma-separated names, one per column of the file; the comma
                     between replicate_id and estimated_redundancy was previously missing. -->
                <action name="column_names" type="metadata" default="sequencing_effort,replicate_id,estimated_redundancy" />
            </actions>
        </data>
        <!-- Only created when the "Output log file?" checkbox is set. -->
        <data format="txt" name="log" label="${tool.name} on ${on_string}: Log">
            <filter>log_test</filter>
        </data>
        <data format="tabular" name="mating_vector_output" label="${tool.name} on ${on_string}: Mates distribution"/>
    </outputs>
    <!-- Functional tests. Boolean parameters are set with the explicit value "false"
         rather than an empty string that merely happened to equal their falsevalue. -->
    <tests>
        <!-- Test 1: alignment algorithm on a FASTA input, with the log output enabled. -->
        <test>
            <param name="input" value="test.fasta"/>
            <param name="algo" value="alignment"/>
            <param name="subsampling" value="0.7"/>
            <param name="subsample_per_point" value="1024"/>
            <param name="min_overlapping" value="50"/>
            <param name="max_query_reads" value="1000"/>
            <param name="use_portion_in_output" value="false" />
            <section name="sampling">
                <param name="min_sampling_portion" value="0"/>
                <param name="max_sampling_portion" value="1"/>
                <param name="sampling_portion_interval" value="0.01"/>
            </section>
            <section name="mating">
                <param name="use_rev_comp" value="false"/>
                <param name="n_as_mismatches" value="false"/>
                <param name="kmer_size" value="24"/>
            </section>
            <param name="seed" value="1000"/>
            <param name="log_test" value="true"/>
            <!-- Outputs are compared by file size only (sim_size). -->
            <output name="all_data_output" value="test1-all_data_output" compare="sim_size" delta="0"/>
            <output name="log">
                <assert_contents>
                    <has_text text="Counting sequences" />
                    <has_text text="Evaluating consistency" />
                    <has_text text="Everything seems correct" />
                </assert_contents>
            </output>
            <output name="summary" value="test1-summary" compare="sim_size" delta="0"/>
            <output name="mating_vector_output" value="test1-mating_vector_output" compare="sim_size" delta="0"/>
        </test>
        <!-- Test 2: kmer algorithm on a FASTQ input, limited to 10 query reads, no log. -->
        <test>
            <param name="input" value="test.fastq"/>
            <param name="algo" value="kmer"/>
            <param name="subsampling" value="0.7"/>
            <param name="subsample_per_point" value="1024"/>
            <param name="min_overlapping" value="50"/>
            <param name="max_query_reads" value="10"/>
            <param name="use_portion_in_output" value="false" />
            <section name="sampling">
                <param name="min_sampling_portion" value="0"/>
                <param name="max_sampling_portion" value="1"/>
                <param name="sampling_portion_interval" value="0.01"/>
            </section>
            <section name="mating">
                <param name="use_rev_comp" value="false"/>
                <param name="n_as_mismatches" value="false"/>
                <param name="kmer_size" value="24"/>
            </section>
            <param name="seed" value="1000"/>
            <param name="log_test" value="false"/>
            <output name="all_data_output" value="test2-all_data_output" compare="sim_size" delta="0"/>
            <output name="summary" value="test2-summary" compare="sim_size" delta="0"/>
            <!-- NOTE(review): this is the only output given a non-zero size tolerance;
                 the reason is not recorded here, confirm before tightening it. -->
            <output name="mating_vector_output" value="test2-mating_vector_output" compare="sim_size" delta="5"/>
        </test>
    </tests>
    <help><![CDATA[
Nonpareil uses the redundancy of the reads in metagenomic datasets to estimate the average coverage and predict the amount of sequences that will be required to achieve "nearly complete coverage".

Nonpareil outputs three files and one optional log file:

- Redundancy summary is a tab-delimited file with six columns. The first column indicates the sequencing effort (in number of reads), and the remaining columns indicate the summary of the distribution of redundancy (from the replicates, 1,024 by default) at the given sequencing effort. These five columns are: average redundancy, standard deviation, quartile 1, median (quartile 2), and quartile 3.
- Redundancy values is a tab-delimited file with three columns. Similar to the redundancy summary file, it contains information about the redundancy at each sequencing effort, but it provides ALL the results from the replicates, not only the summary at each point. The first column indicates the sequencing effort (as a fraction of the dataset), the second column indicates the ID of the replicate (a number used only to introduce some controlled noise in plots), and the third column indicates the estimated redundancy value.
- Mates distribution is a raw list with the number of reads in the dataset matching a query read. A set of query reads is randomly drawn by Nonpareil (1,000 by default), and compared against all reads in the dataset. Each line on this file corresponds to a query read (the order is not important). We have seen certain correspondence between these numbers and the distribution of abundances in the community (compared, for example, as rank-abundance plots), but this file is provided only for quality-control purposes and comparisons with other tools.
- Log is a verbose log of internal Nonpareil processing. The number to the left (inside square brackets) indicates the CPU time (in minutes). This file also provides quality assessment of the Nonpareil run (automated consistency evaluation). Ideally, the last line should read "Everything seems correct". Otherwise, it suggests alternative parameters that may improve the estimation.

For more details about the tool, please check: http://nonpareil.readthedocs.io/en/latest/index.html
    ]]></help>
    <!-- DOI of the publication to cite when results from this tool are used. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btt584</citation>
    </citations>
</tool>
author	iuc
date	Sun, 05 Nov 2017 11:40:13 -0500
parents
children	45210df786b9