Mercurial > repos > iuc > khmer_normalize_by_median

<tool id="khmer_normalize_by_median" name="khmer: Normalize By Median" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Filter reads using digital normalization via k-mer abundances</description>
    <macros>
        <token name="@BINARY@">normalize-by-median.py</token>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version" />
    <command><![CDATA[
#import re
set -u &&
mkdir output &&

@LINK_SEQUENCES@
cd output/ &&
normalize-by-median.py
${paired_switch}
${force_single_switch}
@TABLEPARAMS@
--cutoff=${cutoff}
#if $unpaired_reads_filename
    --unpaired-reads=${unpaired_reads_filename}
#end if
#if $save_countgraph
    --savegraph=${countgraph}
#end if
#if $countgraph_to_load
    --loadgraph=${countgraph_to_load}
#end if
--report=${report}
$gzip
@USE_SEQUENCES@
]]>
    </command>
    <inputs>
        <expand macro="input_sequences_filenames" />
        <param argument="--paired" name="paired_switch" type="boolean" checked="false" truevalue="--paired" falsevalue=""
            label="Require all sequences be properly paired?"
            help="The tool will fail if given improperly paired reads and this option is selected." />
        <param argument="--force_single" name="force_single_switch" type="boolean" checked="false" truevalue="--force_single" falsevalue=""
            label="Ignore all pairing information?"
            help="By default this tool process reads in a pair-aware manner. This option disables that behavior." />
        <param argument="--unpaired-reads" name="unpaired_reads_filename" type="data" format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina" optional="true"
            label="Extra unpaired reads"
            help="If all but one of your sequence files are interleaved paired end reads you can include one unpaired file to be processed last without regard to pairing." />
        <param argument="--loadgraph" name="countgraph_to_load" type="data" format="oxlicg" optional="true"
            label="Optional k-mer countgraph"
            help="The inputs file(s) will be processed using the kmer counts in the specified k-mer countgraph file as a starting point." />
        <param argument="--savegraph" name="save_countgraph" type="boolean" label="Save the k-mer countgraph(s) in a file" help="" />
        <param argument="--cutoff" type="integer" min="1" value="20" label="Cutoff" help="" />
        <expand macro="tableinputs" />
    </inputs>
    <outputs>
        <data name="countgraph" format="oxlicg" label="${tool.name} on ${on_string}: k-mer countgraph">
            <filter>save_countgraph == True</filter>
        </data>
        <data name="report" format="csv" label="${tool.name} on ${on_string}: report" />
        <expand macro="output_sequences" extension="keep"/>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <param name="inputs" value="test-abund-read-2.fa" ftype="fasta"/>
            <param name="type" value="specific" />
            <param name="cutoff" value="1" />
            <param name="ksize" value="17" />
            <output name="report" file="normalize-by-median.report.txt" />
            <output_collection name="sequences" type="list">
                <element name="test-abund-read-2.fa" ftype="fasta">
                    <assert_contents>
                        <has_text text="GGTTGACGGGGCTCAGGGGG" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="2">
            <param name="inputs" value="test-abund-read-2.fa.gz"  ftype="fasta.gz"/>
            <param name="type" value="specific" />
            <param name="cutoff" value="2" />
            <param name="ksize" value="17" />
            <output name="report" file="normalize-by-median.c2.report.txt" />
            <output_collection name="sequences" type="list">
                <element name="test-abund-read-2.fa.gz" ftype="fasta.gz">
                    <assert_contents>
                        <has_text text="GGTTGACGGGGCTCAGGGGG" />
                        <has_text text="GGTTGACGGGGCTCAGGG" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="3">
            <param name="inputs" value="test-abund-read-paired.fa" ftype="fasta"/>
            <param name="type" value="specific" />
            <param name="cutoff" value="1" />
            <param name="ksize" value="17" />
            <param name="paired" value="true" />
            <param name="save_countgraph" value="true"/>
            <output name="report" file="normalize-by-median.paired.report.txt" />
            <output_collection name="sequences" type="list">
                <element name="test-abund-read-paired.fa" ftype="fasta">
                    <assert_contents>
                        <has_text text="GGTTGACGGGGCTCAGGGGG" />
                        <has_text text="GGTTGACGGGGCTCAGGG" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="countgraph">
                <assert_contents>
                    <has_size size="1k"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Do digital normalization (remove mostly redundant sequences)

Discard sequences based on whether or not their median k-mer abundance lies
above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

By default, Paired end reads will be considered together; if either read will
be kept, then both will be kept. (This keeps both reads from a fragment, and
helps with retention of repeats.) Unpaired reads are treated individually.

If `--paired` is set then proper pairing is required and the tool will exit on
unpaired reads, although `--unpaired-reads` can be used to supply a file of
orphan reads to be read after the paired reads.

`--force_single` will ignore all pairing information and treat reads
individually.

With `-s`/`--savegraph`, the k-mer countgraph will be saved to the specified
file after all sequences have been processed. `--loadgraph` will load the
specified k-mer countgraph before processing the specified files.  Note
that the countgraph is in same format as those produced by
`load-into-counting.py` and consumed by `abundance-dist.py`.

@HELP_FOOTER@
]]>
    </help>
    <citations>
        <expand macro="software-citation" />
        <expand macro="diginorm-citation" />
    </citations>
</tool>
author	iuc
date	Thu, 03 Oct 2024 13:46:42 +0000
parents	b1fe2ef3d244
children