view dropletutils.xml @ 1:cfe1e6c28d95 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit 40d4a92c62d75fe931baba8657cde006a26d84cf"
author iuc
date Mon, 26 Aug 2019 05:06:39 -0400
parents 4cd9f0008d9c
children a8aa294401be
line wrap: on
line source

<?xml version="1.0" encoding="utf-8"?>
<tool id="dropletutils" name="DropletUtils" version="@PACKAGE_VERSION@+@GALAXY_VERSION@" >
    <description>Utilities for handling droplet-based single-cell RNA-seq data</description>
    <macros>
        <!-- Version tokens: the first pins the DropletUtils package requirement,
             the second is the suffix appended to the tool version. -->
        <token name="@PACKAGE_VERSION@">1.2.1</token>
        <token name="@GALAXY_VERSION@">2</token>
        <!-- Working-directory folder names used for bundled 10X input and output. -->
        <token name="@TXIN@">tenx.input</token>
        <token name="@TXOUT@">tenx.output</token>
        <!-- Test fixture: selects the bundled (matrix, genes, barcodes) input layout. -->
        <xml name="test_dirin">
            <conditional name="tenx_format">
                <param name="use" value="directory"/>
                <param name="input" value="in_matrix.mtx"/>
                <param name="input_genes" value="in_genes.tsv"/>
                <param name="input_barcodes" value="in_barcodes.tsv"/>
            </conditional>
        </xml>
    </macros>
    <!-- Pinned package dependencies; the DropletUtils pin follows @PACKAGE_VERSION@. -->
    <requirements>
        <requirement type="package" version="@PACKAGE_VERSION@">bioconductor-dropletutils</requirement>
        <requirement type="package" version="1.2_17">r-matrix</requirement>
        <requirement type="package" version="1.10.1">bioconductor-scater</requirement>
    </requirements>
    <!-- Prints the installed DropletUtils version. The pattern accepts multi-digit
         components (e.g. 1.10.0) and uses POSIX extended syntax rather than
         the GNU-only Perl mode. -->
    <version_command><![CDATA[
    Rscript -e 'packageVersion("DropletUtils")' | grep -oE '[0-9]+\.[0-9]+\.[0-9]+'
    ]]>
    </version_command>
    <command detect_errors="exit_code"><![CDATA[
## Bundled input: link the three datasets into one folder under the fixed
## file names matrix.mtx, genes.tsv and barcodes.tsv.
#if str($tenx_format.use) == 'directory':
    mkdir '@TXIN@' &&
    #for $src, $dest in [($tenx_format.input, 'matrix.mtx'), ($tenx_format.input_genes, 'genes.tsv'), ($tenx_format.input_barcodes, 'barcodes.tsv')]:
    ln -s '$src' '@TXIN@/$dest' &&
    #end for
#end if

## Bundled output: create the folder the outputs are collected from.
## The second test is only evaluated when the operation is 'filter'.
#if str($operation.use) == 'filter' and str($operation.outformat) == 'directory':
    mkdir '@TXOUT@' &&
#end if

Rscript '$__tool_directory__/scripts/dropletutils.Rscript' '$droplet_conf'
    ]]>
    </command>
    <configfiles>
        <!-- R settings file passed to scripts/dropletutils.Rscript as its only argument.
             A backslash-escaped dollar sign is a literal R list accessor; an unescaped
             one is a template substitution. -->
        <configfile name="droplet_conf">
## defaults
empty.fdr_threshold = 0.01
eparams=formals(emptyDrops)
dparams=formals(defaultDrops)
bparams=formals(barcodeRanks)

## File params
in.type='$tenx_format.use'
out.type=NULL

files=list()
files\$table='$table'
files\$plot='$plot'
files\$out=NULL

## Bundled input is read from the staged folder; single-file input (h5ad or
## tabular) is read from the dataset selected inside the tenx_format conditional.
#if str($tenx_format.use) == 'directory':
files\$infile='@TXIN@'
#else
files\$infile='$tenx_format.input'
#end if

## Operation-specific settings: override only the arguments exposed in the form.
#if str($operation.use) == 'filter':
    #if str($operation.method.use) == 'defaultdrops':
do.method="defaultDrops"
dparams\$expected=as.integer('$operation.method.expected')
dparams\$upper.quant=as.numeric('$operation.method.upperquant')
dparams\$lower.prop=as.numeric('$operation.method.lowerprop')
    #else if str($operation.method.use) == 'emptydrops':
do.method="emptyDrops"
eparams\$lower=as.integer('$operation.method.lower')
empty.fdr_threshold=as.numeric('$operation.method.fdr_thresh')
    #end if

## Output target: a folder for bundled output, otherwise the matching dataset.
out.type='$operation.outformat'
    #if str($operation.outformat) == 'directory':
files\$out='@TXOUT@'
    #else if str($operation.outformat) == 'h5ad':
files\$out='$fileout_h5ad'
    #else if str($operation.outformat) == 'tsv':
files\$out='$fileout_tsv'
    #end if

#else if str($operation.use) == 'barcode_rank':
do.method="barcodeRankings"
bparams\$lower=as.integer('$operation.lower')
#end if

seed.val=as.integer('$seed')

        </configfile>
    </configfiles>
    <inputs>
        <!-- Input matrix: a layout selector plus the dataset(s) that layout needs. -->
        <conditional name="tenx_format">
            <param name="use" type="select" label="Format for the input matrix">
                <option value="directory">Bundled (barcodes.tsv, genes.tsv, matrix.mtx)</option>
                <option value="h5ad" selected="true">AnnData/H5</option>
                <option value="tsv">Tabular</option>
            </param>
            <when value="directory">
                <param name="input" type="data" format="mtx,txt" label="Count Data"/>
                <param name="input_genes" type="data" format="tabular,tsv,txt" label="Genes List"/>
                <param name="input_barcodes" type="data" format="tabular,tsv,txt" label="Barcodes List"/>
            </when>
            <when value="h5ad">
                <param name="input" type="data" format="h5ad" label="Count Data"/>
            </when>
            <when value="tsv">
                <param name="input" type="data" format="tabular,tsv" label="Count Data"
                       help="The first column must contain gene names, and the first row must contain barcode identifiers."/>
            </when>
        </conditional>
        <!-- Operation: filter barcodes (two methods, three output layouts) or rank them. -->
        <conditional name="operation">
            <param name="use" type="select" label="Operation"
                   help="The filtering uses intelligent methods to generate output 10X matrices as would be otherwise generated by CellRanger. The ranking option serves to illustrate the number of detected barcodes in the sample.">
                <option value="filter" selected="true">Filter for Barcodes</option>
                <option value="barcode_rank">Rank Barcodes</option>
            </param>
            <when value="filter">
                <conditional name="method">
                    <param name="use" type="select" label="Method"
                           help="The DefaultDrops method calls cells according to the number of UMIs associated with each barcode as implemented by CellRanger. The EmptyDrops method identifies empty droplets using a smart ambient background model.">
                        <option value="defaultdrops" selected="true">DefaultDrops</option>
                        <option value="emptydrops">EmptyDrops</option>
                    </param>
                    <when value="defaultdrops">
                        <param name="expected" type="integer" min="10" value="3000"
                               label="Expected Number of Cells"
                               help="The expected number of cells in the sample."/>
                        <param name="upperquant" type="float" min="0.1" max="1.0" value="0.99"
                               label="Upper Quantile"
                               help="The quantile of the top expected barcodes to consider."/>
                        <param name="lowerprop" type="float" min="0.1" max="1.0" value="0.1"
                               label="Lower Proportion"
                               help="The fraction of molecules of the upper quantile that must be exceeded for a cell barcode to be detected."/>
                    </when>
                    <when value="emptydrops">
                        <param name="lower" type="integer" min="10" value="100"
                               label="Lower-bound Threshold"
                               help="The lower-bound threshold of the total UMI count at which barcodes beneath this are assumed to be empty droplets."/>
                        <param name="fdr_thresh" type="float" min="0" max="1" value="0.01"
                               label="FDR Threshold"
                               help="False Discovery Rate threshold at which droplets with significant deviations from the ambient profile are detected at this threshold."/>
                    </when>
                </conditional>
                <param name="outformat" type="select" label="Format for output matrices">
                    <option value="directory">Bundled (barcodes.tsv, genes.tsv, matrix.mtx)</option>
                    <option value="h5ad" selected="true">AnnData/H5</option>
                    <option value="tsv">Tabular</option>
                </param>
            </when>
            <when value="barcode_rank">
                <param name="lower" type="integer" min="10" value="100"
                       label="Lower Bound"
                       help="Lower bound on the total UMI count, at or below which all barcodes are assumed to correspond to empty droplets."/>
            </when>
        </conditional>
        <!-- Written to the R settings file as seed.val. -->
        <param name="seed" type="integer" value="100" label="Random Seed"/>
    </inputs>
    <outputs>
        <!-- Single-file filtered matrices: only the one matching the chosen output format is created. -->
        <data name="fileout_tsv" format="tsv" label="${tool.name} Count Matrix on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['outformat'] == 'tsv'</filter>
        </data>
        <data name="fileout_h5ad" format="h5ad" label="${tool.name} AnnData on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['outformat'] == 'h5ad'</filter>
        </data>
        <!-- Bundled output: three files collected from the @TXOUT@ working folder. -->
        <data name="barcodes_out" format="tsv" from_work_dir="@TXOUT@/barcodes.tsv" label="${tool.name} 10X Barcodes on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <data name="genes_out" format="tsv" from_work_dir="@TXOUT@/genes.tsv" label="${tool.name} 10X Genes on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <data name="matrix_out" format="mtx" from_work_dir="@TXOUT@/matrix.mtx" label="${tool.name} 10X Matrices on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <!-- Plot: created for barcode ranking and for EmptyDrops filtering. -->
        <data name="plot" format="png" label="${tool.name} Plot on ${on_string}">
            <filter>operation['use'] == 'barcode_rank' or (operation['use'] == 'filter' and operation['method']['use'] == 'emptydrops')</filter>
        </data>
        <!-- Table: created for EmptyDrops filtering only. -->
        <data name="table" format="tsv" label="${tool.name} Table on ${on_string}">
            <filter>operation['use'] == 'filter' and operation['method']['use'] == 'emptydrops'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Directory input tests -->
        <!-- ::: Default Drops (method left at its default), AnnData output -->
        <test expect_num_outputs="1">
            <expand macro="test_dirin"/>
            <conditional name="operation">
                <param name="use" value="filter"/>
                <param name="outformat" value="h5ad"/><!-- H5AD -->
            </conditional>
            <output name="fileout_h5ad" value="defs_defaultdrops.h5ad" compare="sim_size" delta="10"/>
        </test>
        <!-- ::: Default Drops (method left at its default), tabular output -->
        <test expect_num_outputs="1">
            <expand macro="test_dirin"/>
            <conditional name="operation">
                <param name="use" value="filter"/>
                <param name="outformat" value="tsv"/><!-- TSV -->
            </conditional>
            <output name="fileout_tsv">
                <assert_contents>
                    <has_n_columns n="1026"/>
                    <has_text_matching expression="^\sTCGCCAAGAA\sGGACTTAGAA\sTGCGCCTGAA\sACGGACACAA\sGCGGACACAA"/>
                    <has_text_matching expression="GENE50\s4\s0\s0\s0\s0\s5\s11\s8\s5"/>
                    <has_text_matching expression="GENE100\s6\s3\s5\s0\s0\s16\s31\s15\s26"/>
                </assert_contents>
            </output>
        </test>
        <!-- :: Barcode Ranks -->
        <test expect_num_outputs="1">
            <expand macro="test_dirin"/>
            <conditional name="operation">
                <param name="use" value="barcode_rank"/>
                <param name="lower" value="120"/>
            </conditional>
            <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="200"/>
        </test>
        <!-- ::: Empty Drops -->
        <test expect_num_outputs="3">
            <expand macro="test_dirin"/>
            <conditional name="operation">
                <param name="use" value="filter"/>
                <param name="outformat" value="h5ad"/>
                <conditional name="method">
                    <param name="use" value="emptydrops"/>
                    <param name="lower" value="150"/>
                    <param name="fdr_thresh" value="0.02"/>
                </conditional>
            </conditional>
            <output name="fileout_h5ad" value="defs_emptydrops_150_0002.h5ad" compare="sim_size" delta="10"/>
            <output name="table">
                <assert_contents>
                    <has_n_columns n="9"/>
                    <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited"/>
                    <has_line_matching expression="^1\sGGAAGAAGAA\s0\sNA\sNA\sNA\sNA\sNA\sNA"/>
                    <has_line_matching expression="^11100\sGCTGAAGCAA\s71\sNA\sNA\sNA\sNA\sNA\sNA"/>
                </assert_contents>
            </output>
            <output name="plot" value="defs_emptydrops_150_0002.png" compare="sim_size" delta="100"/>
        </test>
        <!-- Other format input tests -->
        <!-- ::: Empty Drops, same as above but input is h5ad -->
        <test expect_num_outputs="3">
            <conditional name="tenx_format">
                <param name="use" value="h5ad"/>
                <param name="input" value="in_matrix.h5ad"/>
            </conditional>
            <conditional name="operation">
                <param name="use" value="filter"/>
                <param name="outformat" value="h5ad"/>
                <conditional name="method">
                    <param name="use" value="emptydrops"/>
                    <param name="lower" value="150"/>
                    <param name="fdr_thresh" value="0.02"/>
                </conditional>
            </conditional>
            <output name="fileout_h5ad" value="defs_emptydrops_150_0002a.h5ad" compare="sim_size" delta="200"/>
            <output name="table">
                <assert_contents>
                    <has_n_columns n="9"/>
                    <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited"/>
                    <has_line_matching expression="^1\sGGAAGAAGAA\s20\sNA\sNA\sNA\sNA\sNA\sNA"/>
                    <has_line_matching expression="^11100\sGCTGAAGCAA\s203\s-222.833111872316\s9.99900009999e-05\sTRUE\s0.000126279506880773\sTRUE\sTRUE"/>
                </assert_contents>
            </output>
            <output name="plot" value="defs_emptydrops_150_0002a.png" compare="sim_size" delta="100"/>
        </test>
        <!-- Output a directory -->
        <test expect_num_outputs="3">
            <conditional name="tenx_format">
                <param name="use" value="tsv"/>
                <param name="input" value="in_matrix.tsv"/>
            </conditional>
            <conditional name="operation">
                <param name="use" value="filter"/>
                <param name="outformat" value="directory"/>
                <conditional name="method">
                    <param name="use" value="defaultdrops"/>
                    <param name="expected" value="2000"/>
                    <param name="upperquant" value="0.85"/>
                    <!-- Name must match the "lowerprop" input declared under the defaultdrops method.
                         NOTE(review): confirm the expected matrix dimensions below still hold
                         with this value applied. -->
                    <param name="lowerprop" value="0.2"/>
                </conditional>
            </conditional>
            <output name="genes_out" file="in_genes.tsv" ftype="tsv"/><!-- same as the input in this case -->
            <output name="barcodes_out" ftype="tsv">
                <assert_contents>
                    <has_n_columns n="1"/>
                    <has_line line="GGAAGAAGAA"/>
                    <has_line line="TCGAATGGAA"/>
                </assert_contents>
            </output>
            <output name="matrix_out" ftype="mtx">
                <assert_contents>
                    <has_line_matching expression="^100\s2650\s64072$"/>
                    <has_line_matching expression="^19\s1\s1$"/>
                    <has_line_matching expression="^100\s2650\s1$"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Utilities to process 10X data in a variety of formats. Methods include:

*Filter data:*

* **DefaultDrops** emulates the native 10X method to produce a filtered count matrix
* **EmptyDrops** is a more informative quantile-based approach to produce a filtered count matrix.

*Metrics:*

* **RankBarcodes** produces a gap statistic plot to estimate the number of barcodes detected in the sample matrix.

*Background*

Droplet-based single-cell RNA sequencing (scRNA-seq) technologies allow researchers to obtain transcriptome-wide expression profiles for thousands of cells at once. Briefly, each cell is encapsulated in a droplet in an oil-water emulsion, along with a bead containing reverse transcription primers with a unique barcode sequence. After reverse transcription inside the droplet, each cell’s cDNA is labelled with that barcode (referred to as a “cell barcode”). Bursting of the droplets yields a pool of cDNA for library preparation and sequencing. Debarcoding of the sequences can then be performed to obtain the expression profile for each cell.

This tool implements some general utilities for handling these data after quantification of expression. In particular, we focus on the 10X Genomics platform, providing functions to load in the matrix of unique molecule identifier (UMI) counts as well as the raw molecule information. Functions are also available for downsampling the UMI count matrix or the raw reads; for distinguishing cells from empty droplets, based on the UMI counts; and to eliminate the effects of barcode swapping on the Illumina 4000 sequencing machine.
]]></help>
    <!-- Publication to cite for this tool, identified by DOI. -->
    <citations>
        <citation type="doi">10.1186/s13059-019-1662-y</citation>
    </citations>
</tool>