view dropletutils.xml @ 8:a9caad671439 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit 9d465ee72398a8a74fe499a2e4b74f507a5845cc"
author iuc
date Mon, 05 Jul 2021 13:40:00 +0000
parents 2c1200fba922
children
line wrap: on
line source

<?xml version="1.0" encoding="utf-8"?>
<tool id="dropletutils" name="DropletUtils" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" >
    <description>Utilities for handling droplet-based single-cell RNA-seq data</description>
    <xrefs>
        <xref type="bio.tools">dropletutils</xref>
    </xrefs>
    <edam_topics>
        <edam_topic>topic_0203</edam_topic>
        <edam_topic>topic_3168</edam_topic>
        <edam_topic>topic_3170</edam_topic>
        <edam_topic>topic_3308</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_1812</edam_operation>
        <edam_operation>operation_3200</edam_operation>
    </edam_operations>
    <macros>
        <token name="@TOOL_VERSION@">1.10.0</token>
        <token name="@VERSION_SUFFIX@">2</token>
        <token name="@TXIN@">tenx.input</token>
        <token name="@TXOUT@">tenx.output</token>
        <xml name="test_dirin" >
            <conditional name="tenx_format">
                <param name="use" value="directory" />
                <param name="input" value="in_matrix.mtx" />
                <param name="input_barcodes" value="in_barcodes.tsv" />
                <param name="input_genes" value="in_genes.tsv" />
            </conditional>
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">bioconductor-dropletutils</requirement>
        <requirement type="package" version="1.18.0" >bioconductor-scater</requirement>
        <requirement type="package" version="4.0">r-base</requirement>
        <requirement type="package" version="1.2_18">r-matrix</requirement>
        <requirement type="package" version="1">fonts-conda-ecosystem</requirement>
    </requirements>
    <version_command><![CDATA[
    Rscript -e 'packageVersion("DropletUtils")' | grep -oP '\d\.\d\.\d'
    ]]>
    </version_command>
    <command detect_errors='exit_code'><![CDATA[
#if str($tenx_format.use) == 'directory':
    mkdir '@TXIN@' &&
    ln -s '$tenx_format.input' '@TXIN@/matrix.mtx' &&
    ln -s '$tenx_format.input_genes' '@TXIN@/genes.tsv' &&
    ln -s '$tenx_format.input_barcodes' '@TXIN@/barcodes.tsv' &&
#end if

#if str($operation.use) == 'filter':
    #if str($operation.outformat) == 'directory':
    mkdir '@TXOUT@' &&
    #end if
#end if

Rscript '$__tool_directory__/scripts/dropletutils.Rscript' '$droplet_conf'
    ]]>
    </command>
    <configfiles>
        <configfile name="droplet_conf" >
## defaults
empty_fdr_threshold = 0.01
eparams = formals(emptyDrops)
dparams = formals(defaultDrops)
bparams = formals(barcodeRanks)

## File params
intype='$tenx_format.use'
outtype=NULL

files=list()
files\$table='$table'
files\$plot='$plot'
files\$out=NULL

#if str($tenx_format.use) == 'directory':
files\$infile='@TXIN@'
#else
files\$infile='$input'
#end if

#if str($operation.use) == 'filter':
    #if str($operation.method.use) == 'defaultdrops':
do.method="defaultDrops"
dparams\$expected=as.integer('$operation.method.expected')
dparams\$upper.quant=as.numeric('$operation.method.upperquant')
dparams\$lower.prop=as.numeric('$operation.method.lowerprop')
    #else if str($operation.method.use) == 'emptydrops':
do.method="emptyDrops"
eparams\$lower=as.integer('$operation.method.lower')
empty_fdr_threshold=as.numeric('$operation.method.fdr_thresh')
    #end if

outtype='$operation.outformat'
    #if str($operation.outformat) == 'directory':
files\$out='@TXOUT@'
    #else if str($operation.outformat) == 'h5':
files\$out='$fileout_h5'
    #else if str($operation.outformat) == 'tsv':
files\$out='$fileout_tsv'
    #end if

#else if str($operation.use) == 'barcode_rank':
do.method="barcodeRankings"
bparams\$lower=as.integer('$operation.lower')
#end if

seed.val=as.integer('$seed')

        </configfile>
    </configfiles>
    <inputs>
        <conditional name="tenx_format">
            <param name="use" type="select" label="Format for the input matrix">
                <option value="directory" >Bundled (barcodes.tsv, genes.tsv, matrix.mtx)</option>
                <option value="h5" selected="true" >H5</option>
                <option value="tsv">Tabular</option>
            </param>
            <when value="directory">
                <param name="input" type="data" format="mtx,txt" label="Count Data"/>
                <param name="input_genes" type="data" format="tabular,tsv,txt" label="Genes List"/>
                <param name="input_barcodes" type="data" format="tabular,tsv,txt" label="Barcodes List"/>
            </when>
            <when value="h5">
                <param name="input" type="data" format="h5" label="Count Data"/>
            </when>
            <when value="tsv">
                <param name="input" type="data" format="tabular,tsv" label="Count Data" help="The first column must contain gene names, and the first row must contain barcode identifiers." />
            </when>
        </conditional>
        <conditional name="operation" >
            <param name="use" type="select" label="Operation" help="The filtering uses intelligent methods to generate output 10X matrices as would be otherwise generated by CellRanger. The ranking option serves to illustrate the number of detected barcodes in the sample.">
                <option value="filter" selected="true">Filter for Barcodes</option>
                <option value="barcode_rank">Rank Barcodes</option>
            </param>
            <when value="filter">
                <conditional name="method">
                    <param name="use" type="select" label="Method" help="The DefaultDrops method calls cells according to the number of UMIs associated with each barcode as implemented by CellRanger. The EmptyDrops method identifies empty droplets using a smart ambient background model." >
                        <option value="defaultdrops" selected="true">DefaultDrops</option>
                        <option value="emptydrops">EmptyDrops</option>
                    </param>
                    <when value="defaultdrops">
                        <param name="expected" type="integer" min="1" value="3000" label="Expected Number of Cells" help="The expected number of cells in the sample." />
                        <param name="upperquant" type="float" min="0" max="1.0" value="0.99" label="Upper Quantile" help="The quantile of the top expected barcodes to consider." />
                        <param name="lowerprop" type="float" min="0" max="1.0" value="0.1" label="Lower Proportion" help="The fraction of molecules of the upper quantile that must be exceeded for a cell barcode to be detected." />
                    </when>
                    <when value="emptydrops">
                        <param name="lower" type="integer" min="1" value="100" label="Lower-bound Threshold" help="The lower-bound threshold of the total UMI count at which barcodes beneath this are assumed to be empty droplets." />
                        <param name="fdr_thresh" type="float" min="0" max="1" value="0.01" label="FDR Threshold" help="False Discovery Rate threshold at which droplets with significant deviations from the ambient profile are detected at this threshold. If set to 0, the threshold is ignored." />
                    </when>
                </conditional>
                <param name="outformat" type="select" label="Format for output matrices">
                    <option value="directory" >Bundled (barcodes.tsv, genes.tsv, matrix.mtx)</option>
                    <option value="h5" selected="true" >H5</option>
                    <option value="tsv">Tabular</option>
                </param>
            </when>
            <when value="barcode_rank">
                <param name="lower" type="integer" min="1" value="100" label="Lower Bound" help="Lower bound on the total UMI count, at or below which all barcodes are assumed to correspond to empty droplets."/>
            </when>
        </conditional>
        <param name="seed" type="integer" value="100" label="Random Seed" />
    </inputs>
    <outputs>
        <data name="fileout_tsv" format="tsv" label="${tool.name} Count Matrix on ${on_string}">
            <filter>operation['use']=='filter' and operation['outformat'] == 'tsv' </filter>
        </data>
        <data name="fileout_h5" format="h5" label="${tool.name} H5 Matrix on ${on_string}">
            <filter>operation['use']=='filter' and operation['outformat'] == 'h5'</filter>
        </data>
        <data name="barcodes_out" format="tsv" from_work_dir="@TXOUT@/barcodes.tsv" label="${tool.name} 10X Barcodes on ${on_string}" >
            <filter>operation['use']=='filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <data name="genes_out" format="tsv" from_work_dir="@TXOUT@/genes.tsv" label="${tool.name} 10X Genes on ${on_string}" >
            <filter>operation['use']=='filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <data name="matrix_out" format="mtx" from_work_dir="@TXOUT@/matrix.mtx" label="${tool.name} 10X Matrices on ${on_string}" >
            <filter>operation['use']=='filter' and operation['outformat'] == 'directory'</filter>
        </data>
        <data name="plot" format="png" label="${tool.name} Plot on ${on_string}">
            <filter>(operation['use']=='filter' and operation['method']['use']=="emptydrops") or operation['use']=='barcode_rank'</filter>
        </data>
        <data name="table" format="tsv" label="${tool.name} Table on ${on_string}">
            <filter>operation['use']=='filter' and operation['method']['use']=="emptydrops"</filter>
        </data>
    </outputs>
    <tests>
        <!-- Directory input tests -->
        <test expect_num_outputs="1">
            <!-- ::: Default Drops -->
            <expand macro="test_dirin" />
            <conditional name="operation">
                <param name="use" value="filter" />
                <param name="outformat" value="h5" /><!-- H5 -->
            </conditional>
            <output name="fileout_h5" value="defs_defaultdrops.h5" compare="sim_size" delta="10" />
        </test>
        <test expect_num_outputs="1">
            <expand macro="test_dirin" />
            <conditional name="operation">
                <param name="use" value="filter" />
                <param name="outformat" value="tsv" /><!-- TSV -->
            </conditional>
            <output name="fileout_tsv" >
                <assert_contents>
                    <has_n_columns n="1026" />
                    <has_text_matching expression="^\sTCGCCAAGAA\sGGACTTAGAA\sTGCGCCTGAA\sACGGACACAA\sGCGGACACAA" />
                    <has_text_matching expression="GENE50\s4\s0\s0\s0\s0\s5\s11\s8\s5" />
                    <has_text_matching expression="GENE100\s6\s3\s5\s0\s0\s16\s31\s15\s26" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <!-- :: Barcode Ranks -->
            <expand macro="test_dirin" />
            <conditional name="operation">
                <param name="use" value="barcode_rank" />
                <param name="lower" value="120" />
            </conditional>
            <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="600"/>
            <assert_stdout>
                <has_line line='[1] "knee = 324 , inflection =  151"'/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="3">
            <!-- ::: Empty Drops -->
            <expand macro="test_dirin" />
            <conditional name="operation">
                <param name="use" value="filter" />
                <param name="outformat" value="h5" />
                <conditional name="method">
                    <param name="use" value="emptydrops" />
                    <param name="lower" value="150" />
                    <param name="fdr_thresh" value="0.02" />
                </conditional>
            </conditional>
            <output name="fileout_h5" value="defs_emptydrops_150_0002.h5" compare="sim_size" delta="10" />
            <output name="table" >
                <assert_contents>
                    <has_n_columns n="9" />
                    <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" />
                    <has_line_matching expression="^994\sGGCATTACAA\s338\s-246\.(.*TRUE){3}$" />
                    <has_line_matching expression="^998\sCATGAAGCAA\s151\s-166\.(.*TRUE){3}$" />
                </assert_contents>
            </output>
            <output name="plot" value="defs_emptydrops_150_0002.png" compare="sim_size" delta="400" />
        </test>
        <!-- Other format input tests -->
        <test expect_num_outputs="3">
            <!-- ::: Empty Drops, same as above but input is h5 -->
            <conditional name="tenx_format" >
                <param name="use" value="h5" />
                <param name="input" value="in_matrix.h5" />
            </conditional>
            <conditional name="operation">
                <param name="use" value="filter" />
                <param name="outformat" value="h5" />
                <conditional name="method">
                    <param name="use" value="emptydrops" />
                    <param name="lower" value="150" />
                    <param name="fdr_thresh" value="0.02" />
                </conditional>
            </conditional>
            <output name="fileout_h5" value="defs_emptydrops_150_0002a.h5" compare="sim_size" delta="400" />
            <output name="table" >
                <assert_contents>
                    <has_n_columns n="9" />
                    <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" />
                    <has_line_matching expression="^1100\sCCGGAAGCAA\s169\s-198\.(.*TRUE){3}$" />
                    <has_line_matching expression="^1114\sTCCGAAGCAA\s182\s-196\.(.*TRUE){3}$" />
                </assert_contents>
            </output>
            <output name="plot" value="defs_emptydrops_150_0002a.png" compare="sim_size" delta="400" />
        </test>
        <!-- Output a directory -->
        <test expect_num_outputs="3">
            <conditional name="tenx_format" >
                <param name="use" value="tsv" />
                <param name="input" value="in_matrix.tsv" />
            </conditional>
            <conditional name="operation">
                <param name="use" value="filter" />
                <param name="outformat" value="directory" />
                <conditional name="method">
                    <param name="use" value="defaultdrops" />
                    <param name="expected" value="2000" />
                    <param name="upperquant" value="0.85" />
                    <param name="lowerpopr" value="0.2" />
                </conditional>
            </conditional>
            <output name="genes_out" file="in_genes.tsv" ftype="tsv" /><!-- same as the input in this case -->
            <output name="barcodes_out" ftype="tsv" >
                <assert_contents>
                    <has_n_columns n="1" />
                    <has_line line="GGAAGAAGAA" />
                    <has_line line="TCGAATGGAA" />
                </assert_contents>
            </output>
            <output name="matrix_out" ftype="mtx" >
                <assert_contents>
                    <has_line_matching expression="^100\s2650\s64072$" />
                    <has_line_matching expression="^19\s1\s1$" />
                    <has_line_matching expression="^100\s2650\s1$" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Utilities to process 10X data in a variety of formats. Methods include:

*Filter data:*

* **DefaultDrops** emulates the native 10X method to produce a filtered count matrix
* **EmptyDrops** is a more informative quantile-based approach to produce a filtered count matrix.

*Metrics:*

* **RankBarcodes** produces a gap statistic plot to estimate the number of barcodes detected in the sample matrix.

*Background*

Droplet-based single-cell RNA sequencing (scRNA-seq) technologies allow researchers to obtain transcriptome-wide expression profiles for thousands of cells at once. Briefly, each cell is encapsulated in a droplet in a oil-water emulsion, along with a bead containing reverse transcription primers with a unique barcode sequence. After reverse transcription inside the droplet, each cell’s cDNA is labelled with that barcode (referred to a “cell barcode”). Bursting of the droplets yields a pool of cDNA for library preparation and sequencing. Debarcoding of the sequences can then be performed to obtain the expression profile for each cell.

This tool implements some general utilities for handling these data after quantification of expression. In particular, we focus on the 10X Genomics platform, providing functions to load in the matrix of unique molecule identifier (UMI) counts as well as the raw molecule information. Functions are also available for downsampling the UMI count matrix or the raw reads; for distinguishing cells from empty droplets, based on the UMI counts; and to eliminate the effects of barcode swapping on Illumina 4000 sequencing machine.
]]></help>
    <citations>
        <citation type="doi">10.1186/s13059-019-1662-y</citation>
    </citations>
</tool>