Mercurial > repos > iuc > dropletutils
changeset 6:8855361fcfc5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dropletutils/ commit ed0625fe59342d14a08745996e3e32c6f922a738"
author | iuc |
---|---|
date | Thu, 10 Dec 2020 13:50:06 +0000 |
parents | cdf4443d5625 |
children | 2c1200fba922 |
files | dropletutils.xml scripts/dropletutils.Rscript test-data/defs_defaultdrops.h5 test-data/defs_emptydrops_150_0002.h5 test-data/defs_emptydrops_150_0002.png test-data/defs_emptydrops_150_0002a.h5 test-data/defs_emptydrops_150_0002a.png |
diffstat | 7 files changed, 132 insertions(+), 99 deletions(-) [+] |
line wrap: on
line diff
--- a/dropletutils.xml Wed Jan 29 15:07:38 2020 -0500 +++ b/dropletutils.xml Thu Dec 10 13:50:06 2020 +0000 @@ -1,9 +1,22 @@ <?xml version="1.0" encoding="utf-8"?> -<tool id="dropletutils" name="DropletUtils" version="@PACKAGE_VERSION@+@GALAXY_VERSION@" > +<tool id="dropletutils" name="DropletUtils" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" > <description>Utilities for handling droplet-based single-cell RNA-seq data</description> + <xrefs> + <xref type="bio.tools">dropletutils</xref> + </xrefs> + <edam_topics> + <edam_topic>topic_0203</edam_topic> + <edam_topic>topic_3168</edam_topic> + <edam_topic>topic_3170</edam_topic> + <edam_topic>topic_3308</edam_topic> + </edam_topics> + <edam_operations> + <edam_operation>operation_1812</edam_operation> + <edam_operation>operation_3200</edam_operation> + </edam_operations> <macros> - <token name="@PACKAGE_VERSION@" >1.2.1</token> - <token name="@GALAXY_VERSION@" >galaxy6</token> + <token name="@TOOL_VERSION@">1.10.0</token> + <token name="@VERSION_SUFFIX@">0</token> <token name="@TXIN@">tenx.input</token> <token name="@TXOUT@">tenx.output</token> <xml name="test_dirin" > @@ -16,9 +29,10 @@ </xml> </macros> <requirements> - <requirement type="package" version="@PACKAGE_VERSION@">bioconductor-dropletutils</requirement> - <requirement type="package" version="1.2_17" >r-matrix</requirement> - <requirement type="package" version="1.10.1" >bioconductor-scater</requirement> + <requirement type="package" version="@TOOL_VERSION@">bioconductor-dropletutils</requirement> + <requirement type="package" version="1.18.0" >bioconductor-scater</requirement> + <requirement type="package" version="4.0">r-base</requirement> + <requirement type="package" version="1.2_18">r-matrix</requirement> <requirement type="package" version="1">fonts-conda-ecosystem</requirement> </requirements> <version_command><![CDATA[ @@ -45,14 +59,14 @@ <configfiles> <configfile name="droplet_conf" > ## defaults -empty.fdr_threshold = 0.01 -eparams=formals(emptyDrops) -dparams=formals(defaultDrops) -bparams=formals(barcodeRanks) +empty_fdr_threshold = 0.01 +eparams = formals(emptyDrops) +dparams = formals(defaultDrops) +bparams = formals(barcodeRanks) ## File params -in.type='$tenx_format.use' -out.type=NULL +intype='$tenx_format.use' +outtype=NULL files=list() files\$table='$table' @@ -74,10 +88,10 @@ #else if str($operation.method.use) == 'emptydrops': do.method="emptyDrops" eparams\$lower=as.integer('$operation.method.lower') -empty.fdr_threshold=as.numeric('$operation.method.fdr_thresh') +empty_fdr_threshold=as.numeric('$operation.method.fdr_thresh') #end if -out.type='$operation.outformat' +outtype='$operation.outformat' #if str($operation.outformat) == 'directory': files\$out='@TXOUT@' #else if str($operation.outformat) == 'h5': @@ -172,8 +186,8 @@ </outputs> <tests> <!-- Directory input tests --> - <!-- ::: Default Drops --> <test expect_num_outputs="1"> + <!-- ::: Default Drops --> <expand macro="test_dirin" /> <conditional name="operation"> <param name="use" value="filter" /> @@ -196,17 +210,17 @@ </assert_contents> </output> </test> - <!-- :: Barcode Ranks --> <test expect_num_outputs="1"> + <!-- :: Barcode Ranks --> <expand macro="test_dirin" /> <conditional name="operation"> <param name="use" value="barcode_rank" /> <param name="lower" value="120" /> </conditional> - <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="400"/> + <output name="plot" value="defs_barcoderankings.png" compare="sim_size" delta="600"/> </test> - <!-- ::: Empty Drops --> <test expect_num_outputs="3"> + <!-- ::: Empty Drops --> <expand macro="test_dirin" /> <conditional name="operation"> <param name="use" value="filter" /> @@ -221,16 +235,16 @@ <output name="table" > <assert_contents> <has_n_columns n="9" /> - <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited" /> - <has_line_matching expression="^994\sGGCATTACAA\s338\s-246.922772388055\s9.99900009999e-05\sTRUE\s9.99900009999e-05\sTRUE\sTRUE" /> - <has_line_matching expression="^998\sCATGAAGCAA\s151\s-166.644236503983\s9.99900009999e-05\sTRUE\s9.99900009999e-05\sTRUE\sTRUE" /> + <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" /> + <has_line_matching expression="^994\sGGCATTACAA\s338\s-246\.(.*TRUE){3}$" /> + <has_line_matching expression="^998\sCATGAAGCAA\s151\s-166\.(.*TRUE){3}$" /> </assert_contents> </output> <output name="plot" value="defs_emptydrops_150_0002.png" compare="sim_size" delta="400" /> </test> <!-- Other format input tests --> - <!-- ::: Empty Drops, same as above but input is h5 --> <test expect_num_outputs="3"> + <!-- ::: Empty Drops, same as above but input is h5 --> <conditional name="tenx_format" > <param name="use" value="h5" /> <param name="input" value="in_matrix.h5" /> @@ -248,9 +262,9 @@ <output name="table" > <assert_contents> <has_n_columns n="9" /> - <has_line_matching expression="^\sbar.names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis.Cell\sis.CellAndLimited" /> - <has_line_matching expression="^1100\sCCGGAAGCAA\s169\s-198.117943099773\s9.99900009999e-05\sTRUE\s0.000126279506880773\sTRUE\sTRUE" /> - <has_line_matching expression="^1114\sTCCGAAGCAA\s182\s-196.181449214729\s9.99900009999e-05\sTRUE\s0.000126279506880773\sTRUE\sTRUE" /> + <has_line_matching expression="^\sbar_names\sTotal\sLogProb\sPValue\sLimited\sFDR\sis_cell\sis_cellandlimited" /> + <has_line_matching expression="^1100\sCCGGAAGCAA\s169\s-198\.(.*TRUE){3}$" /> + <has_line_matching expression="^1114\sTCCGAAGCAA\s182\s-196\.(.*TRUE){3}$" /> </assert_contents> </output> <output name="plot" value="defs_emptydrops_150_0002a.png" compare="sim_size" delta="400" />
--- a/scripts/dropletutils.Rscript Wed Jan 29 15:07:38 2020 -0500 +++ b/scripts/dropletutils.Rscript Thu Dec 10 13:50:06 2020 +0000 @@ -1,6 +1,6 @@ ## Load in data -args = commandArgs(trailingOnly = T) -if (length(args) != 1){ +args <- commandArgs(trailingOnly = T) +if (length(args) != 1) { stop("Please provide the config file") } @@ -11,57 +11,60 @@ source(args[1]) ## Helper functions -setSparse <- function(obj){ +set_sparse <- function(obj) { return(as(obj, "dgCMatrix")) } -writeTSV <- function(fileout, obj){ - write.table(as.matrix(obj), file=fileout, col.names=NA, sep='\t', quote=FALSE) +write_tsv <- function(fileout, obj) { + write.table(as.matrix(obj), file = fileout, + col.names = NA, sep = "\t", quote = FALSE) } -determineGeneIDs <- function(object){ - if (!is.null(rowData(object)$Symbol)){ +determine_geneids <- function(object) { + if (!is.null(rowData(object)$Symbol)) { return(rowData(object)$Symbol) } return(rownames(object)) } -getCounts <- function(object){ - return(Matrix(counts(object), sparse=TRUE)) +get_counts <- function(object) { + return(Matrix(counts(object), sparse = TRUE)) } -writeOut <- function(object, fileout, typeout){ - if (typeout == "tsv"){ - writeTSV(fileout, getCounts(object)) +write_out <- function(object, fileout, typeout) { + if (typeout == "tsv") { + write_tsv(fileout, get_counts(object)) } - else if (typeout == "h5"){ - write10xCounts(fileout, getCounts(object), - type="HDF5", - gene.symbol=determineGeneIDs(object), - overwrite=TRUE) + else if (typeout == "h5") { + write10xCounts(fileout, get_counts(object), + type = "HDF5", + gene.symbol = determine_geneids(object), + overwrite = TRUE) } - else if (typeout == "directory"){ - write10xCounts(fileout, getCounts(object), - type="sparse", - gene.symbol=determineGeneIDs(object), - overwrite=TRUE) + else if (typeout == "directory") { + write10xCounts(fileout, get_counts(object), + type = "sparse", + gene.symbol = determine_geneids(object), + overwrite = TRUE) } } -read10xFiles <- function(filein, typein){ +read_10x_files <- function(filein, typein) { sce <- NULL - if (typein == "tsv"){ + if (typein == "tsv") { ## Exploding memory problems occured here ## - solution is to use the readSparseCounts function from scater - sce <- SingleCellExperiment(assays = list(counts = readSparseCounts(filein))) - } - else if (typein == "h5"){ - sce <- read10xCounts(filein, col.names=T, type="HDF5") # use barcodes.tsv as column names + sce <- SingleCellExperiment(assays = + list(counts = readSparseCounts(filein))) } - else if (typein == "directory"){ - sce <- read10xCounts(filein, col.names=T, type="sparse") + else if (typein == "h5") { + # use barcodes.tsv as column names + sce <- read10xCounts(filein, col.names = T, type = "HDF5") } - counts(sce) <- setSparse(counts(sce)) + else if (typein == "directory") { + sce <- read10xCounts(filein, col.names = T, type = "sparse") + } + counts(sce) <- set_sparse(counts(sce)) return(sce) } @@ -69,97 +72,113 @@ ## Methods -doEmptyDrops <- function(files, eparams, in.type="directory", out.type="h5", fdr_threshold = 0.01){ - sce <- read10xFiles(files$infile, in.type) +do_empty_drops <- function(files, eparams, intype = "directory", + outtype = "h5", fdr_threshold = 0.01) { + sce <- read_10x_files(files$infile, intype) + + eparams$... <- NULL ## hack to remove other parameters from being + eparams$m <- Matrix(counts(sce), sparse = TRUE) + + ## Determine sensible lowerbound + m_stats <- summary(colSums(counts(sce))) + print("Cell Library Size Distribution:") + print(m_stats) - eparams$... <- NULL ## hack - eparams$m = Matrix(counts(sce), sparse=TRUE) + if (m_stats["Min."] > eparams$lower) { + print(paste0("CAUTION: Min. Lib. Size (", m_stats["Min."] + , ") < requested lowerbound (", eparams$lower, ")")) + message(paste0("Setting lowerbound to Mean: ", m_stats["Mean"])) + eparams$lower <- m_stats["Mean"] + } - e.out <- do.call(emptyDrops, c(eparams)) + e_out <- do.call(emptyDrops, c(eparams)) - bar.names <- colnames(sce) - if (length(bar.names) != nrow(e.out)){ + bar_names <- colnames(sce) + if (length(bar_names) != nrow(e_out)) { stop("Length of barcodes and output metrics don't match.") } - e.out <- cbind(bar.names, e.out) - e.out$is.Cell <- e.out$FDR <= fdr_threshold - e.out$is.CellAndLimited <- e.out$is.Cell & e.out$Limited + e_out <- cbind(bar_names, e_out) + e_out$is_cell <- e_out$FDR <= fdr_threshold + e_out$is_cellandlimited <- e_out$is_cell & e_out$Limited ## Write to Plot - e.out$is.Cell[is.na(e.out$is.Cell)] <- FALSE - xlim.dat <- e.out[complete.cases(e.out),]$Total + e_out$is_cell[is.na(e_out$is_cell)] <- FALSE + xlim_dat <- e_out[complete.cases(e_out), ]$Total ## Write to table - writeTSV(files$table, e.out[complete.cases(e.out),]) + write_tsv(files$table, e_out[complete.cases(e_out), ]) png(files$plot) - plot(e.out$Total, -e.out$LogProb, col=ifelse(e.out$is.Cell, "red", "black"), - xlab="Total UMI count", ylab="-Log Probability", - xlim=c(min(xlim.dat),max(xlim.dat))) + plot(e_out$Total, -e_out$LogProb, col = ifelse(e_out$is_cell, + "red", "black"), + xlab = "Total UMI count", ylab = "-Log Probability", + xlim = c(min(xlim_dat), max(xlim_dat))) dev.off() ## Filtered called <- NULL - if (fdr_threshold != 0){ - called <- e.out$is.CellAndLimited + if (fdr_threshold != 0) { + called <- e_out$is_cellandlimited } else { - called <- e.out$is.Cell + called <- e_out$is_cell } called[is.na(called)] <- FALSE # replace NA's with FALSE - sce.filtered <- sce[,called] + sce_filtered <- sce[, called] - writeOut(sce.filtered, files$out, out.type) + write_out(sce_filtered, files$out, outtype) - message(paste("Cells:", sum(na.omit(e.out$is.Cell)))) - message(paste("Cells and Limited:", sum(na.omit(e.out$is.CellAndLimited)))) + message(paste("Cells:", sum(na.omit(e_out$is_cell)))) + message(paste("Cells and Limited:", sum(na.omit(e_out$is_cellandlimited)))) } -doDefaultDrops <- function(files, dparams, in.type="directory", out.type="h5"){ - sce <- read10xFiles(files$infile, in.type) +do_default_drops <- function(files, dparams, + intype = "directory", outtype = "h5") { + sce <- read_10x_files(files$infile, intype) - dparams$m = counts(sce) + dparams$m <- counts(sce) called <- do.call(defaultDrops, c(dparams)) # Filtered - sce.filtered <- sce[,called] + sce_filtered <- sce[, called] - writeOut(sce.filtered, files$out, out.type) + write_out(sce_filtered, files$out, outtype) message(paste("Cells:", sum(called))) } - -doBarcodeRankings <- function(files, bparams, in.type="directory"){ - sce <- read10xFiles(files$infile, in.type) +do_barcode_rankings <- function(files, bparams, intype = "directory") { + sce <- read_10x_files(files$infile, intype) bparams$... <- NULL ## hack - bparams$m = counts(sce) + bparams$m <- counts(sce) - br.out <- do.call(barcodeRanks, c(bparams)) + brout <- do.call(barcodeRanks, c(bparams)) png(files$plot) - plot(br.out$rank, br.out$total, log="xy", xlab="(log) Rank", ylab="(log) Total Number of Barcodes") - o <- order(br.out$rank) - lines(br.out$rank[o], br.out$fitted[o], col="red") + plot(brout$rank, brout$total, log = "xy", + xlab = "(log) Rank", ylab = "(log) Total Number of Barcodes") + o <- order(brout$rank) + lines(brout$rank[o], brout$fitted[o], col = "red") - abline(h=br.out$knee, col="dodgerblue", lty=2) - abline(h=br.out$inflection, col="forestgreen", lty=2) - legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection")) + abline(h = brout$knee, col = "dodgerblue", lty = 2) + abline(h = brout$inflection, col = "forestgreen", lty = 2) + legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"), + legend = c("knee", "inflection")) dev.off() - print(paste("knee =", br.out$knee, ", inflection = ", br.out$inflection)) + print(paste("knee =", brout$knee, ", inflection = ", brout$inflection)) } ## Main set.seed(seed.val) if (do.method == "barcodeRankings") { - doBarcodeRankings(files, bparams, in.type) + do_barcode_rankings(files, bparams, intype) } else if (do.method == "defaultDrops") { - doDefaultDrops(files, dparams, in.type, out.type) + do_default_drops(files, dparams, intype, outtype) } else if (do.method == "emptyDrops") { - doEmptyDrops(files, eparams, in.type, out.type, empty.fdr_threshold) + do_empty_drops(files, eparams, intype, outtype, empty_fdr_threshold) }