Mercurial > repos > eschen42 > w4mkmeans
diff w4m_general_purpose_routines.R @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | c415b7dc6f37 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/w4m_general_purpose_routines.R Tue Aug 08 15:30:38 2017 -0400 @@ -0,0 +1,283 @@ +# prepare.data.matrix - Prepare x.datamatrix for multivariate statistical analaysis (MVA) +# - Motivation: +# - Selection: +# - You may want to exclude several samples from your analysis: +# - If so, set the argument 'exclude.samples' to a vector of sample names +# - You may want to exclude several features or features from your analysis: +# - If so, set the argument 'exclude.features' to a vector of feature names +# - Renaming samples: +# - You may want to rename several samples from your analysis: +# - If so, set the argument 'sample.rename.function' to a function accepting a vector +# of sample names and producing a vector of strings of equivalent length +# - MVA is confounded by missing values. +# - By default, this function imputes missing values as zero. +# - For a different imputation, set the 'data.imputation' argument to a function +# accepting a single matrix argument and returning a matrix of the same +# dimensions as the argument. +# - Transformation +# - It may be desirable to transform the intensity data to reduce the range. +# - By default, this function performs an eigth-root transformation: +# - Any root-tranformation has the advantage of never being negative. +# - Calculation of the eight-root is four times faster in my hands than log10. +# - However, it has the disadvantage that calculation of fold-differences +# is not additive as with log-transformation. +# - Rather, you must divide the values and raise to the eighth power. +# - For a different transformation, set the 'data.transformation' argument +# to a function accepting a single matrix argument. +# - The function should be written to return a matrix of the same dimensions +# as the argument. +# arguments: +# - x.matrix - matrix of intensities (or data.frame of sample metadata) +# - one row per sample +# - one column per feature or metadata attribute +# - exclude.samples - vector of labels of matrix rows (samples) to omit from analysis +# - exclude.features - vector of labels of matrix columnss (features) to omit from analysis +# - sample.rename.function - function to be used to rename rows if necessary, or NULL +# - e.g., sample.rename.function = function(x) { +# sub("(.*)_.*","\\1", row.names(x)) +# } +# - data.imputation - function applied to matrix to impute missing values +# - e.g., data.imputation = function(m) { +# m[is.na(m)] <- min(m, na.rm = TRUE) / 100 +# return (m) +# } +# - data.transformation - function applied to matrix cells +# - e.g., data.transformation = function(x) { return( log10(x) ) } +# or, data.transformation = log10 +# result value: +# transformed, imputed x.datamatrix with renamed rows and with neither excluded values nor features +# +################################ +## +## Notes regarding the effectiveness and performance of the data transformation method. +## +## The two transformations that I tried (log10 and 8th root) required different imputation methods. +## +## For the LCMS resin data set that I was working with, separation in MVA was nearly equivalent for: +## data.imputation <- function(x.matrix) { +## x.matrix[is.na(x.matrix)] <- 0 +## return (x.matrix) +## } +## data.transformation <- function(x) { +## sqrt( sqrt( sqrt(x) ) ) +## } +## and +## data.imputation <- function(x.matrix) { +## x.matrix[is.na(x.matrix)] <- min(x.matrix, na.rm = TRUE) / 100 +## return (x.matrix) +## } +## data.transformation <- function(x) { +## log10(x) +## } +## +## Note further that triple application of the square root: +## - may be four times faster than log10: +## - may be three times faster than log2: +## +## system.time( junk <- sqrt( sqrt( sqrt(1:100000000) ) ) ) +## user system elapsed +## 0.832 0.236 1.069 +## system.time( junk <- log10(1:100000000) ) +## user system elapsed +## 3.936 0.400 4.337 +## system.time( junk <- log2(1:100000000) ) +## user system elapsed +## 2.784 0.320 3.101 +## +################################ +# +prepare.data.matrix <- function( + x.matrix +, exclude.samples = NULL +, exclude.features = NULL +, sample.rename.function = NULL +, data.imputation = + function(m) { + # replace NA values with zero + m[is.na(m)] <- 0 + # replace negative values with zero, if applicable (It should never be applicable!) + if (min(m < 0)) { + m <- matrix(lapply(X = m, FUN = function(z) {max(z,0)}), nrow = nrow(m) ) + } + # return matrix as the result + return (m) + } +, data.transformation = function(x) { + sqrt( sqrt( sqrt(x) ) ) + } +, en = new.env() +) { + # MatVar - Compute variance of rows or columns of a matrix + # ref: http://stackoverflow.com/a/25100036 + # For row variance, dim == 1, for col variance, dim == 2 + MatVar <- function(x, dim = 1) { + if (dim == 1) { + dim.x.2 <- dim(x)[2] + if ( dim.x.2 == 0 ) + stop("MatVar: there are zero columns") + if ( dim.x.2 == 1 ) { + stop("MatVar: a single column is insufficient to calculate a variance") + # return ( rep.int(x = 0, times = nrow(x)) ) + } else { + return ( rowSums( (x - rowMeans(x))^2 ) / ( dim(x)[2] - 1 ) ) + } + } else if (dim == 2) { + dim.x.1 <- dim(x)[1] + if ( dim.x.1 == 0 ) { + stop("MatVar: there are zero rows") + } + if ( dim.x.1 == 1 ) { + stop("MatVar: a single row is insufficient to calculate a variance") + # return ( rep.int(x = 0, times = ncol(x)) ) + } else { + return ( rowSums( (t(x) - colMeans(x))^2 ) / ( dim(x)[1] - 1 ) ) + } + } else stop("Please enter valid dimension, for rows, dim = 1; for colums, dim = 2") + } + + nonzero.var <- function(x) { + if (nrow(x) == 0) { + print(str(x)) + stop("matrix has no rows") + } + if (ncol(x) == 0) { + print(str(x)) + stop("matrix has no columns") + } + if ( is.numeric(x) ) { + # exclude any rows with zero variance + row.vars <- MatVar(x, dim = 1) + nonzero.row.vars <- row.vars > 0 + nonzero.rows <- row.vars[nonzero.row.vars] + if ( length(rownames(x)) != length(rownames(nonzero.rows)) ) { + row.names <- attr(nonzero.rows,"names") + x <- x[ row.names, , drop = FALSE ] + } + + # exclude any columns with zero variance + column.vars <- MatVar(x, dim = 2) + nonzero.column.vars <- column.vars > 0 + nonzero.columns <- column.vars[nonzero.column.vars] + if ( length(colnames(x)) != length(colnames(nonzero.columns)) ) { + column.names <- attr(nonzero.columns,"names") + x <- x[ , column.names, drop = FALSE ] + } + } + return (x) + } + + if (is.null(x.matrix)) { + stop("FATAL ERROR - prepare.data.matrix was called with null x.matrix") + } + + en$xpre <- x <- x.matrix + + # exclude any samples as indicated + if ( !is.null(exclude.features) ) { + my.colnames <- colnames(x) + my.col.diff <- setdiff(my.colnames, exclude.features) + x <- x[ , my.col.diff , drop = FALSE ] + } + + # exclude any features as indicated + if ( !is.null(exclude.samples) ) { + my.rownames <- rownames(x) + my.row.diff <- setdiff(my.rownames, exclude.samples) + x <- x[ my.row.diff, , drop = FALSE ] + } + + # rename rows if desired + if ( !is.null(sample.rename.function) ) { + renamed <- sample.rename.function(x) + rownames(x) <- renamed + } + + # save redacted x.datamatrix to environment + en$redacted.data.matrix <- x + + # impute values missing from the x.datamatrix + if ( !is.null(data.imputation) ) { + x <- data.imputation(x) + } + + # perform transformation if desired + if ( !is.null(data.transformation) ) { + x <- data.transformation(x) + } else { + x <- x + } + + # purge rows and columns that have zero variance + if ( is.numeric(x) ) { + x <- nonzero.var(x) + } + + # save imputed, transformed x.datamatrix to environment + en$imputed.transformed.data.matrix <- x + + return(x) +} + + +##----------------------------------------------- +## helper functions for error detection/reporting +##----------------------------------------------- + +# log-printing to stderr +log_print <- function(x, ...) { + cat( + format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z") + , " " + , c(x, ...) + , "\n" + , sep="" + , file=stderr() + ) +} + +# tryCatchFunc produces a list +# On success of expr(), tryCatchFunc produces +# list(success TRUE, value = expr(), msg = "") +# On failure of expr(), tryCatchFunc produces +# list(success = FALSE, value = NA, msg = "the error message") +tryCatchFunc <- function(expr) { + # format error for logging + format_error <- function(e) { + paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ") + } + my_expr <- expr + retval <- NULL + tryCatch( + expr = { + retval <- ( list( success = TRUE, value = my_expr(), msg = "" ) ) + } + , error = function(e) { + retval <<- list( success = FALSE, value = NA, msg = format_error(e) ) + } + ) + return (retval) +} + +# tryCatchProc produces a list +# On success of expr(), tryCatchProc produces +# list(success TRUE, msg = "") +# On failure of expr(), tryCatchProc produces +# list(success = FALSE, msg = "the error message") +tryCatchProc <- function(expr) { + # format error for logging + format_error <- function(e) { + paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ") + } + retval <- NULL + tryCatch( + expr = { + expr() + retval <- ( list( success = TRUE, msg = "" ) ) + } + , error = function(e) { + retval <<- list( success = FALSE, msg = format_error(e) ) + } + ) + return (retval) +} +