Mercurial > repos > metaboflow_cam > dimsp
view dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip
Uploaded
author | metaboflow_cam |
---|---|
date | Tue, 22 Oct 2019 04:06:06 -0400 |
parents | |
children |
line wrap: on
line source
#' wl-06-11-2018, Tue: filter functions for DIMSP #' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to #' "#'" for 'lintr'. suppressPackageStartupMessages({ library(reshape) library(lattice) library(impute) library(pcaMethods) }) #' ========================================================================= #' wl-02-06-2011: Relative standard deviation of matrix/data frame in #' column wise rsd <- function(x) { mn <- colMeans(x, na.rm = TRUE) std <- apply(x, 2, sd, na.rm = TRUE) #' std <- sd(x,na.rm=TRUE) #' sd(<data.frame>) is deprecated. res <- 100 * std / mn return(res) } #' ======================================================================= #' wl-11-12-2007: Statistics and plot for missing values mv.stats <- function(dat, grp = NULL, ...) { #' overall missing values rate mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat)) #' MV stats function for vector vec.func <- function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3) #' vec.func <- function(x) sum(is.na(x)|is.nan(x)) #' number of MV #' sum(is.na(x)|is.nan(x)|(x==0)) #' get number of Na, NaN and zero in each of feature/variable #' mv.rep <- apply(dat, 1, vec.func) mv.var <- apply(dat, 2, vec.func) ret <- list(mv.overall = mv.all, mv.var = mv.var) #' ------------------------------------------------------------------- if (!is.null(grp)) { #' MV rate with respect of variables and class info mv.grp <- sapply(levels(grp), function(y) { idx <- (grp == y) mat <- dat[idx, ] mv <- apply(mat, 2, vec.func) }) #' -------------------------------------------------------------------- #' wl-10-10-2011: Use aggregate. Beware that values passed in the #' function is vector(columns). #' mv.grp <- aggregate(dat, list(cls), vec.func) #' rownames(mv.grp) <- mv.grp[,1] #' mv.grp <- mv.grp[,-1] #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F) #' reshape matrix for lattice mv.grp.1 <- data.frame(mv.grp) mv.grp.1$all <- mv.var #' Combine all var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1)) mv.grp.1 <- stack(mv.grp.1) mv.grp.1$ind <- factor(mv.grp.1$ind, levels = unique.default(mv.grp.1$ind) ) mv.grp.1$var <- var mv.grp.plot <- xyplot(values ~ var | ind, data = mv.grp.1, groups = ind, as.table = T, layout = c(1, nlevels(mv.grp.1$ind)), type = "l", auto.key = list(space = "right"), #' main="Missing Values Percentage With Respect of Variables", xlab = "Index of variables", ylab = "Percentage of missing values", ... ) #' -------------------------------------------------------------------- ret$mv.grp <- mv.grp ret$mv.grp.plot <- mv.grp.plot } ret } #' ========================================================================= #' wl-11-10-2011: replace zero/negative with NA. mv.zene <- function(dat) { vec.func <- function(x) { x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse } dat <- as.data.frame(dat, stringsAsFactors = F) res <- sapply(dat, function(i) vec.func(i)) return(res) } #' ======================================================================== #' wl-06-11-2018, Tue: feature filter index based on missing values #' Arguments: #' x: a data frame where columns are features #' thres_mv: threshold of missing values. Features less than this #' threshold will be kept. #' Return: #' a logical vector of index for keeping features .mv_filter <- function(x, thres_mv = 0.30) { res <- mv.stats(x) idx <- res$mv.var < thres_mv return(idx) } #' ======================================================================== #' wl-06-11-2018, Tue: feature filter index based on RSD #' Arguments: #' x: a data frame where columns are features #' thres_rsd: threshold of RSD. Features less than this threshold will be #' kept. #' Return: #' a logical vector of index for keeping features .rsd_filter <- function(x, thres_rsd = 20) { res <- rsd(x) idx <- res < thres_rsd idx[is.na(idx)] <- FALSE #' some stats if (F) { summary(res) tmp <- hist(res, plot = F)$counts hist(res, xlab = "rsd", ylab = "Counts", col = "lightblue", ylim = c(0, max(tmp) + 10) ) } return(idx) } #' ======================================================================== #' wl-06-11-2018, Tue: Feature filtering based on missing values of samples #' Arguments: #' data: a data matrix list including "sample", "qc" and "blank" #' thres_mv: threshold of missing values on sample. Features less than this #' threshold will be kept. #' Return: #' a filtered data list including "sample", "qc" and "blank" mv_filter <- function(data, thres_mv = 0.30) { idx <- .mv_filter(data$sample, thres_mv = thres_mv) data <- lapply(data, function(x) { x <- x[, idx] }) return(data) } #' ======================================================================== #' wl-06-11-2018, Tue: Feature filtering based on QC's RSD #' wl-14-11-2018, Wed: add flag to missing value filtering #' Arguments: #' data: a data matrix list including "sample", "qc" and "blank" #' thres_rsd: threshold of RSD on QC. Features less than this #' threshold will be kept. #' f_mv: a flag indicating whether or not to performance missing value #' filtering. #' f_mv_qc: a flag for filtering using percentage of missing values on "qc" #' or "sample". TRUE is for "qc". #' thres_mv: threshold of missing values. Features less than this #' threshold will be kept. #' Return: #' a filtered data list including "sample", "qc" and "blank" qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE, f_mv_qc = FALSE, thres_mv = 0.30) { #' ---------------------------------------------------------------------- #' 1) filtering based on missing values: sample or qc. if (f_mv) { if (f_mv_qc) { mat <- data$qc } else { mat <- data$sample } idx <- .mv_filter(mat, thres_mv = thres_mv) data <- lapply(data, function(x) { x <- x[, idx] }) } #' ---------------------------------------------------------------------- #' 2) filtering based rsd of "qc" idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd) data <- lapply(data, function(x) { x <- x[, idx] }) return(data) } #' ======================================================================== #' wl-06-11-2018, Tue: Feature filtering based on blank #' wl-14-11-2018, Wed: change order of missing value filtering #' Arguments: #' data: a data matrix list including "sample", "qc" and "blank" #' method: method for stats. Support "mean", "median" and "max" #' factor: multiplier for blank stats #' f_mv: a flag indicating whether or not to performance missing value #' filtering. #' thres_mv: threshold of missing values on QC. Features less than this #' threshold will be kept. #' Return: #' a filtered data list including "sample", "qc" and "blank" blank_filter <- function(data, method = c("mean", "median", "max"), factor = 1, f_mv = TRUE, thres_mv = 0.30) { method <- match.arg(method) #' ---------------------------------------------------------------------- #' 1) filtering based on missing values of "sample". if (f_mv) { idx <- .mv_filter(data$sample, thres_mv = thres_mv) data <- lapply(data, function(x) { x <- x[, idx] }) } #' ---------------------------------------------------------------------- #' 2) filtering based on characteristics of blank intensities: mean, median #' or max stats.blank <- apply(data$blank, 2, method, na.rm = TRUE) stats.blank <- factor * stats.blank stats.sample <- apply(data$sample, 2, method, na.rm = TRUE) #' keep features with sample stats are larger than blank idx <- stats.sample >= stats.blank idx[is.na(idx)] <- FALSE #' Also keep features whose values are NA in blank idx.1 <- is.na(stats.blank) #' take union idx <- idx | idx.1 data <- lapply(data, function(x) { x <- x[, idx] }) return(data) } #' ======================================================================= #' wl-19-11-2018, Mon: wrapper function for distribution of data stats such #' as rsd and percentage of missing values. #' Arguments: #' x: an vector list including "sample", "qc" and "blank" #' main: plot title #' Return: #' a list of lattice plot objects including histogram and boxplot dist_plot <- function(x, main = "") { x <- melt(x) p.hist <- histogram(~ value | L1, data = x, type = "count", #' type="density", nint = 100, as.table = T, layout = c(1, 3), main = main, scales = list(cex = .75, relation = "free") ) #' --------------------------------------------------------------------- #' histogram with density (problem with missing values) #' --------------------------------------------------------------------- #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50, #' as.table=T, layout = c(1,3), main=main, #' scales=list(cex =.75,relation="free"), #' panel = function(x, subscripts, ...) { #' panel.histogram(x, ...) #' panel.mathdensity(dnorm, col = 'red', ...) #' panel.densityplot(x, plot.points = FALSE, col = 'navy',...) #' } ) #' boxplot p.box <- bwplot(value ~ L1, data = x, main = main) return(list(p.hist = p.hist, p.box = p.box)) } #' ======================================================================== #' lwc-23-04-2010: Fill the zero/NA values by univariate. mv.fill <- function(dat, method = "mean", ze_ne = FALSE) { method <- if (is.function(method)) { method } else if (is.character(method)) { get(method) } else { eval(method) } vec.func <- function(x) { if (ze_ne) { x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse } m <- method(x, na.rm = TRUE) x[is.na(x)] <- m x } dat <- as.data.frame(dat, stringsAsFactors = F) res <- sapply(dat, function(i) vec.func(i)) return(res) } #' ======================================================================= #' wl-20-11-2018, Tue: Missing value imputation mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) { method <- match.arg(method) if (method == "knn") { x <- t(impute::impute.knn(t(x))$data) #' x <- suppressWarnings(t(impute.knn(t(x))$data)) } else if (method == "pca") { x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs x[x < 0] <- min(x[x > 0]) #' in case negative value } else { x <- mv.fill(x, method = method) } x <- as.data.frame(x) }