dimsp: dimsp/fs_filter.R comparison

comparison dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip

Uploaded

author	metaboflow_cam
date	Tue, 22 Oct 2019 04:06:06 -0400
parents
children

comparison

equal deleted inserted replaced

-:b8b3148de013
+:a8155d5840e9
+#' wl-06-11-2018, Tue: filter functions for DIMSP
+#' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to
+#'   "#'" for 'lintr'.
+suppressPackageStartupMessages({
+library(reshape)
+library(lattice)
+library(impute)
+library(pcaMethods)
+})
+#' =========================================================================
+#' wl-02-06-2011: Relative standard deviation of matrix/data frame in
+#' column wise
+rsd <- function(x) {
+mn <- colMeans(x, na.rm = TRUE)
+std <- apply(x, 2, sd, na.rm = TRUE)
+#' std <- sd(x,na.rm=TRUE)           #' sd(<data.frame>) is deprecated.
+res <- 100 * std / mn
+return(res)
+}
+#' =======================================================================
+#' wl-11-12-2007: Statistics and plot for missing values
+mv.stats <- function(dat, grp = NULL, ...) {
+#' overall missing values rate
+mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat))
+#' MV stats function for vector
+vec.func <-
+function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3)
+#' vec.func  <- function(x)  sum(is.na(x)|is.nan(x)) #' number of MV
+#' sum(is.na(x)|is.nan(x)|(x==0))
+#' get number of Na, NaN and zero in each of feature/variable
+#' mv.rep <- apply(dat, 1, vec.func)
+mv.var <- apply(dat, 2, vec.func)
+ret <- list(mv.overall = mv.all, mv.var = mv.var)
+#' -------------------------------------------------------------------
+if (!is.null(grp)) {
+#' MV rate with respect of variables and class info
+mv.grp <- sapply(levels(grp), function(y) {
+idx <- (grp == y)
+mat <- dat[idx, ]
+mv <- apply(mat, 2, vec.func)
+})
+#' --------------------------------------------------------------------
+#' wl-10-10-2011: Use aggregate. Beware that values passed in the
+#' function is vector(columns).
+#' mv.grp <- aggregate(dat, list(cls), vec.func)
+#' rownames(mv.grp) <- mv.grp[,1]
+#' mv.grp <- mv.grp[,-1]
+#' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F)
+#' reshape matrix for lattice
+mv.grp.1 <- data.frame(mv.grp)
+mv.grp.1$all <- mv.var #' Combine all
+var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1))
+mv.grp.1 <- stack(mv.grp.1)
+mv.grp.1$ind <- factor(mv.grp.1$ind,
+levels = unique.default(mv.grp.1$ind)
+)
+mv.grp.1$var <- var
+mv.grp.plot <-
+xyplot(values ~ var | ind,
+data = mv.grp.1, groups = ind, as.table = T,
+layout = c(1, nlevels(mv.grp.1$ind)), type = "l",
+auto.key = list(space = "right"),
+#' main="Missing Values Percentage With Respect of Variables",
+xlab = "Index of variables", ylab = "Percentage of missing values",
+...
+)
+#' --------------------------------------------------------------------
+ret$mv.grp <- mv.grp
+ret$mv.grp.plot <- mv.grp.plot
+}
+ret
+}
+#' =========================================================================
+#' wl-11-10-2011: replace zero/negative with NA.
+mv.zene <- function(dat) {
+vec.func <- function(x) {
+x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse
+}
+dat <- as.data.frame(dat, stringsAsFactors = F)
+res <- sapply(dat, function(i) vec.func(i))
+return(res)
+}
+#' ========================================================================
+#' wl-06-11-2018, Tue: feature filter index based on missing values
+#' Arguments:
+#'   x: a data frame where columns are features
+#'   thres_mv: threshold of missing values. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a logical vector of index for keeping features
+.mv_filter <- function(x, thres_mv = 0.30) {
+res <- mv.stats(x)
+idx <- res$mv.var < thres_mv
+return(idx)
+}
+#' ========================================================================
+#' wl-06-11-2018, Tue: feature filter index based on RSD
+#' Arguments:
+#'   x: a data frame where columns are features
+#'   thres_rsd: threshold of RSD. Features less than this threshold will be
+#'              kept.
+#' Return:
+#'   a logical vector of index for keeping features
+.rsd_filter <- function(x, thres_rsd = 20) {
+res <- rsd(x)
+idx <- res < thres_rsd
+idx[is.na(idx)] <- FALSE
+#' some stats
+if (F) {
+summary(res)
+tmp <- hist(res, plot = F)$counts
+hist(res,
+xlab = "rsd", ylab = "Counts", col = "lightblue",
+ylim = c(0, max(tmp) + 10)
+)
+}
+return(idx)
+}
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on missing values of samples
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   thres_mv: threshold of missing values on sample. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+mv_filter <- function(data, thres_mv = 0.30) {
+idx <- .mv_filter(data$sample, thres_mv = thres_mv)
+data <- lapply(data, function(x) {
+x <- x[, idx]
+})
+return(data)
+}
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on QC's RSD
+#' wl-14-11-2018, Wed: add flag to missing value filtering
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   thres_rsd: threshold of RSD on QC. Features less than this
+#'             threshold will be kept.
+#'   f_mv: a flag indicating whether or not to performance missing value
+#'         filtering.
+#'   f_mv_qc: a flag for filtering using percentage of missing values on "qc"
+#'            or "sample". TRUE is for "qc".
+#'   thres_mv: threshold of missing values. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE,
+f_mv_qc = FALSE, thres_mv = 0.30) {
+#' ----------------------------------------------------------------------
+#' 1) filtering based on missing values: sample or qc.
+if (f_mv) {
+if (f_mv_qc) {
+mat <- data$qc
+} else {
+mat <- data$sample
+}
+idx <- .mv_filter(mat, thres_mv = thres_mv)
+data <- lapply(data, function(x) {
+x <- x[, idx]
+})
+}
+#' ----------------------------------------------------------------------
+#' 2) filtering based rsd of "qc"
+idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd)
+data <- lapply(data, function(x) {
+x <- x[, idx]
+})
+return(data)
+}
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on blank
+#' wl-14-11-2018, Wed: change order of missing value filtering
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   method: method for stats. Support "mean", "median" and "max"
+#'   factor: multiplier for blank stats
+#'   f_mv: a flag indicating whether or not to performance missing value
+#'         filtering.
+#'   thres_mv: threshold of missing values on QC. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+blank_filter <- function(data, method = c("mean", "median", "max"),
+factor = 1, f_mv = TRUE, thres_mv = 0.30) {
+method <- match.arg(method)
+#' ----------------------------------------------------------------------
+#' 1) filtering based on missing values of "sample".
+if (f_mv) {
+idx <- .mv_filter(data$sample, thres_mv = thres_mv)
+data <- lapply(data, function(x) {
+x <- x[, idx]
+})
+}
+#' ----------------------------------------------------------------------
+#' 2) filtering based on characteristics of blank intensities: mean, median
+#'    or max
+stats.blank <- apply(data$blank, 2, method, na.rm = TRUE)
+stats.blank <- factor * stats.blank
+stats.sample <- apply(data$sample, 2, method, na.rm = TRUE)
+#' keep features with sample stats are larger than blank
+idx <- stats.sample >= stats.blank
+idx[is.na(idx)] <- FALSE
+#' Also keep features whose values are NA in blank
+idx.1 <- is.na(stats.blank)
+#' take union
+idx <- idx | idx.1
+data <- lapply(data, function(x) {
+x <- x[, idx]
+})
+return(data)
+}
+#' =======================================================================
+#' wl-19-11-2018, Mon: wrapper function for distribution of data stats such
+#' as rsd and percentage of missing values.
+#' Arguments:
+#'   x: an vector list including "sample", "qc" and "blank"
+#'   main: plot title
+#' Return:
+#'   a list of lattice plot objects including histogram and boxplot
+dist_plot <- function(x, main = "") {
+x <- melt(x)
+p.hist <- histogram(~ value | L1,
+data = x,
+type = "count", #' type="density",
+nint = 100,
+as.table = T, layout = c(1, 3), main = main,
+scales = list(cex = .75, relation = "free")
+)
+#' ---------------------------------------------------------------------
+#' histogram with density (problem with missing values)
+#' ---------------------------------------------------------------------
+#' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50,
+#'                as.table=T,  layout = c(1,3), main=main,
+#'                scales=list(cex =.75,relation="free"),
+#'                panel = function(x, subscripts, ...) {
+#'                  panel.histogram(x, ...)
+#'                  panel.mathdensity(dnorm, col = 'red', ...)
+#'                  panel.densityplot(x, plot.points = FALSE, col = 'navy',...)
+#'                } )
+#' boxplot
+p.box <- bwplot(value ~ L1, data = x, main = main)
+return(list(p.hist = p.hist, p.box = p.box))
+}
+#' ========================================================================
+#' lwc-23-04-2010: Fill the zero/NA values by univariate.
+mv.fill <- function(dat, method = "mean", ze_ne = FALSE) {
+method <-
+if (is.function(method)) {
+method
+} else if (is.character(method)) {
+get(method)
+} else {
+eval(method)
+}
+vec.func <- function(x) {
+if (ze_ne) {
+x <- ifelse(x < .Machine$double.eps, NA, x)
+#' vectorisation of ifelse
+}
+m <- method(x, na.rm = TRUE)
+x[is.na(x)] <- m
+x
+}
+dat <- as.data.frame(dat, stringsAsFactors = F)
+res <- sapply(dat, function(i) vec.func(i))
+return(res)
+}
+#' =======================================================================
+#' wl-20-11-2018, Tue: Missing value imputation
+mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) {
+method <- match.arg(method)
+if (method == "knn") {
+x <- t(impute::impute.knn(t(x))$data)
+#' x <- suppressWarnings(t(impute.knn(t(x))$data))
+} else if (method == "pca") {
+x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs
+x[x < 0] <- min(x[x > 0]) #' in case negative value
+} else {
+x <- mv.fill(x, method = method)
+}
+x <- as.data.frame(x)
+}

Mercurial > repos > metaboflow_cam > dimsp

comparison dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip