diff dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip

Uploaded
author metaboflow_cam
date Tue, 22 Oct 2019 04:06:06 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dimsp/fs_filter.R	Tue Oct 22 04:06:06 2019 -0400
@@ -0,0 +1,320 @@
+#' wl-06-11-2018, Tue: filter functions for DIMSP
+#' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to
+#'   "#'" for 'lintr'.
+
+suppressPackageStartupMessages({
+  library(reshape)
+  library(lattice)
+  library(impute)
+  library(pcaMethods)
+})
+
+#' =========================================================================
+#' wl-02-06-2011: Relative standard deviation of matrix/data frame in
+#' column wise
+rsd <- function(x) {
+  mn <- colMeans(x, na.rm = TRUE)
+  std <- apply(x, 2, sd, na.rm = TRUE)
+  #' std <- sd(x,na.rm=TRUE)           #' sd(<data.frame>) is deprecated.
+  res <- 100 * std / mn
+  return(res)
+}
+
+#' =======================================================================
+#' wl-11-12-2007: Statistics and plot for missing values
+mv.stats <- function(dat, grp = NULL, ...) {
+  #' overall missing values rate
+  mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat))
+
+  #' MV stats function for vector
+  vec.func <-
+    function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3)
+  #' vec.func  <- function(x)  sum(is.na(x)|is.nan(x)) #' number of MV
+  #' sum(is.na(x)|is.nan(x)|(x==0))
+
+  #' get number of Na, NaN and zero in each of feature/variable
+  #' mv.rep <- apply(dat, 1, vec.func)
+  mv.var <- apply(dat, 2, vec.func)
+
+  ret <- list(mv.overall = mv.all, mv.var = mv.var)
+
+  #' -------------------------------------------------------------------
+  if (!is.null(grp)) {
+    #' MV rate with respect of variables and class info
+    mv.grp <- sapply(levels(grp), function(y) {
+      idx <- (grp == y)
+      mat <- dat[idx, ]
+      mv <- apply(mat, 2, vec.func)
+    })
+    #' --------------------------------------------------------------------
+    #' wl-10-10-2011: Use aggregate. Beware that values passed in the
+    #' function is vector(columns).
+    #' mv.grp <- aggregate(dat, list(cls), vec.func)
+    #' rownames(mv.grp) <- mv.grp[,1]
+    #' mv.grp <- mv.grp[,-1]
+    #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F)
+
+    #' reshape matrix for lattice
+    mv.grp.1 <- data.frame(mv.grp)
+    mv.grp.1$all <- mv.var #' Combine all
+
+    var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1))
+    mv.grp.1 <- stack(mv.grp.1)
+    mv.grp.1$ind <- factor(mv.grp.1$ind,
+      levels = unique.default(mv.grp.1$ind)
+    )
+    mv.grp.1$var <- var
+
+    mv.grp.plot <-
+      xyplot(values ~ var | ind,
+        data = mv.grp.1, groups = ind, as.table = T,
+        layout = c(1, nlevels(mv.grp.1$ind)), type = "l",
+        auto.key = list(space = "right"),
+        #' main="Missing Values Percentage With Respect of Variables",
+        xlab = "Index of variables", ylab = "Percentage of missing values",
+        ...
+      )
+
+    #' --------------------------------------------------------------------
+    ret$mv.grp <- mv.grp
+    ret$mv.grp.plot <- mv.grp.plot
+  }
+
+  ret
+}
+
+#' =========================================================================
+#' wl-11-10-2011: replace zero/negative with NA.
+mv.zene <- function(dat) {
+  vec.func <- function(x) {
+    x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse
+  }
+
+  dat <- as.data.frame(dat, stringsAsFactors = F)
+  res <- sapply(dat, function(i) vec.func(i))
+  return(res)
+}
+
+#' ========================================================================
+#' wl-06-11-2018, Tue: feature filter index based on missing values
+#' Arguments:
+#'   x: a data frame where columns are features
+#'   thres_mv: threshold of missing values. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a logical vector of index for keeping features
+.mv_filter <- function(x, thres_mv = 0.30) {
+  res <- mv.stats(x)
+  idx <- res$mv.var < thres_mv
+  return(idx)
+}
+
+#' ========================================================================
+#' wl-06-11-2018, Tue: feature filter index based on RSD
+#' Arguments:
+#'   x: a data frame where columns are features
+#'   thres_rsd: threshold of RSD. Features less than this threshold will be
+#'              kept.
+#' Return:
+#'   a logical vector of index for keeping features
+.rsd_filter <- function(x, thres_rsd = 20) {
+  res <- rsd(x)
+  idx <- res < thres_rsd
+  idx[is.na(idx)] <- FALSE
+  #' some stats
+  if (F) {
+    summary(res)
+    tmp <- hist(res, plot = F)$counts
+    hist(res,
+      xlab = "rsd", ylab = "Counts", col = "lightblue",
+      ylim = c(0, max(tmp) + 10)
+    )
+  }
+  return(idx)
+}
+
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on missing values of samples
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   thres_mv: threshold of missing values on sample. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+mv_filter <- function(data, thres_mv = 0.30) {
+  idx <- .mv_filter(data$sample, thres_mv = thres_mv)
+  data <- lapply(data, function(x) {
+    x <- x[, idx]
+  })
+
+  return(data)
+}
+
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on QC's RSD
+#' wl-14-11-2018, Wed: add flag to missing value filtering
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   thres_rsd: threshold of RSD on QC. Features less than this
+#'             threshold will be kept.
+#'   f_mv: a flag indicating whether or not to performance missing value
+#'         filtering.
+#'   f_mv_qc: a flag for filtering using percentage of missing values on "qc"
+#'            or "sample". TRUE is for "qc".
+#'   thres_mv: threshold of missing values. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE,
+                      f_mv_qc = FALSE, thres_mv = 0.30) {
+  #' ----------------------------------------------------------------------
+  #' 1) filtering based on missing values: sample or qc.
+  if (f_mv) {
+    if (f_mv_qc) {
+      mat <- data$qc
+    } else {
+      mat <- data$sample
+    }
+    idx <- .mv_filter(mat, thres_mv = thres_mv)
+    data <- lapply(data, function(x) {
+      x <- x[, idx]
+    })
+  }
+
+  #' ----------------------------------------------------------------------
+  #' 2) filtering based rsd of "qc"
+  idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd)
+  data <- lapply(data, function(x) {
+    x <- x[, idx]
+  })
+
+  return(data)
+}
+
+#' ========================================================================
+#' wl-06-11-2018, Tue: Feature filtering based on blank
+#' wl-14-11-2018, Wed: change order of missing value filtering
+#' Arguments:
+#'   data: a data matrix list including "sample", "qc" and "blank"
+#'   method: method for stats. Support "mean", "median" and "max"
+#'   factor: multiplier for blank stats
+#'   f_mv: a flag indicating whether or not to performance missing value
+#'         filtering.
+#'   thres_mv: threshold of missing values on QC. Features less than this
+#'             threshold will be kept.
+#' Return:
+#'   a filtered data list including "sample", "qc" and "blank"
+blank_filter <- function(data, method = c("mean", "median", "max"),
+                         factor = 1, f_mv = TRUE, thres_mv = 0.30) {
+  method <- match.arg(method)
+  #' ----------------------------------------------------------------------
+  #' 1) filtering based on missing values of "sample".
+  if (f_mv) {
+    idx <- .mv_filter(data$sample, thres_mv = thres_mv)
+    data <- lapply(data, function(x) {
+      x <- x[, idx]
+    })
+  }
+
+  #' ----------------------------------------------------------------------
+  #' 2) filtering based on characteristics of blank intensities: mean, median
+  #'    or max
+
+  stats.blank <- apply(data$blank, 2, method, na.rm = TRUE)
+  stats.blank <- factor * stats.blank
+  stats.sample <- apply(data$sample, 2, method, na.rm = TRUE)
+
+  #' keep features with sample stats are larger than blank
+  idx <- stats.sample >= stats.blank
+  idx[is.na(idx)] <- FALSE
+  #' Also keep features whose values are NA in blank
+  idx.1 <- is.na(stats.blank)
+  #' take union
+  idx <- idx | idx.1
+
+  data <- lapply(data, function(x) {
+    x <- x[, idx]
+  })
+
+  return(data)
+}
+
+#' =======================================================================
+#' wl-19-11-2018, Mon: wrapper function for distribution of data stats such
+#' as rsd and percentage of missing values.
+#' Arguments:
+#'   x: an vector list including "sample", "qc" and "blank"
+#'   main: plot title
+#' Return:
+#'   a list of lattice plot objects including histogram and boxplot
+dist_plot <- function(x, main = "") {
+  x <- melt(x)
+  p.hist <- histogram(~ value | L1,
+    data = x,
+    type = "count", #' type="density",
+    nint = 100,
+    as.table = T, layout = c(1, 3), main = main,
+    scales = list(cex = .75, relation = "free")
+  )
+
+  #' ---------------------------------------------------------------------
+  #' histogram with density (problem with missing values)
+  #' ---------------------------------------------------------------------
+  #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50,
+  #'                as.table=T,  layout = c(1,3), main=main,
+  #'                scales=list(cex =.75,relation="free"),
+  #'                panel = function(x, subscripts, ...) {
+  #'                  panel.histogram(x, ...)
+  #'                  panel.mathdensity(dnorm, col = 'red', ...)
+  #'                  panel.densityplot(x, plot.points = FALSE, col = 'navy',...)
+  #'                } )
+
+  #' boxplot
+  p.box <- bwplot(value ~ L1, data = x, main = main)
+
+  return(list(p.hist = p.hist, p.box = p.box))
+}
+
+#' ========================================================================
+#' lwc-23-04-2010: Fill the zero/NA values by univariate.
+mv.fill <- function(dat, method = "mean", ze_ne = FALSE) {
+  method <-
+    if (is.function(method)) {
+      method
+    } else if (is.character(method)) {
+      get(method)
+    } else {
+      eval(method)
+    }
+
+  vec.func <- function(x) {
+    if (ze_ne) {
+      x <- ifelse(x < .Machine$double.eps, NA, x)
+      #' vectorisation of ifelse
+    }
+    m <- method(x, na.rm = TRUE)
+
+    x[is.na(x)] <- m
+    x
+  }
+
+  dat <- as.data.frame(dat, stringsAsFactors = F)
+  res <- sapply(dat, function(i) vec.func(i))
+  return(res)
+}
+
+#' =======================================================================
+#' wl-20-11-2018, Tue: Missing value imputation
+mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) {
+  method <- match.arg(method)
+  if (method == "knn") {
+    x <- t(impute::impute.knn(t(x))$data)
+    #' x <- suppressWarnings(t(impute.knn(t(x))$data))
+  } else if (method == "pca") {
+    x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs
+    x[x < 0] <- min(x[x > 0]) #' in case negative value
+  } else {
+    x <- mv.fill(x, method = method)
+  }
+  x <- as.data.frame(x)
+}