comparison dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip

Uploaded
author metaboflow_cam
date Tue, 22 Oct 2019 04:06:06 -0400
parents
children
comparison
equal deleted inserted replaced
1:b8b3148de013 2:a8155d5840e9
1 #' wl-06-11-2018, Tue: filter functions for DIMSP
2 #' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to
3 #' "#'" for 'lintr'.
4
5 suppressPackageStartupMessages({
6 library(reshape)
7 library(lattice)
8 library(impute)
9 library(pcaMethods)
10 })
11
12 #' =========================================================================
13 #' wl-02-06-2011: Relative standard deviation of matrix/data frame in
14 #' column wise
15 rsd <- function(x) {
16 mn <- colMeans(x, na.rm = TRUE)
17 std <- apply(x, 2, sd, na.rm = TRUE)
18 #' std <- sd(x,na.rm=TRUE) #' sd(<data.frame>) is deprecated.
19 res <- 100 * std / mn
20 return(res)
21 }
22
23 #' =======================================================================
24 #' wl-11-12-2007: Statistics and plot for missing values
25 mv.stats <- function(dat, grp = NULL, ...) {
26 #' overall missing values rate
27 mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat))
28
29 #' MV stats function for vector
30 vec.func <-
31 function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3)
32 #' vec.func <- function(x) sum(is.na(x)|is.nan(x)) #' number of MV
33 #' sum(is.na(x)|is.nan(x)|(x==0))
34
35 #' get number of Na, NaN and zero in each of feature/variable
36 #' mv.rep <- apply(dat, 1, vec.func)
37 mv.var <- apply(dat, 2, vec.func)
38
39 ret <- list(mv.overall = mv.all, mv.var = mv.var)
40
41 #' -------------------------------------------------------------------
42 if (!is.null(grp)) {
43 #' MV rate with respect of variables and class info
44 mv.grp <- sapply(levels(grp), function(y) {
45 idx <- (grp == y)
46 mat <- dat[idx, ]
47 mv <- apply(mat, 2, vec.func)
48 })
49 #' --------------------------------------------------------------------
50 #' wl-10-10-2011: Use aggregate. Beware that values passed in the
51 #' function is vector(columns).
52 #' mv.grp <- aggregate(dat, list(cls), vec.func)
53 #' rownames(mv.grp) <- mv.grp[,1]
54 #' mv.grp <- mv.grp[,-1]
55 #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F)
56
57 #' reshape matrix for lattice
58 mv.grp.1 <- data.frame(mv.grp)
59 mv.grp.1$all <- mv.var #' Combine all
60
61 var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1))
62 mv.grp.1 <- stack(mv.grp.1)
63 mv.grp.1$ind <- factor(mv.grp.1$ind,
64 levels = unique.default(mv.grp.1$ind)
65 )
66 mv.grp.1$var <- var
67
68 mv.grp.plot <-
69 xyplot(values ~ var | ind,
70 data = mv.grp.1, groups = ind, as.table = T,
71 layout = c(1, nlevels(mv.grp.1$ind)), type = "l",
72 auto.key = list(space = "right"),
73 #' main="Missing Values Percentage With Respect of Variables",
74 xlab = "Index of variables", ylab = "Percentage of missing values",
75 ...
76 )
77
78 #' --------------------------------------------------------------------
79 ret$mv.grp <- mv.grp
80 ret$mv.grp.plot <- mv.grp.plot
81 }
82
83 ret
84 }
85
86 #' =========================================================================
87 #' wl-11-10-2011: replace zero/negative with NA.
88 mv.zene <- function(dat) {
89 vec.func <- function(x) {
90 x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse
91 }
92
93 dat <- as.data.frame(dat, stringsAsFactors = F)
94 res <- sapply(dat, function(i) vec.func(i))
95 return(res)
96 }
97
98 #' ========================================================================
99 #' wl-06-11-2018, Tue: feature filter index based on missing values
100 #' Arguments:
101 #' x: a data frame where columns are features
102 #' thres_mv: threshold of missing values. Features less than this
103 #' threshold will be kept.
104 #' Return:
105 #' a logical vector of index for keeping features
106 .mv_filter <- function(x, thres_mv = 0.30) {
107 res <- mv.stats(x)
108 idx <- res$mv.var < thres_mv
109 return(idx)
110 }
111
112 #' ========================================================================
113 #' wl-06-11-2018, Tue: feature filter index based on RSD
114 #' Arguments:
115 #' x: a data frame where columns are features
116 #' thres_rsd: threshold of RSD. Features less than this threshold will be
117 #' kept.
118 #' Return:
119 #' a logical vector of index for keeping features
120 .rsd_filter <- function(x, thres_rsd = 20) {
121 res <- rsd(x)
122 idx <- res < thres_rsd
123 idx[is.na(idx)] <- FALSE
124 #' some stats
125 if (F) {
126 summary(res)
127 tmp <- hist(res, plot = F)$counts
128 hist(res,
129 xlab = "rsd", ylab = "Counts", col = "lightblue",
130 ylim = c(0, max(tmp) + 10)
131 )
132 }
133 return(idx)
134 }
135
136 #' ========================================================================
137 #' wl-06-11-2018, Tue: Feature filtering based on missing values of samples
138 #' Arguments:
139 #' data: a data matrix list including "sample", "qc" and "blank"
140 #' thres_mv: threshold of missing values on sample. Features less than this
141 #' threshold will be kept.
142 #' Return:
143 #' a filtered data list including "sample", "qc" and "blank"
144 mv_filter <- function(data, thres_mv = 0.30) {
145 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
146 data <- lapply(data, function(x) {
147 x <- x[, idx]
148 })
149
150 return(data)
151 }
152
153 #' ========================================================================
154 #' wl-06-11-2018, Tue: Feature filtering based on QC's RSD
155 #' wl-14-11-2018, Wed: add flag to missing value filtering
156 #' Arguments:
157 #' data: a data matrix list including "sample", "qc" and "blank"
158 #' thres_rsd: threshold of RSD on QC. Features less than this
159 #' threshold will be kept.
160 #' f_mv: a flag indicating whether or not to performance missing value
161 #' filtering.
162 #' f_mv_qc: a flag for filtering using percentage of missing values on "qc"
163 #' or "sample". TRUE is for "qc".
164 #' thres_mv: threshold of missing values. Features less than this
165 #' threshold will be kept.
166 #' Return:
167 #' a filtered data list including "sample", "qc" and "blank"
168 qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE,
169 f_mv_qc = FALSE, thres_mv = 0.30) {
170 #' ----------------------------------------------------------------------
171 #' 1) filtering based on missing values: sample or qc.
172 if (f_mv) {
173 if (f_mv_qc) {
174 mat <- data$qc
175 } else {
176 mat <- data$sample
177 }
178 idx <- .mv_filter(mat, thres_mv = thres_mv)
179 data <- lapply(data, function(x) {
180 x <- x[, idx]
181 })
182 }
183
184 #' ----------------------------------------------------------------------
185 #' 2) filtering based rsd of "qc"
186 idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd)
187 data <- lapply(data, function(x) {
188 x <- x[, idx]
189 })
190
191 return(data)
192 }
193
194 #' ========================================================================
195 #' wl-06-11-2018, Tue: Feature filtering based on blank
196 #' wl-14-11-2018, Wed: change order of missing value filtering
197 #' Arguments:
198 #' data: a data matrix list including "sample", "qc" and "blank"
199 #' method: method for stats. Support "mean", "median" and "max"
200 #' factor: multiplier for blank stats
201 #' f_mv: a flag indicating whether or not to performance missing value
202 #' filtering.
203 #' thres_mv: threshold of missing values on QC. Features less than this
204 #' threshold will be kept.
205 #' Return:
206 #' a filtered data list including "sample", "qc" and "blank"
207 blank_filter <- function(data, method = c("mean", "median", "max"),
208 factor = 1, f_mv = TRUE, thres_mv = 0.30) {
209 method <- match.arg(method)
210 #' ----------------------------------------------------------------------
211 #' 1) filtering based on missing values of "sample".
212 if (f_mv) {
213 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
214 data <- lapply(data, function(x) {
215 x <- x[, idx]
216 })
217 }
218
219 #' ----------------------------------------------------------------------
220 #' 2) filtering based on characteristics of blank intensities: mean, median
221 #' or max
222
223 stats.blank <- apply(data$blank, 2, method, na.rm = TRUE)
224 stats.blank <- factor * stats.blank
225 stats.sample <- apply(data$sample, 2, method, na.rm = TRUE)
226
227 #' keep features with sample stats are larger than blank
228 idx <- stats.sample >= stats.blank
229 idx[is.na(idx)] <- FALSE
230 #' Also keep features whose values are NA in blank
231 idx.1 <- is.na(stats.blank)
232 #' take union
233 idx <- idx | idx.1
234
235 data <- lapply(data, function(x) {
236 x <- x[, idx]
237 })
238
239 return(data)
240 }
241
242 #' =======================================================================
243 #' wl-19-11-2018, Mon: wrapper function for distribution of data stats such
244 #' as rsd and percentage of missing values.
245 #' Arguments:
246 #' x: an vector list including "sample", "qc" and "blank"
247 #' main: plot title
248 #' Return:
249 #' a list of lattice plot objects including histogram and boxplot
250 dist_plot <- function(x, main = "") {
251 x <- melt(x)
252 p.hist <- histogram(~ value | L1,
253 data = x,
254 type = "count", #' type="density",
255 nint = 100,
256 as.table = T, layout = c(1, 3), main = main,
257 scales = list(cex = .75, relation = "free")
258 )
259
260 #' ---------------------------------------------------------------------
261 #' histogram with density (problem with missing values)
262 #' ---------------------------------------------------------------------
263 #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50,
264 #' as.table=T, layout = c(1,3), main=main,
265 #' scales=list(cex =.75,relation="free"),
266 #' panel = function(x, subscripts, ...) {
267 #' panel.histogram(x, ...)
268 #' panel.mathdensity(dnorm, col = 'red', ...)
269 #' panel.densityplot(x, plot.points = FALSE, col = 'navy',...)
270 #' } )
271
272 #' boxplot
273 p.box <- bwplot(value ~ L1, data = x, main = main)
274
275 return(list(p.hist = p.hist, p.box = p.box))
276 }
277
278 #' ========================================================================
279 #' lwc-23-04-2010: Fill the zero/NA values by univariate.
280 mv.fill <- function(dat, method = "mean", ze_ne = FALSE) {
281 method <-
282 if (is.function(method)) {
283 method
284 } else if (is.character(method)) {
285 get(method)
286 } else {
287 eval(method)
288 }
289
290 vec.func <- function(x) {
291 if (ze_ne) {
292 x <- ifelse(x < .Machine$double.eps, NA, x)
293 #' vectorisation of ifelse
294 }
295 m <- method(x, na.rm = TRUE)
296
297 x[is.na(x)] <- m
298 x
299 }
300
301 dat <- as.data.frame(dat, stringsAsFactors = F)
302 res <- sapply(dat, function(i) vec.func(i))
303 return(res)
304 }
305
306 #' =======================================================================
307 #' wl-20-11-2018, Tue: Missing value imputation
308 mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) {
309 method <- match.arg(method)
310 if (method == "knn") {
311 x <- t(impute::impute.knn(t(x))$data)
312 #' x <- suppressWarnings(t(impute.knn(t(x))$data))
313 } else if (method == "pca") {
314 x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs
315 x[x < 0] <- min(x[x > 0]) #' in case negative value
316 } else {
317 x <- mv.fill(x, method = method)
318 }
319 x <- as.data.frame(x)
320 }