2
|
1 #' wl-06-11-2018, Tue: filter functions for DIMSP
|
|
2 #' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to
|
|
3 #' "#'" for 'lintr'.
|
|
4
|
|
5 suppressPackageStartupMessages({
|
|
6 library(reshape)
|
|
7 library(lattice)
|
|
8 library(impute)
|
|
9 library(pcaMethods)
|
|
10 })
|
|
11
|
|
12 #' =========================================================================
|
|
13 #' wl-02-06-2011: Relative standard deviation of matrix/data frame in
|
|
14 #' column wise
|
|
15 rsd <- function(x) {
|
|
16 mn <- colMeans(x, na.rm = TRUE)
|
|
17 std <- apply(x, 2, sd, na.rm = TRUE)
|
|
18 #' std <- sd(x,na.rm=TRUE) #' sd(<data.frame>) is deprecated.
|
|
19 res <- 100 * std / mn
|
|
20 return(res)
|
|
21 }
|
|
22
|
|
23 #' =======================================================================
|
|
24 #' wl-11-12-2007: Statistics and plot for missing values
|
|
25 mv.stats <- function(dat, grp = NULL, ...) {
|
|
26 #' overall missing values rate
|
|
27 mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat))
|
|
28
|
|
29 #' MV stats function for vector
|
|
30 vec.func <-
|
|
31 function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3)
|
|
32 #' vec.func <- function(x) sum(is.na(x)|is.nan(x)) #' number of MV
|
|
33 #' sum(is.na(x)|is.nan(x)|(x==0))
|
|
34
|
|
35 #' get number of Na, NaN and zero in each of feature/variable
|
|
36 #' mv.rep <- apply(dat, 1, vec.func)
|
|
37 mv.var <- apply(dat, 2, vec.func)
|
|
38
|
|
39 ret <- list(mv.overall = mv.all, mv.var = mv.var)
|
|
40
|
|
41 #' -------------------------------------------------------------------
|
|
42 if (!is.null(grp)) {
|
|
43 #' MV rate with respect of variables and class info
|
|
44 mv.grp <- sapply(levels(grp), function(y) {
|
|
45 idx <- (grp == y)
|
|
46 mat <- dat[idx, ]
|
|
47 mv <- apply(mat, 2, vec.func)
|
|
48 })
|
|
49 #' --------------------------------------------------------------------
|
|
50 #' wl-10-10-2011: Use aggregate. Beware that values passed in the
|
|
51 #' function is vector(columns).
|
|
52 #' mv.grp <- aggregate(dat, list(cls), vec.func)
|
|
53 #' rownames(mv.grp) <- mv.grp[,1]
|
|
54 #' mv.grp <- mv.grp[,-1]
|
|
55 #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F)
|
|
56
|
|
57 #' reshape matrix for lattice
|
|
58 mv.grp.1 <- data.frame(mv.grp)
|
|
59 mv.grp.1$all <- mv.var #' Combine all
|
|
60
|
|
61 var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1))
|
|
62 mv.grp.1 <- stack(mv.grp.1)
|
|
63 mv.grp.1$ind <- factor(mv.grp.1$ind,
|
|
64 levels = unique.default(mv.grp.1$ind)
|
|
65 )
|
|
66 mv.grp.1$var <- var
|
|
67
|
|
68 mv.grp.plot <-
|
|
69 xyplot(values ~ var | ind,
|
|
70 data = mv.grp.1, groups = ind, as.table = T,
|
|
71 layout = c(1, nlevels(mv.grp.1$ind)), type = "l",
|
|
72 auto.key = list(space = "right"),
|
|
73 #' main="Missing Values Percentage With Respect of Variables",
|
|
74 xlab = "Index of variables", ylab = "Percentage of missing values",
|
|
75 ...
|
|
76 )
|
|
77
|
|
78 #' --------------------------------------------------------------------
|
|
79 ret$mv.grp <- mv.grp
|
|
80 ret$mv.grp.plot <- mv.grp.plot
|
|
81 }
|
|
82
|
|
83 ret
|
|
84 }
|
|
85
|
|
86 #' =========================================================================
|
|
87 #' wl-11-10-2011: replace zero/negative with NA.
|
|
88 mv.zene <- function(dat) {
|
|
89 vec.func <- function(x) {
|
|
90 x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse
|
|
91 }
|
|
92
|
|
93 dat <- as.data.frame(dat, stringsAsFactors = F)
|
|
94 res <- sapply(dat, function(i) vec.func(i))
|
|
95 return(res)
|
|
96 }
|
|
97
|
|
98 #' ========================================================================
|
|
99 #' wl-06-11-2018, Tue: feature filter index based on missing values
|
|
100 #' Arguments:
|
|
101 #' x: a data frame where columns are features
|
|
102 #' thres_mv: threshold of missing values. Features less than this
|
|
103 #' threshold will be kept.
|
|
104 #' Return:
|
|
105 #' a logical vector of index for keeping features
|
|
106 .mv_filter <- function(x, thres_mv = 0.30) {
|
|
107 res <- mv.stats(x)
|
|
108 idx <- res$mv.var < thres_mv
|
|
109 return(idx)
|
|
110 }
|
|
111
|
|
112 #' ========================================================================
|
|
113 #' wl-06-11-2018, Tue: feature filter index based on RSD
|
|
114 #' Arguments:
|
|
115 #' x: a data frame where columns are features
|
|
116 #' thres_rsd: threshold of RSD. Features less than this threshold will be
|
|
117 #' kept.
|
|
118 #' Return:
|
|
119 #' a logical vector of index for keeping features
|
|
120 .rsd_filter <- function(x, thres_rsd = 20) {
|
|
121 res <- rsd(x)
|
|
122 idx <- res < thres_rsd
|
|
123 idx[is.na(idx)] <- FALSE
|
|
124 #' some stats
|
|
125 if (F) {
|
|
126 summary(res)
|
|
127 tmp <- hist(res, plot = F)$counts
|
|
128 hist(res,
|
|
129 xlab = "rsd", ylab = "Counts", col = "lightblue",
|
|
130 ylim = c(0, max(tmp) + 10)
|
|
131 )
|
|
132 }
|
|
133 return(idx)
|
|
134 }
|
|
135
|
|
136 #' ========================================================================
|
|
137 #' wl-06-11-2018, Tue: Feature filtering based on missing values of samples
|
|
138 #' Arguments:
|
|
139 #' data: a data matrix list including "sample", "qc" and "blank"
|
|
140 #' thres_mv: threshold of missing values on sample. Features less than this
|
|
141 #' threshold will be kept.
|
|
142 #' Return:
|
|
143 #' a filtered data list including "sample", "qc" and "blank"
|
|
144 mv_filter <- function(data, thres_mv = 0.30) {
|
|
145 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
|
|
146 data <- lapply(data, function(x) {
|
|
147 x <- x[, idx]
|
|
148 })
|
|
149
|
|
150 return(data)
|
|
151 }
|
|
152
|
|
153 #' ========================================================================
|
|
154 #' wl-06-11-2018, Tue: Feature filtering based on QC's RSD
|
|
155 #' wl-14-11-2018, Wed: add flag to missing value filtering
|
|
156 #' Arguments:
|
|
157 #' data: a data matrix list including "sample", "qc" and "blank"
|
|
158 #' thres_rsd: threshold of RSD on QC. Features less than this
|
|
159 #' threshold will be kept.
|
|
160 #' f_mv: a flag indicating whether or not to performance missing value
|
|
161 #' filtering.
|
|
162 #' f_mv_qc: a flag for filtering using percentage of missing values on "qc"
|
|
163 #' or "sample". TRUE is for "qc".
|
|
164 #' thres_mv: threshold of missing values. Features less than this
|
|
165 #' threshold will be kept.
|
|
166 #' Return:
|
|
167 #' a filtered data list including "sample", "qc" and "blank"
|
|
168 qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE,
|
|
169 f_mv_qc = FALSE, thres_mv = 0.30) {
|
|
170 #' ----------------------------------------------------------------------
|
|
171 #' 1) filtering based on missing values: sample or qc.
|
|
172 if (f_mv) {
|
|
173 if (f_mv_qc) {
|
|
174 mat <- data$qc
|
|
175 } else {
|
|
176 mat <- data$sample
|
|
177 }
|
|
178 idx <- .mv_filter(mat, thres_mv = thres_mv)
|
|
179 data <- lapply(data, function(x) {
|
|
180 x <- x[, idx]
|
|
181 })
|
|
182 }
|
|
183
|
|
184 #' ----------------------------------------------------------------------
|
|
185 #' 2) filtering based rsd of "qc"
|
|
186 idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd)
|
|
187 data <- lapply(data, function(x) {
|
|
188 x <- x[, idx]
|
|
189 })
|
|
190
|
|
191 return(data)
|
|
192 }
|
|
193
|
|
194 #' ========================================================================
|
|
195 #' wl-06-11-2018, Tue: Feature filtering based on blank
|
|
196 #' wl-14-11-2018, Wed: change order of missing value filtering
|
|
197 #' Arguments:
|
|
198 #' data: a data matrix list including "sample", "qc" and "blank"
|
|
199 #' method: method for stats. Support "mean", "median" and "max"
|
|
200 #' factor: multiplier for blank stats
|
|
201 #' f_mv: a flag indicating whether or not to performance missing value
|
|
202 #' filtering.
|
|
203 #' thres_mv: threshold of missing values on QC. Features less than this
|
|
204 #' threshold will be kept.
|
|
205 #' Return:
|
|
206 #' a filtered data list including "sample", "qc" and "blank"
|
|
207 blank_filter <- function(data, method = c("mean", "median", "max"),
|
|
208 factor = 1, f_mv = TRUE, thres_mv = 0.30) {
|
|
209 method <- match.arg(method)
|
|
210 #' ----------------------------------------------------------------------
|
|
211 #' 1) filtering based on missing values of "sample".
|
|
212 if (f_mv) {
|
|
213 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
|
|
214 data <- lapply(data, function(x) {
|
|
215 x <- x[, idx]
|
|
216 })
|
|
217 }
|
|
218
|
|
219 #' ----------------------------------------------------------------------
|
|
220 #' 2) filtering based on characteristics of blank intensities: mean, median
|
|
221 #' or max
|
|
222
|
|
223 stats.blank <- apply(data$blank, 2, method, na.rm = TRUE)
|
|
224 stats.blank <- factor * stats.blank
|
|
225 stats.sample <- apply(data$sample, 2, method, na.rm = TRUE)
|
|
226
|
|
227 #' keep features with sample stats are larger than blank
|
|
228 idx <- stats.sample >= stats.blank
|
|
229 idx[is.na(idx)] <- FALSE
|
|
230 #' Also keep features whose values are NA in blank
|
|
231 idx.1 <- is.na(stats.blank)
|
|
232 #' take union
|
|
233 idx <- idx | idx.1
|
|
234
|
|
235 data <- lapply(data, function(x) {
|
|
236 x <- x[, idx]
|
|
237 })
|
|
238
|
|
239 return(data)
|
|
240 }
|
|
241
|
|
242 #' =======================================================================
|
|
243 #' wl-19-11-2018, Mon: wrapper function for distribution of data stats such
|
|
244 #' as rsd and percentage of missing values.
|
|
245 #' Arguments:
|
|
246 #' x: an vector list including "sample", "qc" and "blank"
|
|
247 #' main: plot title
|
|
248 #' Return:
|
|
249 #' a list of lattice plot objects including histogram and boxplot
|
|
250 dist_plot <- function(x, main = "") {
|
|
251 x <- melt(x)
|
|
252 p.hist <- histogram(~ value | L1,
|
|
253 data = x,
|
|
254 type = "count", #' type="density",
|
|
255 nint = 100,
|
|
256 as.table = T, layout = c(1, 3), main = main,
|
|
257 scales = list(cex = .75, relation = "free")
|
|
258 )
|
|
259
|
|
260 #' ---------------------------------------------------------------------
|
|
261 #' histogram with density (problem with missing values)
|
|
262 #' ---------------------------------------------------------------------
|
|
263 #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50,
|
|
264 #' as.table=T, layout = c(1,3), main=main,
|
|
265 #' scales=list(cex =.75,relation="free"),
|
|
266 #' panel = function(x, subscripts, ...) {
|
|
267 #' panel.histogram(x, ...)
|
|
268 #' panel.mathdensity(dnorm, col = 'red', ...)
|
|
269 #' panel.densityplot(x, plot.points = FALSE, col = 'navy',...)
|
|
270 #' } )
|
|
271
|
|
272 #' boxplot
|
|
273 p.box <- bwplot(value ~ L1, data = x, main = main)
|
|
274
|
|
275 return(list(p.hist = p.hist, p.box = p.box))
|
|
276 }
|
|
277
|
|
278 #' ========================================================================
|
|
279 #' lwc-23-04-2010: Fill the zero/NA values by univariate.
|
|
280 mv.fill <- function(dat, method = "mean", ze_ne = FALSE) {
|
|
281 method <-
|
|
282 if (is.function(method)) {
|
|
283 method
|
|
284 } else if (is.character(method)) {
|
|
285 get(method)
|
|
286 } else {
|
|
287 eval(method)
|
|
288 }
|
|
289
|
|
290 vec.func <- function(x) {
|
|
291 if (ze_ne) {
|
|
292 x <- ifelse(x < .Machine$double.eps, NA, x)
|
|
293 #' vectorisation of ifelse
|
|
294 }
|
|
295 m <- method(x, na.rm = TRUE)
|
|
296
|
|
297 x[is.na(x)] <- m
|
|
298 x
|
|
299 }
|
|
300
|
|
301 dat <- as.data.frame(dat, stringsAsFactors = F)
|
|
302 res <- sapply(dat, function(i) vec.func(i))
|
|
303 return(res)
|
|
304 }
|
|
305
|
|
306 #' =======================================================================
|
|
307 #' wl-20-11-2018, Tue: Missing value imputation
|
|
308 mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) {
|
|
309 method <- match.arg(method)
|
|
310 if (method == "knn") {
|
|
311 x <- t(impute::impute.knn(t(x))$data)
|
|
312 #' x <- suppressWarnings(t(impute.knn(t(x))$data))
|
|
313 } else if (method == "pca") {
|
|
314 x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs
|
|
315 x[x < 0] <- min(x[x > 0]) #' in case negative value
|
|
316 } else {
|
|
317 x <- mv.fill(x, method = method)
|
|
318 }
|
|
319 x <- as.data.frame(x)
|
|
320 }
|