Mercurial > repos > metaboflow_cam > dimsp
comparison dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip
Uploaded
author | metaboflow_cam |
---|---|
date | Tue, 22 Oct 2019 04:06:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:b8b3148de013 | 2:a8155d5840e9 |
---|---|
1 #' wl-06-11-2018, Tue: filter functions for DIMSP | |
2 #' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to | |
3 #' "#'" for 'lintr'. | |
4 | |
5 suppressPackageStartupMessages({ | |
6 library(reshape) | |
7 library(lattice) | |
8 library(impute) | |
9 library(pcaMethods) | |
10 }) | |
11 | |
12 #' ========================================================================= | |
13 #' wl-02-06-2011: Relative standard deviation of matrix/data frame in | |
14 #' column wise | |
15 rsd <- function(x) { | |
16 mn <- colMeans(x, na.rm = TRUE) | |
17 std <- apply(x, 2, sd, na.rm = TRUE) | |
18 #' std <- sd(x,na.rm=TRUE) #' sd(<data.frame>) is deprecated. | |
19 res <- 100 * std / mn | |
20 return(res) | |
21 } | |
22 | |
23 #' ======================================================================= | |
24 #' wl-11-12-2007: Statistics and plot for missing values | |
25 mv.stats <- function(dat, grp = NULL, ...) { | |
26 #' overall missing values rate | |
27 mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat)) | |
28 | |
29 #' MV stats function for vector | |
30 vec.func <- | |
31 function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3) | |
32 #' vec.func <- function(x) sum(is.na(x)|is.nan(x)) #' number of MV | |
33 #' sum(is.na(x)|is.nan(x)|(x==0)) | |
34 | |
35 #' get number of Na, NaN and zero in each of feature/variable | |
36 #' mv.rep <- apply(dat, 1, vec.func) | |
37 mv.var <- apply(dat, 2, vec.func) | |
38 | |
39 ret <- list(mv.overall = mv.all, mv.var = mv.var) | |
40 | |
41 #' ------------------------------------------------------------------- | |
42 if (!is.null(grp)) { | |
43 #' MV rate with respect of variables and class info | |
44 mv.grp <- sapply(levels(grp), function(y) { | |
45 idx <- (grp == y) | |
46 mat <- dat[idx, ] | |
47 mv <- apply(mat, 2, vec.func) | |
48 }) | |
49 #' -------------------------------------------------------------------- | |
50 #' wl-10-10-2011: Use aggregate. Beware that values passed in the | |
51 #' function is vector(columns). | |
52 #' mv.grp <- aggregate(dat, list(cls), vec.func) | |
53 #' rownames(mv.grp) <- mv.grp[,1] | |
54 #' mv.grp <- mv.grp[,-1] | |
55 #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F) | |
56 | |
57 #' reshape matrix for lattice | |
58 mv.grp.1 <- data.frame(mv.grp) | |
59 mv.grp.1$all <- mv.var #' Combine all | |
60 | |
61 var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1)) | |
62 mv.grp.1 <- stack(mv.grp.1) | |
63 mv.grp.1$ind <- factor(mv.grp.1$ind, | |
64 levels = unique.default(mv.grp.1$ind) | |
65 ) | |
66 mv.grp.1$var <- var | |
67 | |
68 mv.grp.plot <- | |
69 xyplot(values ~ var | ind, | |
70 data = mv.grp.1, groups = ind, as.table = T, | |
71 layout = c(1, nlevels(mv.grp.1$ind)), type = "l", | |
72 auto.key = list(space = "right"), | |
73 #' main="Missing Values Percentage With Respect of Variables", | |
74 xlab = "Index of variables", ylab = "Percentage of missing values", | |
75 ... | |
76 ) | |
77 | |
78 #' -------------------------------------------------------------------- | |
79 ret$mv.grp <- mv.grp | |
80 ret$mv.grp.plot <- mv.grp.plot | |
81 } | |
82 | |
83 ret | |
84 } | |
85 | |
86 #' ========================================================================= | |
87 #' wl-11-10-2011: replace zero/negative with NA. | |
88 mv.zene <- function(dat) { | |
89 vec.func <- function(x) { | |
90 x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse | |
91 } | |
92 | |
93 dat <- as.data.frame(dat, stringsAsFactors = F) | |
94 res <- sapply(dat, function(i) vec.func(i)) | |
95 return(res) | |
96 } | |
97 | |
98 #' ======================================================================== | |
99 #' wl-06-11-2018, Tue: feature filter index based on missing values | |
100 #' Arguments: | |
101 #' x: a data frame where columns are features | |
102 #' thres_mv: threshold of missing values. Features less than this | |
103 #' threshold will be kept. | |
104 #' Return: | |
105 #' a logical vector of index for keeping features | |
106 .mv_filter <- function(x, thres_mv = 0.30) { | |
107 res <- mv.stats(x) | |
108 idx <- res$mv.var < thres_mv | |
109 return(idx) | |
110 } | |
111 | |
112 #' ======================================================================== | |
113 #' wl-06-11-2018, Tue: feature filter index based on RSD | |
114 #' Arguments: | |
115 #' x: a data frame where columns are features | |
116 #' thres_rsd: threshold of RSD. Features less than this threshold will be | |
117 #' kept. | |
118 #' Return: | |
119 #' a logical vector of index for keeping features | |
120 .rsd_filter <- function(x, thres_rsd = 20) { | |
121 res <- rsd(x) | |
122 idx <- res < thres_rsd | |
123 idx[is.na(idx)] <- FALSE | |
124 #' some stats | |
125 if (F) { | |
126 summary(res) | |
127 tmp <- hist(res, plot = F)$counts | |
128 hist(res, | |
129 xlab = "rsd", ylab = "Counts", col = "lightblue", | |
130 ylim = c(0, max(tmp) + 10) | |
131 ) | |
132 } | |
133 return(idx) | |
134 } | |
135 | |
136 #' ======================================================================== | |
137 #' wl-06-11-2018, Tue: Feature filtering based on missing values of samples | |
138 #' Arguments: | |
139 #' data: a data matrix list including "sample", "qc" and "blank" | |
140 #' thres_mv: threshold of missing values on sample. Features less than this | |
141 #' threshold will be kept. | |
142 #' Return: | |
143 #' a filtered data list including "sample", "qc" and "blank" | |
144 mv_filter <- function(data, thres_mv = 0.30) { | |
145 idx <- .mv_filter(data$sample, thres_mv = thres_mv) | |
146 data <- lapply(data, function(x) { | |
147 x <- x[, idx] | |
148 }) | |
149 | |
150 return(data) | |
151 } | |
152 | |
153 #' ======================================================================== | |
154 #' wl-06-11-2018, Tue: Feature filtering based on QC's RSD | |
155 #' wl-14-11-2018, Wed: add flag to missing value filtering | |
156 #' Arguments: | |
157 #' data: a data matrix list including "sample", "qc" and "blank" | |
158 #' thres_rsd: threshold of RSD on QC. Features less than this | |
159 #' threshold will be kept. | |
160 #' f_mv: a flag indicating whether or not to performance missing value | |
161 #' filtering. | |
162 #' f_mv_qc: a flag for filtering using percentage of missing values on "qc" | |
163 #' or "sample". TRUE is for "qc". | |
164 #' thres_mv: threshold of missing values. Features less than this | |
165 #' threshold will be kept. | |
166 #' Return: | |
167 #' a filtered data list including "sample", "qc" and "blank" | |
168 qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE, | |
169 f_mv_qc = FALSE, thres_mv = 0.30) { | |
170 #' ---------------------------------------------------------------------- | |
171 #' 1) filtering based on missing values: sample or qc. | |
172 if (f_mv) { | |
173 if (f_mv_qc) { | |
174 mat <- data$qc | |
175 } else { | |
176 mat <- data$sample | |
177 } | |
178 idx <- .mv_filter(mat, thres_mv = thres_mv) | |
179 data <- lapply(data, function(x) { | |
180 x <- x[, idx] | |
181 }) | |
182 } | |
183 | |
184 #' ---------------------------------------------------------------------- | |
185 #' 2) filtering based rsd of "qc" | |
186 idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd) | |
187 data <- lapply(data, function(x) { | |
188 x <- x[, idx] | |
189 }) | |
190 | |
191 return(data) | |
192 } | |
193 | |
194 #' ======================================================================== | |
195 #' wl-06-11-2018, Tue: Feature filtering based on blank | |
196 #' wl-14-11-2018, Wed: change order of missing value filtering | |
197 #' Arguments: | |
198 #' data: a data matrix list including "sample", "qc" and "blank" | |
199 #' method: method for stats. Support "mean", "median" and "max" | |
200 #' factor: multiplier for blank stats | |
201 #' f_mv: a flag indicating whether or not to performance missing value | |
202 #' filtering. | |
203 #' thres_mv: threshold of missing values on QC. Features less than this | |
204 #' threshold will be kept. | |
205 #' Return: | |
206 #' a filtered data list including "sample", "qc" and "blank" | |
207 blank_filter <- function(data, method = c("mean", "median", "max"), | |
208 factor = 1, f_mv = TRUE, thres_mv = 0.30) { | |
209 method <- match.arg(method) | |
210 #' ---------------------------------------------------------------------- | |
211 #' 1) filtering based on missing values of "sample". | |
212 if (f_mv) { | |
213 idx <- .mv_filter(data$sample, thres_mv = thres_mv) | |
214 data <- lapply(data, function(x) { | |
215 x <- x[, idx] | |
216 }) | |
217 } | |
218 | |
219 #' ---------------------------------------------------------------------- | |
220 #' 2) filtering based on characteristics of blank intensities: mean, median | |
221 #' or max | |
222 | |
223 stats.blank <- apply(data$blank, 2, method, na.rm = TRUE) | |
224 stats.blank <- factor * stats.blank | |
225 stats.sample <- apply(data$sample, 2, method, na.rm = TRUE) | |
226 | |
227 #' keep features with sample stats are larger than blank | |
228 idx <- stats.sample >= stats.blank | |
229 idx[is.na(idx)] <- FALSE | |
230 #' Also keep features whose values are NA in blank | |
231 idx.1 <- is.na(stats.blank) | |
232 #' take union | |
233 idx <- idx | idx.1 | |
234 | |
235 data <- lapply(data, function(x) { | |
236 x <- x[, idx] | |
237 }) | |
238 | |
239 return(data) | |
240 } | |
241 | |
242 #' ======================================================================= | |
243 #' wl-19-11-2018, Mon: wrapper function for distribution of data stats such | |
244 #' as rsd and percentage of missing values. | |
245 #' Arguments: | |
246 #' x: an vector list including "sample", "qc" and "blank" | |
247 #' main: plot title | |
248 #' Return: | |
249 #' a list of lattice plot objects including histogram and boxplot | |
250 dist_plot <- function(x, main = "") { | |
251 x <- melt(x) | |
252 p.hist <- histogram(~ value | L1, | |
253 data = x, | |
254 type = "count", #' type="density", | |
255 nint = 100, | |
256 as.table = T, layout = c(1, 3), main = main, | |
257 scales = list(cex = .75, relation = "free") | |
258 ) | |
259 | |
260 #' --------------------------------------------------------------------- | |
261 #' histogram with density (problem with missing values) | |
262 #' --------------------------------------------------------------------- | |
263 #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50, | |
264 #' as.table=T, layout = c(1,3), main=main, | |
265 #' scales=list(cex =.75,relation="free"), | |
266 #' panel = function(x, subscripts, ...) { | |
267 #' panel.histogram(x, ...) | |
268 #' panel.mathdensity(dnorm, col = 'red', ...) | |
269 #' panel.densityplot(x, plot.points = FALSE, col = 'navy',...) | |
270 #' } ) | |
271 | |
272 #' boxplot | |
273 p.box <- bwplot(value ~ L1, data = x, main = main) | |
274 | |
275 return(list(p.hist = p.hist, p.box = p.box)) | |
276 } | |
277 | |
278 #' ======================================================================== | |
279 #' lwc-23-04-2010: Fill the zero/NA values by univariate. | |
280 mv.fill <- function(dat, method = "mean", ze_ne = FALSE) { | |
281 method <- | |
282 if (is.function(method)) { | |
283 method | |
284 } else if (is.character(method)) { | |
285 get(method) | |
286 } else { | |
287 eval(method) | |
288 } | |
289 | |
290 vec.func <- function(x) { | |
291 if (ze_ne) { | |
292 x <- ifelse(x < .Machine$double.eps, NA, x) | |
293 #' vectorisation of ifelse | |
294 } | |
295 m <- method(x, na.rm = TRUE) | |
296 | |
297 x[is.na(x)] <- m | |
298 x | |
299 } | |
300 | |
301 dat <- as.data.frame(dat, stringsAsFactors = F) | |
302 res <- sapply(dat, function(i) vec.func(i)) | |
303 return(res) | |
304 } | |
305 | |
306 #' ======================================================================= | |
307 #' wl-20-11-2018, Tue: Missing value imputation | |
308 mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) { | |
309 method <- match.arg(method) | |
310 if (method == "knn") { | |
311 x <- t(impute::impute.knn(t(x))$data) | |
312 #' x <- suppressWarnings(t(impute.knn(t(x))$data)) | |
313 } else if (method == "pca") { | |
314 x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs | |
315 x[x < 0] <- min(x[x > 0]) #' in case negative value | |
316 } else { | |
317 x <- mv.fill(x, method = method) | |
318 } | |
319 x <- as.data.frame(x) | |
320 } |