annotate dimsp/fs_filter.R @ 2:a8155d5840e9 draft default tip

Uploaded
author metaboflow_cam
date Tue, 22 Oct 2019 04:06:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
1 #' wl-06-11-2018, Tue: filter functions for DIMSP
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
2 #' wl-01-03-2019, Fri: reformat with 'styler' and change comment string to
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
3 #' "#'" for 'lintr'.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
4
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
5 suppressPackageStartupMessages({
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
6 library(reshape)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
7 library(lattice)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
8 library(impute)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
9 library(pcaMethods)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
10 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
11
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
12 #' =========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
13 #' wl-02-06-2011: Relative standard deviation of matrix/data frame in
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
14 #' column wise
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
15 rsd <- function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
16 mn <- colMeans(x, na.rm = TRUE)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
17 std <- apply(x, 2, sd, na.rm = TRUE)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
18 #' std <- sd(x,na.rm=TRUE) #' sd(<data.frame>) is deprecated.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
19 res <- 100 * std / mn
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
20 return(res)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
21 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
22
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
23 #' =======================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
24 #' wl-11-12-2007: Statistics and plot for missing values
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
25 mv.stats <- function(dat, grp = NULL, ...) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
26 #' overall missing values rate
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
27 mv.all <- sum(is.na(as.matrix(dat))) / length(as.matrix(dat))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
28
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
29 #' MV stats function for vector
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
30 vec.func <-
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
31 function(x) round(sum(is.na(x) | is.nan(x)) / length(x), digits = 3)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
32 #' vec.func <- function(x) sum(is.na(x)|is.nan(x)) #' number of MV
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
33 #' sum(is.na(x)|is.nan(x)|(x==0))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
34
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
35 #' get number of Na, NaN and zero in each of feature/variable
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
36 #' mv.rep <- apply(dat, 1, vec.func)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
37 mv.var <- apply(dat, 2, vec.func)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
38
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
39 ret <- list(mv.overall = mv.all, mv.var = mv.var)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
40
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
41 #' -------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
42 if (!is.null(grp)) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
43 #' MV rate with respect of variables and class info
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
44 mv.grp <- sapply(levels(grp), function(y) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
45 idx <- (grp == y)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
46 mat <- dat[idx, ]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
47 mv <- apply(mat, 2, vec.func)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
48 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
49 #' --------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
50 #' wl-10-10-2011: Use aggregate. Beware that values passed in the
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
51 #' function is vector(columns).
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
52 #' mv.grp <- aggregate(dat, list(cls), vec.func)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
53 #' rownames(mv.grp) <- mv.grp[,1]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
54 #' mv.grp <- mv.grp[,-1]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
55 #' mv.grp <- as.data.frame(t(mv.grp),stringsAsFactors=F)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
56
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
57 #' reshape matrix for lattice
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
58 mv.grp.1 <- data.frame(mv.grp)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
59 mv.grp.1$all <- mv.var #' Combine all
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
60
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
61 var <- rep(1:nrow(mv.grp.1), ncol(mv.grp.1))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
62 mv.grp.1 <- stack(mv.grp.1)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
63 mv.grp.1$ind <- factor(mv.grp.1$ind,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
64 levels = unique.default(mv.grp.1$ind)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
65 )
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
66 mv.grp.1$var <- var
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
67
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
68 mv.grp.plot <-
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
69 xyplot(values ~ var | ind,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
70 data = mv.grp.1, groups = ind, as.table = T,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
71 layout = c(1, nlevels(mv.grp.1$ind)), type = "l",
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
72 auto.key = list(space = "right"),
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
73 #' main="Missing Values Percentage With Respect of Variables",
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
74 xlab = "Index of variables", ylab = "Percentage of missing values",
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
75 ...
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
76 )
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
77
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
78 #' --------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
79 ret$mv.grp <- mv.grp
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
80 ret$mv.grp.plot <- mv.grp.plot
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
81 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
82
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
83 ret
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
84 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
85
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
86 #' =========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
87 #' wl-11-10-2011: replace zero/negative with NA.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
88 mv.zene <- function(dat) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
89 vec.func <- function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
90 x <- ifelse(x < .Machine$double.eps, NA, x) #' vectorisation of ifelse
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
91 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
92
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
93 dat <- as.data.frame(dat, stringsAsFactors = F)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
94 res <- sapply(dat, function(i) vec.func(i))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
95 return(res)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
96 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
97
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
98 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
99 #' wl-06-11-2018, Tue: feature filter index based on missing values
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
100 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
101 #' x: a data frame where columns are features
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
102 #' thres_mv: threshold of missing values. Features less than this
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
103 #' threshold will be kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
104 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
105 #' a logical vector of index for keeping features
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
106 .mv_filter <- function(x, thres_mv = 0.30) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
107 res <- mv.stats(x)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
108 idx <- res$mv.var < thres_mv
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
109 return(idx)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
110 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
111
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
112 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
113 #' wl-06-11-2018, Tue: feature filter index based on RSD
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
114 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
115 #' x: a data frame where columns are features
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
116 #' thres_rsd: threshold of RSD. Features less than this threshold will be
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
117 #' kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
118 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
119 #' a logical vector of index for keeping features
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
120 .rsd_filter <- function(x, thres_rsd = 20) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
121 res <- rsd(x)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
122 idx <- res < thres_rsd
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
123 idx[is.na(idx)] <- FALSE
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
124 #' some stats
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
125 if (F) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
126 summary(res)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
127 tmp <- hist(res, plot = F)$counts
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
128 hist(res,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
129 xlab = "rsd", ylab = "Counts", col = "lightblue",
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
130 ylim = c(0, max(tmp) + 10)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
131 )
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
132 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
133 return(idx)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
134 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
135
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
136 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
137 #' wl-06-11-2018, Tue: Feature filtering based on missing values of samples
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
138 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
139 #' data: a data matrix list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
140 #' thres_mv: threshold of missing values on sample. Features less than this
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
141 #' threshold will be kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
142 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
143 #' a filtered data list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
144 mv_filter <- function(data, thres_mv = 0.30) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
145 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
146 data <- lapply(data, function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
147 x <- x[, idx]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
148 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
149
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
150 return(data)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
151 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
152
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
153 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
154 #' wl-06-11-2018, Tue: Feature filtering based on QC's RSD
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
155 #' wl-14-11-2018, Wed: add flag to missing value filtering
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
156 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
157 #' data: a data matrix list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
158 #' thres_rsd: threshold of RSD on QC. Features less than this
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
159 #' threshold will be kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
160 #' f_mv: a flag indicating whether or not to performance missing value
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
161 #' filtering.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
162 #' f_mv_qc: a flag for filtering using percentage of missing values on "qc"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
163 #' or "sample". TRUE is for "qc".
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
164 #' thres_mv: threshold of missing values. Features less than this
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
165 #' threshold will be kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
166 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
167 #' a filtered data list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
168 qc_filter <- function(data, thres_rsd = 20, f_mv = TRUE,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
169 f_mv_qc = FALSE, thres_mv = 0.30) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
170 #' ----------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
171 #' 1) filtering based on missing values: sample or qc.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
172 if (f_mv) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
173 if (f_mv_qc) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
174 mat <- data$qc
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
175 } else {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
176 mat <- data$sample
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
177 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
178 idx <- .mv_filter(mat, thres_mv = thres_mv)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
179 data <- lapply(data, function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
180 x <- x[, idx]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
181 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
182 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
183
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
184 #' ----------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
185 #' 2) filtering based rsd of "qc"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
186 idx <- .rsd_filter(data$qc, thres_rsd = thres_rsd)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
187 data <- lapply(data, function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
188 x <- x[, idx]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
189 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
190
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
191 return(data)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
192 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
193
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
194 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
195 #' wl-06-11-2018, Tue: Feature filtering based on blank
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
196 #' wl-14-11-2018, Wed: change order of missing value filtering
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
197 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
198 #' data: a data matrix list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
199 #' method: method for stats. Support "mean", "median" and "max"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
200 #' factor: multiplier for blank stats
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
201 #' f_mv: a flag indicating whether or not to performance missing value
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
202 #' filtering.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
203 #' thres_mv: threshold of missing values on QC. Features less than this
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
204 #' threshold will be kept.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
205 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
206 #' a filtered data list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
207 blank_filter <- function(data, method = c("mean", "median", "max"),
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
208 factor = 1, f_mv = TRUE, thres_mv = 0.30) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
209 method <- match.arg(method)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
210 #' ----------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
211 #' 1) filtering based on missing values of "sample".
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
212 if (f_mv) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
213 idx <- .mv_filter(data$sample, thres_mv = thres_mv)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
214 data <- lapply(data, function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
215 x <- x[, idx]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
216 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
217 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
218
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
219 #' ----------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
220 #' 2) filtering based on characteristics of blank intensities: mean, median
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
221 #' or max
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
222
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
223 stats.blank <- apply(data$blank, 2, method, na.rm = TRUE)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
224 stats.blank <- factor * stats.blank
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
225 stats.sample <- apply(data$sample, 2, method, na.rm = TRUE)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
226
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
227 #' keep features with sample stats are larger than blank
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
228 idx <- stats.sample >= stats.blank
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
229 idx[is.na(idx)] <- FALSE
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
230 #' Also keep features whose values are NA in blank
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
231 idx.1 <- is.na(stats.blank)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
232 #' take union
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
233 idx <- idx | idx.1
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
234
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
235 data <- lapply(data, function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
236 x <- x[, idx]
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
237 })
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
238
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
239 return(data)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
240 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
241
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
242 #' =======================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
243 #' wl-19-11-2018, Mon: wrapper function for distribution of data stats such
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
244 #' as rsd and percentage of missing values.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
245 #' Arguments:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
246 #' x: an vector list including "sample", "qc" and "blank"
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
247 #' main: plot title
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
248 #' Return:
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
249 #' a list of lattice plot objects including histogram and boxplot
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
250 dist_plot <- function(x, main = "") {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
251 x <- melt(x)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
252 p.hist <- histogram(~ value | L1,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
253 data = x,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
254 type = "count", #' type="density",
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
255 nint = 100,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
256 as.table = T, layout = c(1, 3), main = main,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
257 scales = list(cex = .75, relation = "free")
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
258 )
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
259
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
260 #' ---------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
261 #' histogram with density (problem with missing values)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
262 #' ---------------------------------------------------------------------
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
263 #' p <- histogram(~ value | L1, data = x, type = 'density',nint = 50,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
264 #' as.table=T, layout = c(1,3), main=main,
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
265 #' scales=list(cex =.75,relation="free"),
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
266 #' panel = function(x, subscripts, ...) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
267 #' panel.histogram(x, ...)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
268 #' panel.mathdensity(dnorm, col = 'red', ...)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
269 #' panel.densityplot(x, plot.points = FALSE, col = 'navy',...)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
270 #' } )
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
271
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
272 #' boxplot
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
273 p.box <- bwplot(value ~ L1, data = x, main = main)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
274
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
275 return(list(p.hist = p.hist, p.box = p.box))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
276 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
277
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
278 #' ========================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
279 #' lwc-23-04-2010: Fill the zero/NA values by univariate.
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
280 mv.fill <- function(dat, method = "mean", ze_ne = FALSE) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
281 method <-
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
282 if (is.function(method)) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
283 method
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
284 } else if (is.character(method)) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
285 get(method)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
286 } else {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
287 eval(method)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
288 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
289
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
290 vec.func <- function(x) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
291 if (ze_ne) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
292 x <- ifelse(x < .Machine$double.eps, NA, x)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
293 #' vectorisation of ifelse
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
294 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
295 m <- method(x, na.rm = TRUE)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
296
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
297 x[is.na(x)] <- m
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
298 x
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
299 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
300
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
301 dat <- as.data.frame(dat, stringsAsFactors = F)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
302 res <- sapply(dat, function(i) vec.func(i))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
303 return(res)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
304 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
305
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
306 #' =======================================================================
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
307 #' wl-20-11-2018, Tue: Missing value imputation
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
308 mv.impute <- function(x, method = c("mean", "median", "min", "knn", "pca")) {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
309 method <- match.arg(method)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
310 if (method == "knn") {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
311 x <- t(impute::impute.knn(t(x))$data)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
312 #' x <- suppressWarnings(t(impute.knn(t(x))$data))
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
313 } else if (method == "pca") {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
314 x <- pcaMethods::pca(x, method = "ppca", nPcs = 5)@completeObs
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
315 x[x < 0] <- min(x[x > 0]) #' in case negative value
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
316 } else {
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
317 x <- mv.fill(x, method = method)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
318 }
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
319 x <- as.data.frame(x)
a8155d5840e9 Uploaded
metaboflow_cam
parents:
diff changeset
320 }