Mercurial > repos > eschen42 > w4mcorcov
comparison w4mcorcov_calc.R @ 0:23f9fad4edfc draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit bd26542b811de06c1a877337a2840a9f899c2b94
author | eschen42 |
---|---|
date | Mon, 16 Oct 2017 14:56:52 -0400 |
parents | |
children | 0c2ad44b6c9c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:23f9fad4edfc |
---|---|
1 # center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/ | |
2 center_colmeans <- function(x) { | |
3 xcenter = colMeans(x) | |
4 x - rep(xcenter, rep.int(nrow(x), ncol(x))) | |
5 } | |
6 | |
7 #### OPLS-DA | |
8 algoC <- "nipals" | |
9 | |
10 do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) { | |
11 off <- function(x) if (x_show_labels) x else 0 | |
12 salience_lookup <- x_env$salience_lookup | |
13 salient_rcv_lookup <- x_env$salient_rcv_lookup | |
14 # x_progress("head(salience_df): ", head(salience_df)) | |
15 # x_progress("head(salience): ", head(salience)) | |
16 if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) { | |
17 my_oplsda <- opls( | |
18 x = x_dataMatrix | |
19 , y = x_predictor | |
20 , algoC = x_algorithm | |
21 , predI = 1 | |
22 , orthoI = if (ncol(x_dataMatrix) > 1) 1 else 0 | |
23 , printL = FALSE | |
24 , plotL = FALSE | |
25 ) | |
26 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) | |
27 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] | |
28 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] | |
29 my_cor_vs_cov <- cor_vs_cov( | |
30 matrix_x = x_dataMatrix | |
31 , ropls_x = my_oplsda | |
32 ) | |
33 with( | |
34 my_cor_vs_cov | |
35 , { | |
36 min_x <- min(covariance) | |
37 max_x <- max(covariance) | |
38 lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs)) | |
39 covariance <- covariance / lim_x | |
40 lim_x <- 1.2 | |
41 main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2) | |
42 # print("main_label") | |
43 # print(main_label) | |
44 main_cex <- min(1.0, 46.0/nchar(main_label)) | |
45 # "It is generally accepted that a variable should be selected if vj>1, [27–29], | |
46 # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." | |
47 # (Mehmood 2012 doi:10.1186/1748-7188-6-27) | |
48 vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) | |
49 alpha <- 0.1 + 0.4 * vipco | |
50 red <- as.numeric(correlation < 0) * vipco | |
51 blue <- as.numeric(correlation > 0) * vipco | |
52 minus_cor <- -correlation | |
53 minus_cov <- -covariance | |
54 # cex <- salience_lookup(feature = names(minus_cor)) | |
55 # cex <- 0.25 + (1.25 * cex / max(cex)) | |
56 cex <- 0.75 | |
57 plot( | |
58 y = minus_cor | |
59 , x = minus_cov | |
60 , type="p" | |
61 , xlim=c(-lim_x, lim_x + off(0.1)) | |
62 , ylim=c(-1.0 - off(0.1), 1.0) | |
63 , xlab = sprintf("relative covariance(feature,t1)") | |
64 , ylab = sprintf("correlation(feature,t1)") | |
65 , main = main_label | |
66 , cex.main = main_cex | |
67 , cex = cex | |
68 , pch = 16 | |
69 , col = rgb(blue = blue, red = red, green = 0, alpha = alpha) | |
70 ) | |
71 low_x <- -0.7 * lim_x | |
72 high_x <- 0.7 * lim_x | |
73 text(x = low_x, y = -0.15, labels = fctr_lvl_1) | |
74 text(x = high_x, y = 0.15, labels = fctr_lvl_2) | |
75 if (x_show_labels) { | |
76 text( | |
77 y = minus_cor - 0.013 | |
78 , x = minus_cov + 0.020 | |
79 , cex = 0.3 | |
80 , labels = tsv1$featureID | |
81 , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha) | |
82 , srt = -30 # slant 30 degrees downward | |
83 , adj = 0 # left-justified | |
84 ) | |
85 } | |
86 } | |
87 ) | |
88 typeVc <- c("correlation", # 1 | |
89 "outlier", # 2 | |
90 "overview", # 3 | |
91 "permutation", # 4 | |
92 "predict-train", # 5 | |
93 "predict-test", # 6 | |
94 "summary", # 7 = c(2,3,4,9) | |
95 "x-loading", # 8 | |
96 "x-score", # 9 | |
97 "x-variance", # 10 | |
98 "xy-score", # 11 | |
99 "xy-weight" # 12 | |
100 ) # [c(3,8,9)] # [c(4,3,8,9)] | |
101 if ( length(my_oplsda@orthoVipVn) > 0 ) { | |
102 my_typevc <- typeVc[c(9,3,8)] | |
103 } else { | |
104 my_typevc <- c("(dummy)","overview","(dummy)") | |
105 } | |
106 for (my_type in my_typevc) { | |
107 if (my_type %in% typeVc) { | |
108 # print(sprintf("plotting type %s", my_type)) | |
109 plot( | |
110 x = my_oplsda | |
111 , typeVc = my_type | |
112 , parCexN = 0.4 | |
113 , parDevNewL = FALSE | |
114 , parLayL = TRUE | |
115 , parEllipsesL = TRUE | |
116 ) | |
117 } else { | |
118 # print("plotting dummy graph") | |
119 plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") | |
120 text(x=1, y=1, labels="no orthogonal projection is possible") | |
121 } | |
122 } | |
123 return (my_cor_vs_cov) | |
124 } else { | |
125 # x_progress(sprintf("x_is_match = %s, ncol(x_dataMatrix) = %d, length(unique(x_predictor)) = %d",x_is_match, ncol(x_dataMatrix), length(unique(x_predictor)))) | |
126 return (NULL) | |
127 } | |
128 } | |
129 | |
130 # S-PLOT and OPLS reference: Wiklund_2008 doi:10.1021/ac0713510 | |
131 corcov_calc <- function(calc_env, failure_action = stop, progress_action = function(x){}, corcov_tsv_action = function(t){}, salience_tsv_action = function(t){}) { | |
132 if ( ! is.environment(calc_env) ) { | |
133 failure_action("corcov_calc: fatal error - 'calc_env' is not an environment") | |
134 return ( FALSE ) | |
135 } | |
136 if ( ! is.function(corcov_tsv_action) ) { | |
137 failure_action("corcov_calc: fatal error - 'corcov_tsv_action' is not a function") | |
138 return ( FALSE ) | |
139 } | |
140 if ( ! is.function(salience_tsv_action) ) { | |
141 failure_action("salience_calc: fatal error - 'salience_tsv_action' is not a function") | |
142 return ( FALSE ) | |
143 } | |
144 | |
145 # extract parameters from the environment | |
146 vrbl_metadata <- calc_env$vrbl_metadata | |
147 vrbl_metadata_names <- vrbl_metadata[,1] | |
148 smpl_metadata <- calc_env$smpl_metadata | |
149 data_matrix <- calc_env$data_matrix | |
150 pairSigFeatOnly <- calc_env$pairSigFeatOnly | |
151 facC <- calc_env$facC | |
152 tesC <- calc_env$tesC | |
153 # extract the levels from the environment | |
154 originalLevCSV <- levCSV <- calc_env$levCSV | |
155 # matchingC is one of { "none", "wildcard", "regex" } | |
156 matchingC <- calc_env$matchingC | |
157 labelFeatures <- calc_env$labelFeatures | |
158 | |
159 # arg/env checking | |
160 if (!(facC %in% names(smpl_metadata))) { | |
161 failure_action(sprintf("bad parameter! Factor name '%s' not found in sampleMetadata", facC)) | |
162 return ( FALSE ) | |
163 } | |
164 | |
165 # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) | |
166 salience_df <- calc_env$salience_df <- w4msalience( | |
167 data_matrix = data_matrix | |
168 , sample_class = smpl_metadata[,facC] | |
169 , failure_action = failure_action | |
170 ) | |
171 salience_tsv_action({ | |
172 my_df <- data.frame( | |
173 featureID = salience_df$feature | |
174 , salientLevel = salience_df$max_level | |
175 , salientRCV = salience_df$salient_rcv | |
176 , salience = salience_df$salience | |
177 ) | |
178 my_df[order(-my_df$salience),] | |
179 }) | |
180 salience <- salience_df$salience | |
181 names(salience) <- salience_df$feature | |
182 salience_lookup <- calc_env$salience_lookup <- function(feature) unname(salience[feature]) | |
183 salient_rcv <- salience_df$salient_rcv | |
184 names(salient_rcv) <- salience_df$feature | |
185 salient_rcv_lookup <- calc_env$salient_rcv_lookup <- function(feature) unname(salient_rcv[feature]) | |
186 salient_level <- salience_df$max_level | |
187 names(salient_level) <- salience_df$feature | |
188 salient_level_lookup <- calc_env$salient_level_lookup <- function(feature) unname(salient_level[feature]) | |
189 | |
190 # transform wildcards to regexen | |
191 if (matchingC == "wildcard") { | |
192 # strsplit(x = "hello,wild,world", split = ",") | |
193 levCSV <- gsub("[.]", "[.]", levCSV) | |
194 levCSV <- strsplit(x = levCSV, split = ",") | |
195 levCSV <- sapply(levCSV, utils::glob2rx, trim.tail = FALSE) | |
196 levCSV <- paste(levCSV, collapse = ",") | |
197 } | |
198 # function to determine whether level is a member of levCSV | |
199 isLevelSelected <- function(lvl) { | |
200 matchFun <- if (matchingC != "none") grepl else `==` | |
201 return( | |
202 Reduce( | |
203 f = "||" | |
204 , x = sapply( | |
205 X = strsplit( | |
206 x = levCSV | |
207 , split = "," | |
208 , fixed = TRUE | |
209 )[[1]] | |
210 , FUN = matchFun | |
211 , lvl | |
212 ) | |
213 ) | |
214 ) | |
215 } | |
216 | |
217 # transpose matrix because ropls matrix is the transpose of XCMS matrix | |
218 # Wiklund_2008 centers and pareto-scales data before OPLS-DA S-plot | |
219 # center | |
220 cdm <- center_colmeans(t(data_matrix)) | |
221 # pareto-scale | |
222 my_scale <- sqrt(apply(cdm, 2, sd, na.rm=TRUE)) | |
223 scdm <- sweep(cdm, 2, my_scale, "/") | |
224 | |
225 # pattern to match column names like k10_kruskal_k4.k3_sig | |
226 col_pattern <- sprintf('^%s_%s_(.*)[.](.*)_sig$', facC, tesC) | |
227 # column name like k10_kruskal_sig | |
228 intersample_sig_col <- sprintf('%s_%s_sig', facC, tesC) | |
229 # get the facC column from sampleMetadata, dropping to one dimension | |
230 smpl_metadata_facC <- smpl_metadata[,facC] | |
231 | |
232 # allocate a slot in the environment for the contrast_list, each element of which will be a data.frame with this structure: | |
233 # - feature ID | |
234 # - value1 | |
235 # - value2 | |
236 # - Wiklund_2008 correlation | |
237 # - Wiklund_2008 covariance | |
238 # - Wiklund_2008 VIP | |
239 calc_env$contrast_list <- list() | |
240 | |
241 | |
242 did_plot <- FALSE | |
243 if (tesC != "none") { | |
244 # for each column name, extract the parts of the name matched by 'col_pattern', if any | |
245 the_colnames <- colnames(vrbl_metadata) | |
246 if ( ! Reduce( f = "||", x = grepl(tesC, the_colnames) ) ) { | |
247 failure_action(sprintf("bad parameter! variableMetadata must contain results of W4M Univariate test '%s'.", tesC)) | |
248 return ( FALSE ) | |
249 } | |
250 col_matches <- lapply( | |
251 X = the_colnames, | |
252 FUN = function(x) { | |
253 regmatches( x, regexec(col_pattern, x) )[[1]] | |
254 } | |
255 ) | |
256 ## first contrast each selected level with all other levels combined into one "super-level" ## | |
257 # process columns matching the pattern | |
258 level_union <- c() | |
259 for (i in 1:length(col_matches)) { | |
260 col_match <- col_matches[[i]] | |
261 if (length(col_match) > 0) { | |
262 # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig | |
263 vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name | |
264 fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 | |
265 fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 | |
266 # only process this column if both factors are members of lvlCSV | |
267 is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) | |
268 if (is_match) { | |
269 level_union <- c(level_union, col_match[2], col_match[3]) | |
270 } | |
271 } | |
272 } | |
273 level_union <- unique(sort(level_union)) | |
274 overall_significant <- 1 == vrbl_metadata[,intersample_sig_col] | |
275 if ( length(level_union) > 1 ) { | |
276 chosen_samples <- smpl_metadata_facC %in% level_union | |
277 chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) | |
278 col_selector <- vrbl_metadata_names[ overall_significant ] | |
279 my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] | |
280 fctr_lvl_2 <- "other" | |
281 for ( fctr_lvl_1 in level_union[1:length(level_union)] ) { | |
282 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) | |
283 predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) | |
284 my_cor_cov <- do_detail_plot( | |
285 x_dataMatrix = my_matrix | |
286 , x_predictor = predictor | |
287 , x_is_match = is_match | |
288 , x_algorithm = algoC | |
289 , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" | |
290 , x_show_labels = labelFeatures | |
291 , x_progress = progress_action | |
292 , x_env = calc_env | |
293 ) | |
294 if ( is.null(my_cor_cov) ) { | |
295 progress_action("NOTHING TO PLOT.") | |
296 } else { | |
297 tsv <- my_cor_cov$tsv1 | |
298 # tsv$salientLevel <- salient_level_lookup(tsv$featureID) | |
299 # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) | |
300 # tsv$salience <- salience_lookup(tsv$featureID) | |
301 tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] | |
302 corcov_tsv_action(tsv) | |
303 did_plot <- TRUE | |
304 } | |
305 } | |
306 } | |
307 | |
308 ## next, contrast each selected level with each of the other levels individually ## | |
309 # process columns matching the pattern | |
310 for (i in 1:length(col_matches)) { | |
311 # for each potential match of the pattern | |
312 col_match <- col_matches[[i]] | |
313 if (length(col_match) > 0) { | |
314 # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig | |
315 vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name | |
316 fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 | |
317 fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 | |
318 # only process this column if both factors are members of lvlCSV | |
319 is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) | |
320 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) | |
321 # TODO delete next line displaying currently-processed column | |
322 # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match)) | |
323 # choose only samples with one of the two factors for this column | |
324 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) | |
325 predictor <- smpl_metadata_facC[chosen_samples] | |
326 # extract only the significantly-varying features and the chosen samples | |
327 fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * vrbl_metadata[,intersample_sig_col] | |
328 col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ] | |
329 my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] | |
330 my_cor_cov <- do_detail_plot( | |
331 x_dataMatrix = my_matrix | |
332 , x_predictor = predictor | |
333 , x_is_match = is_match | |
334 , x_algorithm = algoC | |
335 , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" | |
336 , x_show_labels = labelFeatures | |
337 , x_progress = progress_action | |
338 , x_env = calc_env | |
339 ) | |
340 if ( is.null(my_cor_cov) ) { | |
341 progress_action("NOTHING TO PLOT.") | |
342 } else { | |
343 tsv <- my_cor_cov$tsv1 | |
344 # tsv$salientLevel <- salient_level_lookup(tsv$featureID) | |
345 # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) | |
346 # tsv$salience <- salience_lookup(tsv$featureID) | |
347 tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] | |
348 corcov_tsv_action(tsv) | |
349 did_plot <- TRUE | |
350 } | |
351 } | |
352 } | |
353 } else { # tesC == "none" | |
354 level_union <- unique(sort(smpl_metadata_facC)) | |
355 if ( length(level_union) > 1 ) { | |
356 ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ## | |
357 completed <- c() | |
358 lapply( | |
359 X = level_union | |
360 , FUN = function(x) { | |
361 fctr_lvl_1 <- x[1] | |
362 fctr_lvl_2 <- { | |
363 if ( fctr_lvl_1 %in% completed ) | |
364 return("DUMMY") | |
365 # strF(completed) | |
366 completed <<- c(completed, fctr_lvl_1) | |
367 setdiff(level_union, fctr_lvl_1) | |
368 } | |
369 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) | |
370 fctr_lvl_2 <- "other" | |
371 # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) | |
372 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) | |
373 if (length(unique(chosen_samples)) < 1) { | |
374 progress_action("NOTHING TO PLOT...") | |
375 } else { | |
376 chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) | |
377 predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) | |
378 my_matrix <- scdm[ chosen_samples, , drop = FALSE ] | |
379 # only process this column if both factors are members of lvlCSV | |
380 is_match <- isLevelSelected(fctr_lvl_1) | |
381 my_cor_cov <- do_detail_plot( | |
382 x_dataMatrix = my_matrix | |
383 , x_predictor = predictor | |
384 , x_is_match = is_match | |
385 , x_algorithm = algoC | |
386 , x_prefix = "Features" | |
387 , x_show_labels = labelFeatures | |
388 , x_progress = progress_action | |
389 , x_env = calc_env | |
390 ) | |
391 if ( is.null(my_cor_cov) ) { | |
392 progress_action("NOTHING TO PLOT") | |
393 } else { | |
394 tsv <- my_cor_cov$tsv1 | |
395 # tsv$salientLevel <- salient_level_lookup(tsv$featureID) | |
396 # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) | |
397 # tsv$salience <- salience_lookup(tsv$featureID) | |
398 corcov_tsv_action(tsv) | |
399 did_plot <<- TRUE | |
400 } | |
401 } | |
402 #print("baz") | |
403 "dummy" # need to return a value; otherwise combn fails with an error | |
404 } | |
405 ) | |
406 ## pass 2 - contrast each selected level with each of the other levels individually ## | |
407 completed <- c() | |
408 utils::combn( | |
409 x = level_union | |
410 , m = 2 | |
411 , FUN = function(x) { | |
412 fctr_lvl_1 <- x[1] | |
413 fctr_lvl_2 <- x[2] | |
414 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) | |
415 # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) | |
416 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) | |
417 if (length(unique(chosen_samples)) < 1) { | |
418 progress_action("NOTHING TO PLOT...") | |
419 } else { | |
420 chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) | |
421 predictor <- chosen_facC | |
422 my_matrix <- scdm[ chosen_samples, , drop = FALSE ] | |
423 # only process this column if both factors are members of lvlCSV | |
424 is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) | |
425 my_cor_cov <- do_detail_plot( | |
426 x_dataMatrix = my_matrix | |
427 , x_predictor = predictor | |
428 , x_is_match = is_match | |
429 , x_algorithm = algoC | |
430 , x_prefix = "Features" | |
431 , x_show_labels = labelFeatures | |
432 , x_progress = progress_action | |
433 , x_env = calc_env | |
434 ) | |
435 if ( is.null(my_cor_cov) ) { | |
436 progress_action("NOTHING TO PLOT") | |
437 } else { | |
438 tsv <- my_cor_cov$tsv1 | |
439 # tsv$salientLevel <- salient_level_lookup(tsv$featureID) | |
440 # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) | |
441 # tsv$salience <- salience_lookup(tsv$featureID) | |
442 corcov_tsv_action(tsv) | |
443 did_plot <<- TRUE | |
444 } | |
445 } | |
446 #print("baz") | |
447 "dummy" # need to return a value; otherwise combn fails with an error | |
448 } | |
449 ) | |
450 } else { | |
451 progress_action("NOTHING TO PLOT....") | |
452 } | |
453 } | |
454 if (!did_plot) { | |
455 failure_action(sprintf("bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'", facC, originalLevCSV)) | |
456 return ( FALSE ) | |
457 } | |
458 return ( TRUE ) | |
459 } | |
460 | |
461 # Calculate data for correlation-versus-covariance plot | |
462 # Adapted from: | |
463 # Wiklund_2008 doi:10.1021/ac0713510 | |
464 # Galindo_Prieto_2014 doi:10.1002/cem.2627 | |
465 # https://github.com/HegemanLab/extra_tools/blob/master/generic_PCA.R | |
466 cor_vs_cov <- function(matrix_x, ropls_x) { | |
467 x_class <- class(ropls_x) | |
468 if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) ) | |
469 stop( "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class ", as.character(x_class) ) | |
470 } | |
471 result <- list() | |
472 # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS | |
473 if ( ropls_x@suppLs$algoC == "nipals") { | |
474 # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510 | |
475 mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) | |
476 mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x])) | |
477 score_matrix <- ropls_x@scoreMN | |
478 score_matrix_transposed <- t(score_matrix) | |
479 score_matrix_magnitude <- mag(score_matrix) | |
480 result$covariance <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) | |
481 result$correlation <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi ) | |
482 } else { | |
483 # WARNING - untested code - I don't have test data to exercise this branch | |
484 # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 | |
485 # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F | |
486 score_matrix <- ropls_x@scoreMN | |
487 score_matrix_transposed <- t(score_matrix) | |
488 cov_divisor <- nrow(matrix_x) - 1 | |
489 result$covariance <- sapply( | |
490 X = 1:ncol(matrix_x) | |
491 , FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor | |
492 ) | |
493 score_sd <- sapply( | |
494 X = 1:ncol(score_matrix) | |
495 , FUN = function(x) sd(score_matrix[,x]) | |
496 ) | |
497 # xSdVn - Numerical vector: variable standard deviations of the 'x' matrix | |
498 xSdVn <- ropls_x@xSdVn | |
499 result$correlation <- sapply( | |
500 X = 1:ncol(matrix_x) | |
501 , FUN = function(x) { | |
502 ( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) ) | |
503 } | |
504 ) | |
505 } | |
506 result$correlation <- result$correlation[1,,drop = TRUE] | |
507 result$covariance <- result$covariance[1,,drop = TRUE] | |
508 | |
509 # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014 | |
510 # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) | |
511 result$vip4p <- as.numeric(ropls_x@vipVn) | |
512 result$vip4o <- as.numeric(ropls_x@orthoVipVn) | |
513 # get the level names | |
514 level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) | |
515 feature_count <- length(ropls_x@vipVn) | |
516 result$level1 <- rep.int(x = level_names[1], times = feature_count) | |
517 result$level2 <- rep.int(x = level_names[2], times = feature_count) | |
518 # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation))) | |
519 superresult <- list() | |
520 if (length(result$vip4o) == 0) result$vip4o <- NA | |
521 superresult$tsv1 <- data.frame( | |
522 featureID = names(ropls_x@vipVn) | |
523 , factorLevel1 = result$level1 | |
524 , factorLevel2 = result$level2 | |
525 , correlation = result$correlation | |
526 , covariance = result$covariance | |
527 , vip4p = result$vip4p | |
528 , vip4o = result$vip4o | |
529 , row.names = NULL | |
530 ) | |
531 rownames(superresult$tsv1) <- superresult$tsv1$featureID | |
532 superresult$covariance <- result$covariance | |
533 superresult$correlation <- result$correlation | |
534 superresult$vip4p <- result$vip4p | |
535 superresult$vip4o <- result$vip4o | |
536 superresult$details <- result | |
537 # #print(superresult$tsv1) | |
538 result$superresult <- superresult | |
539 # Include thise in case future consumers of this routine want to use it in currently unanticipated ways | |
540 result$oplsda <- ropls_x | |
541 result$predictor <- ropls_x@suppLs$y # in case future consumers of this routine want to use it in currently unanticipated ways | |
542 return (superresult) | |
543 } | |
544 | |
545 |