Mercurial > repos > eschen42 > w4mcorcov
diff w4mcorcov_calc.R @ 0:23f9fad4edfc draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit bd26542b811de06c1a877337a2840a9f899c2b94
author | eschen42 |
---|---|
date | Mon, 16 Oct 2017 14:56:52 -0400 |
parents | |
children | 0c2ad44b6c9c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/w4mcorcov_calc.R Mon Oct 16 14:56:52 2017 -0400 @@ -0,0 +1,545 @@ +# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/ +center_colmeans <- function(x) { + xcenter = colMeans(x) + x - rep(xcenter, rep.int(nrow(x), ncol(x))) +} + +#### OPLS-DA +algoC <- "nipals" + +do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) { + off <- function(x) if (x_show_labels) x else 0 + salience_lookup <- x_env$salience_lookup + salient_rcv_lookup <- x_env$salient_rcv_lookup + # x_progress("head(salience_df): ", head(salience_df)) + # x_progress("head(salience): ", head(salience)) + if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) { + my_oplsda <- opls( + x = x_dataMatrix + , y = x_predictor + , algoC = x_algorithm + , predI = 1 + , orthoI = if (ncol(x_dataMatrix) > 1) 1 else 0 + , printL = FALSE + , plotL = FALSE + ) + my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) + fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] + fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] + my_cor_vs_cov <- cor_vs_cov( + matrix_x = x_dataMatrix + , ropls_x = my_oplsda + ) + with( + my_cor_vs_cov + , { + min_x <- min(covariance) + max_x <- max(covariance) + lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs)) + covariance <- covariance / lim_x + lim_x <- 1.2 + main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2) + # print("main_label") + # print(main_label) + main_cex <- min(1.0, 46.0/nchar(main_label)) + # "It is generally accepted that a variable should be selected if vj>1, [27–29], + # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." + # (Mehmood 2012 doi:10.1186/1748-7188-6-27) + vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) + alpha <- 0.1 + 0.4 * vipco + red <- as.numeric(correlation < 0) * vipco + blue <- as.numeric(correlation > 0) * vipco + minus_cor <- -correlation + minus_cov <- -covariance + # cex <- salience_lookup(feature = names(minus_cor)) + # cex <- 0.25 + (1.25 * cex / max(cex)) + cex <- 0.75 + plot( + y = minus_cor + , x = minus_cov + , type="p" + , xlim=c(-lim_x, lim_x + off(0.1)) + , ylim=c(-1.0 - off(0.1), 1.0) + , xlab = sprintf("relative covariance(feature,t1)") + , ylab = sprintf("correlation(feature,t1)") + , main = main_label + , cex.main = main_cex + , cex = cex + , pch = 16 + , col = rgb(blue = blue, red = red, green = 0, alpha = alpha) + ) + low_x <- -0.7 * lim_x + high_x <- 0.7 * lim_x + text(x = low_x, y = -0.15, labels = fctr_lvl_1) + text(x = high_x, y = 0.15, labels = fctr_lvl_2) + if (x_show_labels) { + text( + y = minus_cor - 0.013 + , x = minus_cov + 0.020 + , cex = 0.3 + , labels = tsv1$featureID + , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha) + , srt = -30 # slant 30 degrees downward + , adj = 0 # left-justified + ) + } + } + ) + typeVc <- c("correlation", # 1 + "outlier", # 2 + "overview", # 3 + "permutation", # 4 + "predict-train", # 5 + "predict-test", # 6 + "summary", # 7 = c(2,3,4,9) + "x-loading", # 8 + "x-score", # 9 + "x-variance", # 10 + "xy-score", # 11 + "xy-weight" # 12 + ) # [c(3,8,9)] # [c(4,3,8,9)] + if ( length(my_oplsda@orthoVipVn) > 0 ) { + my_typevc <- typeVc[c(9,3,8)] + } else { + my_typevc <- c("(dummy)","overview","(dummy)") + } + for (my_type in my_typevc) { + if (my_type %in% typeVc) { + # print(sprintf("plotting type %s", my_type)) + plot( + x = my_oplsda + , typeVc = my_type + , parCexN = 0.4 + , parDevNewL = FALSE + , parLayL = TRUE + , parEllipsesL = TRUE + ) + } else { + # print("plotting dummy graph") + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no orthogonal projection is possible") + } + } + return (my_cor_vs_cov) + } else { + # x_progress(sprintf("x_is_match = %s, ncol(x_dataMatrix) = %d, length(unique(x_predictor)) = %d",x_is_match, ncol(x_dataMatrix), length(unique(x_predictor)))) + return (NULL) + } +} + +# S-PLOT and OPLS reference: Wiklund_2008 doi:10.1021/ac0713510 +corcov_calc <- function(calc_env, failure_action = stop, progress_action = function(x){}, corcov_tsv_action = function(t){}, salience_tsv_action = function(t){}) { + if ( ! is.environment(calc_env) ) { + failure_action("corcov_calc: fatal error - 'calc_env' is not an environment") + return ( FALSE ) + } + if ( ! is.function(corcov_tsv_action) ) { + failure_action("corcov_calc: fatal error - 'corcov_tsv_action' is not a function") + return ( FALSE ) + } + if ( ! is.function(salience_tsv_action) ) { + failure_action("salience_calc: fatal error - 'salience_tsv_action' is not a function") + return ( FALSE ) + } + + # extract parameters from the environment + vrbl_metadata <- calc_env$vrbl_metadata + vrbl_metadata_names <- vrbl_metadata[,1] + smpl_metadata <- calc_env$smpl_metadata + data_matrix <- calc_env$data_matrix + pairSigFeatOnly <- calc_env$pairSigFeatOnly + facC <- calc_env$facC + tesC <- calc_env$tesC + # extract the levels from the environment + originalLevCSV <- levCSV <- calc_env$levCSV + # matchingC is one of { "none", "wildcard", "regex" } + matchingC <- calc_env$matchingC + labelFeatures <- calc_env$labelFeatures + + # arg/env checking + if (!(facC %in% names(smpl_metadata))) { + failure_action(sprintf("bad parameter! Factor name '%s' not found in sampleMetadata", facC)) + return ( FALSE ) + } + + # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) + salience_df <- calc_env$salience_df <- w4msalience( + data_matrix = data_matrix + , sample_class = smpl_metadata[,facC] + , failure_action = failure_action + ) + salience_tsv_action({ + my_df <- data.frame( + featureID = salience_df$feature + , salientLevel = salience_df$max_level + , salientRCV = salience_df$salient_rcv + , salience = salience_df$salience + ) + my_df[order(-my_df$salience),] + }) + salience <- salience_df$salience + names(salience) <- salience_df$feature + salience_lookup <- calc_env$salience_lookup <- function(feature) unname(salience[feature]) + salient_rcv <- salience_df$salient_rcv + names(salient_rcv) <- salience_df$feature + salient_rcv_lookup <- calc_env$salient_rcv_lookup <- function(feature) unname(salient_rcv[feature]) + salient_level <- salience_df$max_level + names(salient_level) <- salience_df$feature + salient_level_lookup <- calc_env$salient_level_lookup <- function(feature) unname(salient_level[feature]) + + # transform wildcards to regexen + if (matchingC == "wildcard") { + # strsplit(x = "hello,wild,world", split = ",") + levCSV <- gsub("[.]", "[.]", levCSV) + levCSV <- strsplit(x = levCSV, split = ",") + levCSV <- sapply(levCSV, utils::glob2rx, trim.tail = FALSE) + levCSV <- paste(levCSV, collapse = ",") + } + # function to determine whether level is a member of levCSV + isLevelSelected <- function(lvl) { + matchFun <- if (matchingC != "none") grepl else `==` + return( + Reduce( + f = "||" + , x = sapply( + X = strsplit( + x = levCSV + , split = "," + , fixed = TRUE + )[[1]] + , FUN = matchFun + , lvl + ) + ) + ) + } + + # transpose matrix because ropls matrix is the transpose of XCMS matrix + # Wiklund_2008 centers and pareto-scales data before OPLS-DA S-plot + # center + cdm <- center_colmeans(t(data_matrix)) + # pareto-scale + my_scale <- sqrt(apply(cdm, 2, sd, na.rm=TRUE)) + scdm <- sweep(cdm, 2, my_scale, "/") + + # pattern to match column names like k10_kruskal_k4.k3_sig + col_pattern <- sprintf('^%s_%s_(.*)[.](.*)_sig$', facC, tesC) + # column name like k10_kruskal_sig + intersample_sig_col <- sprintf('%s_%s_sig', facC, tesC) + # get the facC column from sampleMetadata, dropping to one dimension + smpl_metadata_facC <- smpl_metadata[,facC] + + # allocate a slot in the environment for the contrast_list, each element of which will be a data.frame with this structure: + # - feature ID + # - value1 + # - value2 + # - Wiklund_2008 correlation + # - Wiklund_2008 covariance + # - Wiklund_2008 VIP + calc_env$contrast_list <- list() + + + did_plot <- FALSE + if (tesC != "none") { + # for each column name, extract the parts of the name matched by 'col_pattern', if any + the_colnames <- colnames(vrbl_metadata) + if ( ! Reduce( f = "||", x = grepl(tesC, the_colnames) ) ) { + failure_action(sprintf("bad parameter! variableMetadata must contain results of W4M Univariate test '%s'.", tesC)) + return ( FALSE ) + } + col_matches <- lapply( + X = the_colnames, + FUN = function(x) { + regmatches( x, regexec(col_pattern, x) )[[1]] + } + ) + ## first contrast each selected level with all other levels combined into one "super-level" ## + # process columns matching the pattern + level_union <- c() + for (i in 1:length(col_matches)) { + col_match <- col_matches[[i]] + if (length(col_match) > 0) { + # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig + vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name + fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 + fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) + if (is_match) { + level_union <- c(level_union, col_match[2], col_match[3]) + } + } + } + level_union <- unique(sort(level_union)) + overall_significant <- 1 == vrbl_metadata[,intersample_sig_col] + if ( length(level_union) > 1 ) { + chosen_samples <- smpl_metadata_facC %in% level_union + chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) + col_selector <- vrbl_metadata_names[ overall_significant ] + my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] + fctr_lvl_2 <- "other" + for ( fctr_lvl_1 in level_union[1:length(level_union)] ) { + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT.") + } else { + tsv <- my_cor_cov$tsv1 + # tsv$salientLevel <- salient_level_lookup(tsv$featureID) + # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) + # tsv$salience <- salience_lookup(tsv$featureID) + tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] + corcov_tsv_action(tsv) + did_plot <- TRUE + } + } + } + + ## next, contrast each selected level with each of the other levels individually ## + # process columns matching the pattern + for (i in 1:length(col_matches)) { + # for each potential match of the pattern + col_match <- col_matches[[i]] + if (length(col_match) > 0) { + # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig + vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name + fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 + fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + # TODO delete next line displaying currently-processed column + # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match)) + # choose only samples with one of the two factors for this column + chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) + predictor <- smpl_metadata_facC[chosen_samples] + # extract only the significantly-varying features and the chosen samples + fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * vrbl_metadata[,intersample_sig_col] + col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ] + my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT.") + } else { + tsv <- my_cor_cov$tsv1 + # tsv$salientLevel <- salient_level_lookup(tsv$featureID) + # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) + # tsv$salience <- salience_lookup(tsv$featureID) + tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] + corcov_tsv_action(tsv) + did_plot <- TRUE + } + } + } + } else { # tesC == "none" + level_union <- unique(sort(smpl_metadata_facC)) + if ( length(level_union) > 1 ) { + ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ## + completed <- c() + lapply( + X = level_union + , FUN = function(x) { + fctr_lvl_1 <- x[1] + fctr_lvl_2 <- { + if ( fctr_lvl_1 %in% completed ) + return("DUMMY") + # strF(completed) + completed <<- c(completed, fctr_lvl_1) + setdiff(level_union, fctr_lvl_1) + } + chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) + fctr_lvl_2 <- "other" + # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + if (length(unique(chosen_samples)) < 1) { + progress_action("NOTHING TO PLOT...") + } else { + chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) + predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) + my_matrix <- scdm[ chosen_samples, , drop = FALSE ] + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = "Features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT") + } else { + tsv <- my_cor_cov$tsv1 + # tsv$salientLevel <- salient_level_lookup(tsv$featureID) + # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) + # tsv$salience <- salience_lookup(tsv$featureID) + corcov_tsv_action(tsv) + did_plot <<- TRUE + } + } + #print("baz") + "dummy" # need to return a value; otherwise combn fails with an error + } + ) + ## pass 2 - contrast each selected level with each of the other levels individually ## + completed <- c() + utils::combn( + x = level_union + , m = 2 + , FUN = function(x) { + fctr_lvl_1 <- x[1] + fctr_lvl_2 <- x[2] + chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) + # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + if (length(unique(chosen_samples)) < 1) { + progress_action("NOTHING TO PLOT...") + } else { + chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) + predictor <- chosen_facC + my_matrix <- scdm[ chosen_samples, , drop = FALSE ] + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = "Features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT") + } else { + tsv <- my_cor_cov$tsv1 + # tsv$salientLevel <- salient_level_lookup(tsv$featureID) + # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) + # tsv$salience <- salience_lookup(tsv$featureID) + corcov_tsv_action(tsv) + did_plot <<- TRUE + } + } + #print("baz") + "dummy" # need to return a value; otherwise combn fails with an error + } + ) + } else { + progress_action("NOTHING TO PLOT....") + } + } + if (!did_plot) { + failure_action(sprintf("bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'", facC, originalLevCSV)) + return ( FALSE ) + } + return ( TRUE ) +} + +# Calculate data for correlation-versus-covariance plot +# Adapted from: +# Wiklund_2008 doi:10.1021/ac0713510 +# Galindo_Prieto_2014 doi:10.1002/cem.2627 +# https://github.com/HegemanLab/extra_tools/blob/master/generic_PCA.R +cor_vs_cov <- function(matrix_x, ropls_x) { + x_class <- class(ropls_x) + if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) ) + stop( "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class ", as.character(x_class) ) + } + result <- list() + # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS + if ( ropls_x@suppLs$algoC == "nipals") { + # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510 + mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) + mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x])) + score_matrix <- ropls_x@scoreMN + score_matrix_transposed <- t(score_matrix) + score_matrix_magnitude <- mag(score_matrix) + result$covariance <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) + result$correlation <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi ) + } else { + # WARNING - untested code - I don't have test data to exercise this branch + # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 + # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F + score_matrix <- ropls_x@scoreMN + score_matrix_transposed <- t(score_matrix) + cov_divisor <- nrow(matrix_x) - 1 + result$covariance <- sapply( + X = 1:ncol(matrix_x) + , FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor + ) + score_sd <- sapply( + X = 1:ncol(score_matrix) + , FUN = function(x) sd(score_matrix[,x]) + ) + # xSdVn - Numerical vector: variable standard deviations of the 'x' matrix + xSdVn <- ropls_x@xSdVn + result$correlation <- sapply( + X = 1:ncol(matrix_x) + , FUN = function(x) { + ( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) ) + } + ) + } + result$correlation <- result$correlation[1,,drop = TRUE] + result$covariance <- result$covariance[1,,drop = TRUE] + + # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014 + # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) + result$vip4p <- as.numeric(ropls_x@vipVn) + result$vip4o <- as.numeric(ropls_x@orthoVipVn) + # get the level names + level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) + feature_count <- length(ropls_x@vipVn) + result$level1 <- rep.int(x = level_names[1], times = feature_count) + result$level2 <- rep.int(x = level_names[2], times = feature_count) + # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation))) + superresult <- list() + if (length(result$vip4o) == 0) result$vip4o <- NA + superresult$tsv1 <- data.frame( + featureID = names(ropls_x@vipVn) + , factorLevel1 = result$level1 + , factorLevel2 = result$level2 + , correlation = result$correlation + , covariance = result$covariance + , vip4p = result$vip4p + , vip4o = result$vip4o + , row.names = NULL + ) + rownames(superresult$tsv1) <- superresult$tsv1$featureID + superresult$covariance <- result$covariance + superresult$correlation <- result$correlation + superresult$vip4p <- result$vip4p + superresult$vip4o <- result$vip4o + superresult$details <- result + # #print(superresult$tsv1) + result$superresult <- superresult + # Include thise in case future consumers of this routine want to use it in currently unanticipated ways + result$oplsda <- ropls_x + result$predictor <- ropls_x@suppLs$y # in case future consumers of this routine want to use it in currently unanticipated ways + return (superresult) +} + +