Mercurial > repos > eschen42 > w4mcorcov
view w4mcorcov_calc.R @ 1:0c2ad44b6c9c draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 01d4a951cf09e7b88fcec96b8043bc7568cc5c92
author | eschen42 |
---|---|
date | Sun, 22 Oct 2017 18:47:57 -0400 |
parents | 23f9fad4edfc |
children | e03582f26617 |
line wrap: on
line source
# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/ center_colmeans <- function(x) { xcenter = colMeans(x) x - rep(xcenter, rep.int(nrow(x), ncol(x))) } #### OPLS-DA algoC <- "nipals" do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) { off <- function(x) if (x_show_labels == "0") x else 0 if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) { my_oplsda <- opls( x = x_dataMatrix , y = x_predictor , algoC = x_algorithm , predI = 1 , orthoI = if (ncol(x_dataMatrix) > 1) 1 else 0 , printL = FALSE , plotL = FALSE ) my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] my_cor_vs_cov <- cor_vs_cov( matrix_x = x_dataMatrix , ropls_x = my_oplsda ) with( my_cor_vs_cov , { min_x <- min(covariance) max_x <- max(covariance) lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs)) covariance <- covariance / lim_x lim_x <- 1.2 main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2) main_cex <- min(1.0, 46.0/nchar(main_label)) # "It is generally accepted that a variable should be selected if vj>1, [27–29], # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." # (Mehmood 2012 doi:10.1186/1748-7188-6-27) vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) alpha <- 0.1 + 0.4 * vipco red <- as.numeric(correlation > 0) * vipco blue <- as.numeric(correlation < 0) * vipco plus_cor <- correlation plus_cov <- covariance cex <- 0.75 plot( y = plus_cor , x = plus_cov , type="p" , xlim=c(-lim_x, lim_x + off(0.1)) , ylim=c(-1.0 - off(0.1), 1.0) , xlab = sprintf("relative covariance(feature,t1)") , ylab = sprintf("correlation(feature,t1)") , main = main_label , cex.main = main_cex , cex = cex , pch = 16 , col = rgb(blue = blue, red = red, green = 0, alpha = alpha) ) low_x <- -0.7 * lim_x high_x <- 0.7 * lim_x text(x = low_x, y = -0.05, labels = fctr_lvl_1) text(x = high_x, y = 0.05, labels = fctr_lvl_2) if ( x_show_labels != "0" ) { my_loadp <- loadp my_loado <- loado names(my_loadp) <- tsv1$featureID names(my_loado) <- tsv1$featureID if ( x_show_labels == "ALL" ) { n_labels <- length(loadp) } else { n_labels <- as.numeric(x_show_labels) } n_labels <- min( n_labels, (1 + length(loadp)) / 2 ) labels_to_show <- c( names(head(sort(my_loadp),n = n_labels)) , names(head(sort(my_loado),n = n_labels)) , names(tail(sort(my_loadp),n = n_labels)) , names(tail(sort(my_loado),n = n_labels)) ) labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" )) text( y = plus_cor - 0.013 , x = plus_cov + 0.020 , cex = 0.3 , labels = labels , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha) , srt = -30 # slant 30 degrees downward , adj = 0 # left-justified ) } } ) typeVc <- c("correlation", # 1 "outlier", # 2 "overview", # 3 "permutation", # 4 "predict-train", # 5 "predict-test", # 6 "summary", # 7 = c(2,3,4,9) "x-loading", # 8 "x-score", # 9 "x-variance", # 10 "xy-score", # 11 "xy-weight" # 12 ) # [c(3,8,9)] # [c(4,3,8,9)] if ( length(my_oplsda@orthoVipVn) > 0 ) { my_typevc <- typeVc[c(9,3,8)] } else { my_typevc <- c("(dummy)","overview","(dummy)") } for (my_type in my_typevc) { if (my_type %in% typeVc) { # print(sprintf("plotting type %s", my_type)) plot( x = my_oplsda , typeVc = my_type , parCexN = 0.4 , parDevNewL = FALSE , parLayL = TRUE , parEllipsesL = TRUE ) } else { # print("plotting dummy graph") plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") text(x=1, y=1, labels="no orthogonal projection is possible") } } return (my_cor_vs_cov) } else { # x_progress(sprintf("x_is_match = %s, ncol(x_dataMatrix) = %d, length(unique(x_predictor)) = %d",x_is_match, ncol(x_dataMatrix), length(unique(x_predictor)))) return (NULL) } } # S-PLOT and OPLS reference: Wiklund_2008 doi:10.1021/ac0713510 corcov_calc <- function(calc_env, failure_action = stop, progress_action = function(x){}, corcov_tsv_action = function(t){}, salience_tsv_action = function(t){}) { if ( ! is.environment(calc_env) ) { failure_action("corcov_calc: fatal error - 'calc_env' is not an environment") return ( FALSE ) } if ( ! is.function(corcov_tsv_action) ) { failure_action("corcov_calc: fatal error - 'corcov_tsv_action' is not a function") return ( FALSE ) } if ( ! is.function(salience_tsv_action) ) { failure_action("salience_calc: fatal error - 'salience_tsv_action' is not a function") return ( FALSE ) } # extract parameters from the environment vrbl_metadata <- calc_env$vrbl_metadata vrbl_metadata_names <- vrbl_metadata[,1] smpl_metadata <- calc_env$smpl_metadata data_matrix <- calc_env$data_matrix pairSigFeatOnly <- calc_env$pairSigFeatOnly facC <- calc_env$facC tesC <- calc_env$tesC # extract the levels from the environment originalLevCSV <- levCSV <- calc_env$levCSV # matchingC is one of { "none", "wildcard", "regex" } matchingC <- calc_env$matchingC labelFeatures <- calc_env$labelFeatures # arg/env checking if (!(facC %in% names(smpl_metadata))) { failure_action(sprintf("bad parameter! Factor name '%s' not found in sampleMetadata", facC)) return ( FALSE ) } mz <- vrbl_metadata$mz names(mz) <- vrbl_metadata$variableMetadata mz_lookup <- function(feature) unname(mz[feature]) rt <- vrbl_metadata$rt names(rt) <- vrbl_metadata$variableMetadata rt_lookup <- function(feature) unname(rt[feature]) # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) salience_df <- calc_env$salience_df <- w4msalience( data_matrix = data_matrix , sample_class = smpl_metadata[,facC] , failure_action = failure_action ) salience_tsv_action({ my_df <- data.frame( featureID = salience_df$feature , salientLevel = salience_df$max_level , salientRCV = salience_df$salient_rcv , salience = salience_df$salience , mz = mz_lookup(salience_df$feature) , rt = rt_lookup(salience_df$feature) ) my_df[order(-my_df$salience),] }) # transform wildcards to regexen if (matchingC == "wildcard") { # strsplit(x = "hello,wild,world", split = ",") levCSV <- gsub("[.]", "[.]", levCSV) levCSV <- strsplit(x = levCSV, split = ",") levCSV <- sapply(levCSV, utils::glob2rx, trim.tail = FALSE) levCSV <- paste(levCSV, collapse = ",") } # function to determine whether level is a member of levCSV isLevelSelected <- function(lvl) { matchFun <- if (matchingC != "none") grepl else `==` return( Reduce( f = "||" , x = sapply( X = strsplit( x = levCSV , split = "," , fixed = TRUE )[[1]] , FUN = matchFun , lvl ) ) ) } # transpose matrix because ropls matrix is the transpose of XCMS matrix # Wiklund_2008 centers and pareto-scales data before OPLS-DA S-plot # center cdm <- center_colmeans(t(data_matrix)) # pareto-scale my_scale <- sqrt(apply(cdm, 2, sd, na.rm=TRUE)) scdm <- sweep(cdm, 2, my_scale, "/") # pattern to match column names like k10_kruskal_k4.k3_sig col_pattern <- sprintf('^%s_%s_(.*)[.](.*)_sig$', facC, tesC) # column name like k10_kruskal_sig intersample_sig_col <- sprintf('%s_%s_sig', facC, tesC) # get the facC column from sampleMetadata, dropping to one dimension smpl_metadata_facC <- smpl_metadata[,facC] # allocate a slot in the environment for the contrast_list, each element of which will be a data.frame with this structure: # - feature ID # - value1 # - value2 # - Wiklund_2008 correlation # - Wiklund_2008 covariance # - Wiklund_2008 VIP calc_env$contrast_list <- list() did_plot <- FALSE if (tesC != "none") { # for each column name, extract the parts of the name matched by 'col_pattern', if any the_colnames <- colnames(vrbl_metadata) if ( ! Reduce( f = "||", x = grepl(tesC, the_colnames) ) ) { failure_action(sprintf("bad parameter! variableMetadata must contain results of W4M Univariate test '%s'.", tesC)) return ( FALSE ) } col_matches <- lapply( X = the_colnames, FUN = function(x) { regmatches( x, regexec(col_pattern, x) )[[1]] } ) ## first contrast each selected level with all other levels combined into one "super-level" ## # process columns matching the pattern level_union <- c() for (i in 1:length(col_matches)) { col_match <- col_matches[[i]] if (length(col_match) > 0) { # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 # only process this column if both factors are members of lvlCSV is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) if (is_match) { level_union <- c(level_union, col_match[2], col_match[3]) } } } level_union <- unique(sort(level_union)) overall_significant <- 1 == ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE ) if ( length(level_union) > 2 ) { chosen_samples <- smpl_metadata_facC %in% level_union chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) col_selector <- vrbl_metadata_names[ overall_significant ] my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] plot_action <- function(fctr_lvl_1, fctr_lvl_2) { progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match , x_algorithm = algoC , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" , x_show_labels = labelFeatures , x_progress = progress_action , x_env = calc_env ) if ( is.null(my_cor_cov) ) { progress_action("NOTHING TO PLOT.") } else { my_tsv <- my_cor_cov$tsv1 my_tsv$mz <- mz_lookup(my_tsv$featureID) my_tsv$rt <- rt_lookup(my_tsv$featureID) my_tsv["level1Level2Sig"] <- vrbl_metadata[ match(my_tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] tsv <<- my_tsv corcov_tsv_action(tsv) did_plot <<- TRUE } } if ( length(level_union) != 2 ) { fctr_lvl_2 <- "other" for ( fctr_lvl_1 in level_union[1:length(level_union)] ) { plot_action(fctr_lvl_1, fctr_lvl_2) } } else { plot_action(fctr_lvl_1 = level_union[1], fctr_lvl_2 = level_union[2]) } } if ( length(level_union) > 1 ) { ## next, contrast each selected level with each of the other levels individually ## # process columns matching the pattern for (i in 1:length(col_matches)) { # for each potential match of the pattern col_match <- col_matches[[i]] if (length(col_match) > 0) { # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 # only process this column if both factors are members of lvlCSV is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) # TODO delete next line displaying currently-processed column # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match)) # choose only samples with one of the two factors for this column chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) predictor <- smpl_metadata_facC[chosen_samples] # extract only the significantly-varying features and the chosen samples fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE ) col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ] my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match , x_algorithm = algoC , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" , x_show_labels = labelFeatures , x_progress = progress_action , x_env = calc_env ) if ( is.null(my_cor_cov) ) { progress_action("NOTHING TO PLOT.") } else { tsv <- my_cor_cov$tsv1 tsv$mz <- mz_lookup(tsv$featureID) tsv$rt <- rt_lookup(tsv$featureID) tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] corcov_tsv_action(tsv) did_plot <- TRUE } } } } } else { # tesC == "none" level_union <- unique(sort(smpl_metadata_facC)) if ( length(level_union) > 1 ) { if ( length(level_union) > 2 ) { ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ## completed <- c() lapply( X = level_union , FUN = function(x) { fctr_lvl_1 <- x[1] fctr_lvl_2 <- { if ( fctr_lvl_1 %in% completed ) return("DUMMY") # strF(completed) completed <<- c(completed, fctr_lvl_1) setdiff(level_union, fctr_lvl_1) } chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) fctr_lvl_2 <- "other" # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) if (length(unique(chosen_samples)) < 1) { progress_action("NOTHING TO PLOT...") } else { chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) my_matrix <- scdm[ chosen_samples, , drop = FALSE ] # only process this column if both factors are members of lvlCSV is_match <- isLevelSelected(fctr_lvl_1) my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match , x_algorithm = algoC , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action , x_env = calc_env ) if ( is.null(my_cor_cov) ) { progress_action("NOTHING TO PLOT") } else { tsv <- my_cor_cov$tsv1 tsv$mz <- mz_lookup(tsv$featureID) tsv$rt <- rt_lookup(tsv$featureID) corcov_tsv_action(tsv) did_plot <<- TRUE } } #print("baz") "dummy" # need to return a value; otherwise combn fails with an error } ) } ## pass 2 - contrast each selected level with each of the other levels individually ## completed <- c() utils::combn( x = level_union , m = 2 , FUN = function(x) { fctr_lvl_1 <- x[1] fctr_lvl_2 <- x[2] chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) if (length(unique(chosen_samples)) < 1) { progress_action("NOTHING TO PLOT...") } else { chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) predictor <- chosen_facC my_matrix <- scdm[ chosen_samples, , drop = FALSE ] # only process this column if both factors are members of lvlCSV is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match , x_algorithm = algoC , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action , x_env = calc_env ) if ( is.null(my_cor_cov) ) { progress_action("NOTHING TO PLOT") } else { tsv <- my_cor_cov$tsv1 tsv$mz <- mz_lookup(tsv$featureID) tsv$rt <- rt_lookup(tsv$featureID) corcov_tsv_action(tsv) did_plot <<- TRUE } } #print("baz") "dummy" # need to return a value; otherwise combn fails with an error } ) } else { progress_action("NOTHING TO PLOT....") } } if (!did_plot) { failure_action(sprintf("bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'", facC, originalLevCSV)) return ( FALSE ) } return ( TRUE ) } # Calculate data for correlation-versus-covariance plot # Adapted from: # Wiklund_2008 doi:10.1021/ac0713510 # Galindo_Prieto_2014 doi:10.1002/cem.2627 # https://github.com/HegemanLab/extra_tools/blob/master/generic_PCA.R cor_vs_cov <- function(matrix_x, ropls_x) { x_class <- class(ropls_x) if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) ) stop( "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class ", as.character(x_class) ) } result <- list() # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS if ( ropls_x@suppLs$algoC == "nipals") { # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510 mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x])) score_matrix <- ropls_x@scoreMN score_matrix_transposed <- t(score_matrix) score_matrix_magnitude <- mag(score_matrix) result$covariance <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) result$correlation <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi ) } else { # WARNING - untested code - I don't have test data to exercise this branch # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F score_matrix <- ropls_x@scoreMN score_matrix_transposed <- t(score_matrix) cov_divisor <- nrow(matrix_x) - 1 result$covariance <- sapply( X = 1:ncol(matrix_x) , FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor ) score_sd <- sapply( X = 1:ncol(score_matrix) , FUN = function(x) sd(score_matrix[,x]) ) # xSdVn - Numerical vector: variable standard deviations of the 'x' matrix xSdVn <- ropls_x@xSdVn result$correlation <- sapply( X = 1:ncol(matrix_x) , FUN = function(x) { ( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) ) } ) } result$correlation <- result$correlation[1,,drop = TRUE] result$covariance <- result$covariance[1,,drop = TRUE] # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014 # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) result$vip4p <- as.numeric(ropls_x@vipVn) result$vip4o <- as.numeric(ropls_x@orthoVipVn) result$loadp <- as.numeric(ropls_x@loadingMN) result$loado <- as.numeric(ropls_x@orthoLoadingMN) # get the level names level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) fctr_lvl_1 <- level_names[1] fctr_lvl_2 <- level_names[2] feature_count <- length(ropls_x@vipVn) result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count) result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count) # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation))) superresult <- list() if (length(result$vip4o) == 0) result$vip4o <- NA greaterLevel <- sapply( X = result$correlation, FUN = function(my_corr) if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 ) superresult$tsv1 <- data.frame( featureID = names(ropls_x@vipVn) , factorLevel1 = result$level1 , factorLevel2 = result$level2 , greaterLevel = greaterLevel , correlation = result$correlation , covariance = result$covariance , vip4p = result$vip4p , vip4o = result$vip4o , loadp = result$loadp , loado = result$loado , row.names = NULL ) rownames(superresult$tsv1) <- superresult$tsv1$featureID superresult$covariance <- result$covariance superresult$correlation <- result$correlation superresult$vip4p <- result$vip4p superresult$vip4o <- result$vip4o superresult$loadp <- result$loadp superresult$loado <- result$loado superresult$details <- result # #print(superresult$tsv1) result$superresult <- superresult # Include thise in case future consumers of this routine want to use it in currently unanticipated ways result$oplsda <- ropls_x result$predictor <- ropls_x@suppLs$y # in case future consumers of this routine want to use it in currently unanticipated ways return (superresult) }