Mercurial > repos > eschen42 > w4mcorcov
diff w4mcorcov_calc.R @ 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author | eschen42 |
---|---|
date | Wed, 12 Dec 2018 09:20:02 -0500 |
parents | ddaf84e15d06 |
children | 90708fdbc22d |
line wrap: on
line diff
--- a/w4mcorcov_calc.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_calc.R Wed Dec 12 09:20:02 2018 -0500 @@ -1,12 +1,4 @@ -# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/ -center_colmeans <- function(x) { - xcenter = colMeans(x) - x - rep(xcenter, rep.int(nrow(x), ncol(x))) -} - -#### OPLS-DA -algoC <- "nipals" - +# compute and output detail plots do_detail_plot <- function( x_dataMatrix , x_predictor @@ -21,7 +13,7 @@ off <- function(x) if (x_show_labels == "0") 0 else x if ( x_is_match && ncol(x_dataMatrix) > 0 - && length(unique(x_predictor))> 1 + && length(unique(x_predictor)) > 1 && x_crossval_i < nrow(x_dataMatrix) ) { my_oplsda <- opls( @@ -36,20 +28,22 @@ , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2. ) # strip out variables having negligible variance - x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE] + x_dataMatrix <- x_dataMatrix[ , names(my_oplsda@vipVn), drop = FALSE] my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] do_s_plot <- function( - x_env - , predictor_projection_x = TRUE - , cplot_x = FALSE - , cor_vs_cov_x = NULL - ) - { + x_env + , predictor_projection_x = TRUE + , cplot_x = FALSE + , cor_vs_cov_x = NULL + ) { if (cplot_x) { cplot_y_correlation <- (x_env$cplot_y == "correlation") + default_lim_x <- 10 + } else { + default_lim_x <- 1.2 } if (is.null(cor_vs_cov_x)) { my_cor_vs_cov <- cor_vs_cov( @@ -57,14 +51,17 @@ , ropls_x = my_oplsda , predictor_projection_x = predictor_projection_x , x_progress + , x_env ) } else { my_cor_vs_cov <- cor_vs_cov_x } - # str(my_cor_vs_cov) + if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) { if (is.null(cor_vs_cov_x)) { - x_progress("No cor_vs_cov data produced") + x_progress( + sprintf("No cor_vs_cov data produced for level %s versus %s", fctr_lvl_1, fctr_lvl_2) + ) } plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") text(x=1, y=1, labels="too few covariance data") @@ -76,216 +73,240 @@ min_x <- min(covariance, na.rm = TRUE) max_x <- max(covariance, na.rm = TRUE) lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs)) - covariance <- covariance / lim_x - lim_x <- 1.2 - # "It is generally accepted that a variable should be selected if vj>1, [27–29], + + # Regarding using VIP as a guide to selecting a biomarker: + # "It is generally accepted that a variable should be selected if vj>1, [27–29], # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." # (Mehmood 2012 doi:10.1186/1748-7188-6-27) plus_cor <- correlation plus_cov <- covariance - cex <- 0.65 - which_projection <- if (projection == 1) "t1" else "o1" - which_loading <- if (projection == 1) "parallel" else "orthogonal" - if (projection == 1) { # predictor-projection - vipcp <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) - if (!cplot_x) { # S-plot predictor-projection - my_xlab <- "relative covariance(feature,t1)" - my_x <- plus_cov - my_ylab <- "correlation(feature,t1)" - my_y <- plus_cor - } else { # C-plot predictor-projection - my_xlab <- "variable importance in predictor-projection" - my_x <- vip4p - if (cplot_y_correlation) { + if (length(plus_cor) != 0 && length(plus_cor) != 0) { + cex <- 0.65 + if (projection == 1) { + # predictor-projection + vipcp <- pmax(0, pmin(1, (vip4p - 0.83) / (1.21 - 0.83))) + if (!cplot_x) { + # S-plot predictor-projection + my_xlab <- "covariance(feature,t1)" + my_x <- plus_cov my_ylab <- "correlation(feature,t1)" my_y <- plus_cor + # X,Y limits for S-PLOT + my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3)) + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) } else { - my_ylab <- "relative covariance(feature,t1)" - my_y <- plus_cov + # C-plot predictor-projection + my_xlab <- "variable importance in predictor-projection" + my_x <- vip4p + if (cplot_y_correlation) { + my_ylab <- "correlation(feature,t1)" + my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) + } else { + my_ylab <- "covariance(feature,t1)" + my_y <- plus_cov + my_ylim <- max(abs(plus_cov)) + my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) ) + } + # X,Y limits for C-plot + lim_x <- max(my_x, na.rm = TRUE) * 1.1 + lim_x <- min(lim_x, default_lim_x) + my_xlim <- c( 0, lim_x ) # + off(0.2) ) } - } - if (cplot_x) { - lim_x <- max(my_x, na.rm = TRUE) * 1.1 - my_xlim <- c( 0, lim_x + off(0.2) ) + my_load_distal <- loadp + my_load_proximal <- loado + red <- as.numeric(correlation > 0) * vipcp + blue <- as.numeric(correlation < 0) * vipcp + alpha <- 0.1 + 0.4 * vipcp + red[is.na(red)] <- 0 + blue[is.na(blue)] <- 0 + alpha[is.na(alpha)] <- 0 + my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha) + main_label <- sprintf("%s for level %s versus %s" + , x_prefix, fctr_lvl_1, fctr_lvl_2) } else { - my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) ) - } - my_ylim <- c( -1.0 - off(0.2), 1.0 + off(0.2) ) - my_load_distal <- loadp - my_load_proximal <- loado - red <- as.numeric(correlation > 0) * vipcp - blue <- as.numeric(correlation < 0) * vipcp - alpha <- 0.1 + 0.4 * vipcp - red[is.na(red)] <- 0 - blue[is.na(blue)] <- 0 - alpha[is.na(alpha)] <- 0 - my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha) - main_label <- sprintf("%s for level %s versus %s" - , x_prefix, fctr_lvl_1, fctr_lvl_2) - } else { # orthogonal projection - vipco <- pmax(0, pmin(1,(vip4o-0.83)/(1.21-0.83))) - if (!cplot_x) { - my_xlab <- "relative covariance(feature,to1)" - my_x <- -plus_cov - } else { - my_xlab <- "variable importance in orthogonal projection" - my_x <- vip4o - } - if (!cplot_x) { # S-plot orthogonal projection - my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) ) - my_ylab <- "correlation(feature,to1)" - my_y <- plus_cor - } else { # C-plot orthogonal projection - lim_x <- max(my_x, na.rm = TRUE) * 1.1 - my_xlim <- c( 0, lim_x + off(0.2) ) - if (cplot_y_correlation) { + # orthogonal projection + vipco <- pmax(0, pmin(1, (vip4o - 0.83) / (1.21 - 0.83))) + if (!cplot_x) { + # S-PLOT orthogonal-projection + my_xlab <- "covariance(feature,to1)" + my_x <- -plus_cov + # X,Y limits for S-PLOT + my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3)) my_ylab <- "correlation(feature,to1)" my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) } else { - my_ylab <- "relative covariance(feature,to1)" - my_y <- plus_cov + # C-plot orthogonal-projection + my_xlab <- "variable importance in orthogonal projection" + my_x <- vip4o + # C-plot orthogonal projection + # X,Y limits for C-plot + lim_x <- max(my_x, na.rm = TRUE) * 1.1 + my_xlim <- c( 0, lim_x ) # + off(0.2) ) + if (cplot_y_correlation) { + my_ylab <- "correlation(feature,to1)" + my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) + } else { + my_ylab <- "covariance(feature,to1)" + my_y <- plus_cov + my_ylim <- max(abs(plus_cov)) + my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) ) + } } - } - my_ylim <- c( -1.0 - off(0.2), 1.0 + off(0.2) ) - my_load_distal <- loado - my_load_proximal <- loadp - alpha <- 0.1 + 0.4 * vipco - alpha[is.na(alpha)] <- 0 - my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha) - main_label <- sprintf( - "Features influencing orthogonal projection for %s versus %s" - , fctr_lvl_1, fctr_lvl_2) - } - main_cex <- min(1.0, 46.0/nchar(main_label)) - my_feature_label_slant <- -30 # slant feature labels 30 degrees downward - my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18) - plot( - y = my_y - , x = my_x - , type = "p" - , xlim = my_xlim - , ylim = my_ylim - , xlab = my_xlab - , ylab = my_ylab - , main = main_label - , cex.main = main_cex - , cex = cex - , pch = my_pch - , col = my_col - ) - low_x <- -0.7 * lim_x - high_x <- 0.7 * lim_x - if (projection == 1 && !cplot_x) { - text(x = low_x, y = -0.05, labels = fctr_lvl_1, col = "blue") - text(x = high_x, y = 0.05, labels = fctr_lvl_2, col = "red") - } - if ( x_show_labels != "0" ) { - names(my_load_distal) <- tsv1$featureID - names(my_load_proximal) <- tsv1$featureID - if ( x_show_labels == "ALL" ) { - n_labels <- length(my_load_distal) - } else { - n_labels <- as.numeric(x_show_labels) + my_load_distal <- loado + my_load_proximal <- loadp + alpha <- 0.1 + 0.4 * vipco + alpha[is.na(alpha)] <- 0 + my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha) + main_label <- sprintf( + "Features influencing orthogonal projection for %s versus %s" + , fctr_lvl_1, fctr_lvl_2) } - n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 ) - labels_to_show <- c( - names(head(sort(my_load_distal),n = n_labels)) - , names(tail(sort(my_load_distal),n = n_labels)) - ) - labels <- unname( - sapply( - X = tsv1$featureID - , FUN = function(x) if( x %in% labels_to_show ) x else "" + main_cex <- min(1.0, 46.0/nchar(main_label)) + my_feature_label_slant <- -30 # slant feature labels 30 degrees downward + my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18) + if ( sum(is.infinite(my_xlim)) == 0 ) { + plot( + y = my_y + , x = my_x + , type = "p" + , xlim = my_xlim + , ylim = my_ylim + , xlab = my_xlab + , ylab = my_ylab + , main = main_label + , cex.main = main_cex + , cex = cex + , pch = my_pch + , col = my_col ) - ) - x_text_offset <- 0.024 - y_text_off <- 0.017 - if (!cplot_x) { # S-plot - y_text_offset <- if (projection == 1) -y_text_off else y_text_off - } else { # C-plot - y_text_offset <- - sapply( - X = (my_y > 0) - , FUN = function(x) { if (x) y_text_off else -y_text_off } + low_x <- -0.7 * lim_x + high_x <- 0.7 * lim_x + if (projection == 1 && !cplot_x) { + text(x = low_x, y = -0.05, labels = fctr_lvl_1, col = "blue") + text(x = high_x, y = 0.05, labels = fctr_lvl_2, col = "red") + } + if ( x_show_labels != "0" ) { + names(my_load_distal) <- tsv1$featureID + names(my_load_proximal) <- tsv1$featureID + if ( x_show_labels == "ALL" ) { + n_labels <- length(my_load_distal) + } else { + n_labels <- as.numeric(x_show_labels) + } + n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 ) + labels_to_show <- c( + names(head(sort(my_load_distal), n = n_labels)) + , names(tail(sort(my_load_distal), n = n_labels)) + ) + labels <- unname( + sapply( + X = tsv1$featureID + , FUN = function(x) if ( x %in% labels_to_show ) x else "" + ) ) - } - label_features <- function(x_arg, y_arg, labels_arg, slant_arg) { - if (length(labels_arg) > 0) { - unique_slant <- unique(slant_arg) - if (length(unique_slant) == 1) { - text( - y = y_arg - , x = x_arg + x_text_offset - , cex = 0.4 - , labels = labels_arg - , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels - , srt = slant_arg - , adj = 0 # left-justified + x_text_offset <- 0.024 + y_text_off <- 0.017 + if (!cplot_x) { + # S-plot + y_text_offset <- if (projection == 1) -y_text_off else y_text_off + } else { + # C-plot + y_text_offset <- + sapply( + X = (my_y > 0) + , FUN = function(x) { + if (x) y_text_off else -y_text_off + } + ) + } + label_features <- function(x_arg, y_arg, labels_arg, slant_arg) { + if (length(labels_arg) > 0) { + unique_slant <- unique(slant_arg) + if (length(unique_slant) == 1) { + text( + y = y_arg + , x = x_arg + x_text_offset + , cex = 0.4 + , labels = labels_arg + , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels + , srt = slant_arg + , adj = 0 # left-justified + ) + } else { + for (slant in unique_slant) { + text( + y = y_arg[slant_arg == slant] + , x = x_arg[slant_arg == slant] + x_text_offset + , cex = 0.4 + , labels = labels_arg[slant_arg == slant] + , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels + , srt = slant + , adj = 0 # left-justified + ) + } + } + } + } + if (!cplot_x) { + my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant + } else { + my_slant <- sapply( + X = (my_y > 0) + , FUN = function(x) if (x) 2 else -2 + ) * my_feature_label_slant + } + if (length(my_x) > 1) { + label_features( + x_arg = my_x [my_x > 0] + , y_arg = my_y [my_x > 0] - y_text_offset + , labels_arg = labels[my_x > 0] + , slant_arg = (if (!cplot_x) -my_slant else (my_slant)) ) - } else { - for (slant in unique_slant) { - text( - y = y_arg[slant_arg == slant] - , x = x_arg[slant_arg == slant] + x_text_offset - , cex = 0.4 - , labels = labels_arg[slant_arg == slant] - , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels - , srt = slant - , adj = 0 # left-justified + if (!cplot_x) { + label_features( + x_arg = my_x [my_x < 0] + , y_arg = my_y [my_x < 0] + y_text_offset + , labels_arg = labels[my_x < 0] + , slant_arg = my_slant ) } - } - } - } - if (!cplot_x) { - my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant + } else { + if (!cplot_x) { + my_slant <- (if (my_x > 1) -1 else 1) * my_slant + my_y_arg <- my_y + (if (my_x > 1) -1 else 1) * y_text_offset + } else { + my_slant <- (if (my_y > 1) -1 else 1) * my_slant + my_y_arg <- my_y + (if (my_y > 1) -1 else 1) * y_text_offset + } + label_features( + x_arg = my_x + , y_arg = my_y_arg + , labels_arg = labels + , slant_arg = my_slant + ) + } # end if (length(my_x) > 1) + } # end if ( x_show_labels != "0" ) } else { - my_slant <- sapply( - X = (my_y > 0) - , FUN = function(x) if (x) 2 else -2 - ) * my_feature_label_slant - } - if (length(my_x) > 1) { - label_features( - x_arg = my_x [my_x > 0] - , y_arg = my_y [my_x > 0] - y_text_offset - , labels_arg = labels[my_x > 0] - , slant_arg = (if (!cplot_x) -my_slant else (my_slant)) - ) - if (!cplot_x) { - label_features( - x_arg = my_x [my_x < 0] - , y_arg = my_y [my_x < 0] + y_text_offset - , labels_arg = labels[my_x < 0] - , slant_arg = my_slant - ) - } - } else { - if (!cplot_x) { - my_slant <- (if (my_x > 1) -1 else 1) * my_slant - my_y_arg = my_y + (if (my_x > 1) -1 else 1) * y_text_offset - } else { - my_slant <- (if (my_y > 1) -1 else 1) * my_slant - my_y_arg = my_y + (if (my_y > 1) -1 else 1) * y_text_offset - } - label_features( - x_arg = my_x - , y_arg = my_y_arg - , labels_arg = labels - , slant_arg = my_slant - ) - } - } - } - ) + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no S-plot is possible") + } # end if (sum(is.infinte(my_xlim))==0) + } else { + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no S-plot is possible") + } # end if (length(plus_cor) != 0 && length(plus_cor) != 0) + } # end action + ) # end with( my_cor_vs_cov, action ) return (my_cor_vs_cov) - } + } # end function do_s_plot my_cor_vs_cov <- do_s_plot( x_env = x_env , predictor_projection_x = TRUE , cplot_x = FALSE ) - typeVc <- c("correlation", # 1 + typevc <- c("correlation", # 1 "outlier", # 2 "overview", # 3 "permutation", # 4 @@ -299,36 +320,37 @@ "xy-weight" # 12 ) # [c(3,8,9)] # [c(4,3,8,9)] if ( length(my_oplsda@orthoVipVn) > 0 ) { - my_typevc <- typeVc[c(9,3,8)] + my_typevc <- typevc[c(9,3,8)] } else { my_typevc <- c("(dummy)","overview","(dummy)") } my_ortho_cor_vs_cov_exists <- FALSE for (my_type in my_typevc) { - if (my_type %in% typeVc) { + if (my_type %in% typevc) { tryCatch({ - if ( my_type != "x-loading" ) { - plot( - x = my_oplsda - , typeVc = my_type - , parCexN = 0.4 - , parDevNewL = FALSE - , parLayL = TRUE - , parEllipsesL = TRUE - ) - if (my_type == "overview") { - sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2) - title(sub = sub_label) - } - } else { - my_ortho_cor_vs_cov <- do_s_plot( - x_env = x_env - , predictor_projection_x = FALSE - , cplot_x = FALSE - ) - my_ortho_cor_vs_cov_exists <- TRUE + if ( my_type != "x-loading" ) { + plot( + x = my_oplsda + , typeVc = my_type + , parCexN = 0.4 + , parDevNewL = FALSE + , parLayL = TRUE + , parEllipsesL = TRUE + ) + if (my_type == "overview") { + sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2) + title(sub = sub_label) + } + } else { + my_ortho_cor_vs_cov <- do_s_plot( + x_env = x_env + , predictor_projection_x = FALSE + , cplot_x = FALSE + ) + my_ortho_cor_vs_cov_exists <- TRUE + } } - }, error = function(e) { + , error = function(e) { x_progress( sprintf( "factor level %s or %s may have only one sample - %s" @@ -347,12 +369,17 @@ cplot_o <- x_env$cplot_o if (cplot_p || cplot_o) { if (cplot_p) { - do_s_plot( - x_env = x_env - , predictor_projection_x = TRUE - , cplot_x = TRUE - , cor_vs_cov_x = my_cor_vs_cov - ) + if (!is.null(my_cor_vs_cov)) { + do_s_plot( + x_env = x_env + , predictor_projection_x = TRUE + , cplot_x = TRUE + , cor_vs_cov_x = my_cor_vs_cov + ) + } else { + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no predictor projection is possible") + } did_plots <- 1 } else { did_plots <- 0 @@ -405,10 +432,10 @@ # extract parameters from the environment vrbl_metadata <- calc_env$vrbl_metadata - vrbl_metadata_names <- vrbl_metadata[,1] + vrbl_metadata_names <- vrbl_metadata[, 1] smpl_metadata <- calc_env$smpl_metadata data_matrix <- calc_env$data_matrix - pairSigFeatOnly <- calc_env$pairSigFeatOnly + pair_significant_features_only <- calc_env$pairSigFeatOnly facC <- calc_env$facC tesC <- calc_env$tesC # extract the levels from the environment @@ -433,22 +460,27 @@ names(rt) <- vrbl_metadata$variableMetadata rt_lookup <- function(feature) unname(rt[feature]) - # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) + # calculate salience_df as data.frame(feature, max_level, max_median, salience_rcv, mean_median, salience, salience_rcv) salience_df <- calc_env$salience_df <- w4msalience( data_matrix = data_matrix , sample_class = smpl_metadata[,facC] , failure_action = failure_action ) salience_tsv_action({ - my_df <- data.frame( - featureID = salience_df$feature - , salientLevel = salience_df$max_level - , salientRCV = salience_df$salient_rcv - , salience = salience_df$salience - , mz = mz_lookup(salience_df$feature) - , rt = rt_lookup(salience_df$feature) - ) - my_df[order(-my_df$salience),] + with ( + salience_df + , { + my_df <<- data.frame( + featureID = feature + , salientLevel = max_level + , salienceRCV = salience_rcv + , relativeSalientDistance = relative_salient_distance + , salience = salience + , mz = mz_lookup(feature) + , rt = rt_lookup(feature) + ) + }) + my_df[order(-my_df$relativeSalientDistance),] }) # transform wildcards to regexen @@ -560,8 +592,8 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = TRUE - , x_algorithm = algoC - , x_prefix = if (pairSigFeatOnly) { + , x_algorithm = "nipals" + , x_prefix = if (pair_significant_features_only) { "Significantly contrasting features" } else { "Significant features" @@ -627,15 +659,15 @@ } ) col_selector <- vrbl_metadata_names[ - if ( pairSigFeatOnly ) fully_significant else overall_significant + if (pair_significant_features_only) fully_significant else overall_significant ] my_matrix <- tdm[ chosen_samples, col_selector, drop = FALSE ] my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC - , x_prefix = if (pairSigFeatOnly) { + , x_algorithm = "nipals" + , x_prefix = if (pair_significant_features_only) { "Significantly contrasting features" } else { "Significant features" @@ -668,7 +700,8 @@ } } } - } else { # tesC == "none" + } else { + # tesC == "none" # find all the levels for factor facC in sampleMetadata level_union <- unique(sort(smpl_metadata_facC)) # identify the selected levels for factor facC from sampleMetadata @@ -686,7 +719,6 @@ fctr_lvl_2 <- { if ( fctr_lvl_1 %in% completed ) return("DUMMY") - # strF(completed) completed <<- c(completed, fctr_lvl_1) setdiff(level_union, fctr_lvl_1) } @@ -717,7 +749,7 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC + , x_algorithm = "nipals" , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action @@ -770,7 +802,7 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC + , x_algorithm = "nipals" , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action @@ -804,7 +836,7 @@ if (!did_plot) { failure_action( sprintf( - "bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'" + "Did not plot. Does sampleMetadata have at least two levels of factor '%s' matching '%s' ?" , facC, originalLevCSV)) return ( FALSE ) } @@ -821,12 +853,14 @@ , ropls_x , predictor_projection_x = TRUE , x_progress = print +, x_env ) { tryCatch({ - return( - cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress) - ) - }, error = function(e) { + return( + cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress, x_env) + ) + } + , error = function(e) { x_progress( sprintf( "cor_vs_cov fatal error - %s" @@ -842,7 +876,12 @@ , ropls_x # an instance of ropls::opls , predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection , x_progress = print # function to produce progress and error messages +, x_env ) { + my_matrix_x <- matrix_x + my_matrix_x[my_matrix_x==0] <- NA + fdr_features <- x_env$fdr_features + x_class <- class(ropls_x) if ( !( as.character(x_class) == "opls" ) ) { stop( @@ -866,12 +905,12 @@ # I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 # (and not from the supplement despite the statement that, for the NIPALS algorithm, - # the equations from the supplement should be used) because of the definition of the + # the equations from the supplement should be used) because of the definition of the # Pearson/Galton coefficient of correlation is defined as # $$ # \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y} # $$ - # as described (among other places) on Wikipedia at + # as described (among other places) on Wikipedia at # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population # The equations in the supplement said to use, for the predictive component t1, # \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))} @@ -879,112 +918,116 @@ # perhaps my data are not centered exactly the same way that theirs were. # The correlations calculated here are in agreement with those calculated with the code from # page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf - # I did transform covariance to "relative covariance" (relative to the maximum value) - # to keep the figures consistent with one another. + - # count the features (one column for each sample) - Nfeatures <- ncol(matrix_x) - # count the samples (one row for each sample) - Nobservations <- nrow(matrix_x) - # a one-dimensional magnitude function (i.e., take the vector norm) - vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) - # calculate the standard deviation for each feature - sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x])) + # count the features/variables (one column for each sample) + # count the features/variables (one column for each sample) + n_features <- ncol(my_matrix_x) + all_n_features <- x_env$fdr_features + if (length(grep("^[0-9][0-9]*$", all_n_features)) > 0) { + all_n_features <- as.integer(all_n_features) + } else { + all_n_features <- n_features + } + fdr_n_features <- max(n_features, all_n_features) + # print("n_features") + # print(n_features) + + # count the samples/observations (one row for each sample) + n_observations <- nrow(my_matrix_x) + # choose whether to plot the predictive score vector or orthogonal score vector if (predictor_projection_x) - score_matrix <- ropls_x@scoreMN + score_vector <- ropls_x@scoreMN else - score_matrix <- ropls_x@orthoScoreMN - # transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation - score_matrix_transposed <- t(score_matrix) - # compute the norm of the vector (i.e., the magnitude) - score_matrix_magnitude <- vector_norm(score_matrix) - # compute the standard deviation of the vector - score_matrix_sd <- sd(score_matrix) - # compute the relative covariance of each feature with the score vector + score_vector <- ropls_x@orthoScoreMN + + # compute the covariance of each feature with the score vector result$covariance <- - score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) + sapply( + X = 1:n_features + , FUN = function(i) { + cov(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs") + } + ) + # access covariance by feature name + names(result$covariance) <- colnames(my_matrix_x) + # compute the correlation of each feature with the score vector result$correlation <- - score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) ) - - # convert covariance and correlation from one-dimensional matrices to arrays of values, - # which are accessed by feature name below - p1 <- result$covariance <- result$covariance [ 1, , drop = TRUE ] - # x_progress("strF(p1)") - # x_progress(strF(p1)) + sapply( + X = 1:n_features + , FUN = function(i) { + cor(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs") + } + ) + # access correlation by feature name + names(result$correlation) <- colnames(my_matrix_x) - pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ] - # x_progress("pearson strF(pcorr1)") - # x_progress(strF(pcorr1)) - # x_progress(typeof(pcorr1)) - # x_progress(str(pcorr1)) - - # # this is how to use Spearman correlation instead of pearson - # result$spearcor <- sapply( - # X = 1:Nfeatures - # , FUN = function(i) { - # stats::cor( - # x = as.vector(score_matrix) - # , y = as.vector(matrix_x[,i]) - # # , method = "spearman" - # , method = "pearson" - # ) - # } - # ) - # names(result$spearcor) <- names(p1) - # pcorr1 <- result$spearcor - # x_progress("spearman strF(pcorr1)") - # x_progress(strF(pcorr1)) - # x_progress(typeof(pcorr1)) - # x_progress(str(pcorr1)) - # pcorr1 <- result$correlation <- result$spearcor + # eliminate NAs in either correlation or covariance + nas <- is.na(result$correlation) | is.na(result$covariance) + featureID <- names(ropls_x@vipVn) + featureID <- featureID[!nas] + result$correlation <- result$correlation[!nas] + result$covariance <- result$covariance[!nas] + n_features <- length(featureID) - # correl.ci(r, n, a = 0.05, rho = 0) correl_pci <- lapply( - X = 1:Nfeatures - , FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations) + X = 1:n_features + , FUN = function(i) { + correl.ci( + r = result$correlation[i] + , n_obs = n_observations + , n_vars = fdr_n_features + ) + } ) result$p_value_raw <- sapply( - X = 1:Nfeatures + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$p.value.raw + ) + result$p_value_raw[is.na(result$p_value_raw)] <- 1.0 + result$p_value <- sapply( + X = 1:n_features , FUN = function(i) correl_pci[[i]]$p.value ) - result$p_value_raw[is.na(result$p_value_raw)] <- 0.0 + result$p_value[is.na(result$p_value)] <- 1.0 result$ci_lower <- sapply( - X = 1:Nfeatures - , FUN = function(i) correl_pci[[i]]$CI['lower'] + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$CI["lower"] ) result$ci_upper <- sapply( - X = 1:Nfeatures - , FUN = function(i) correl_pci[[i]]$CI['upper'] + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$CI["upper"] ) # extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627) # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) - result$vip4p <- as.numeric(ropls_x@vipVn) - result$vip4o <- as.numeric(ropls_x@orthoVipVn) + result$vip4p <- as.numeric(ropls_x@vipVn)[!nas] + result$vip4o <- as.numeric(ropls_x@orthoVipVn)[!nas] if (length(result$vip4o) == 0) result$vip4o <- NA # extract the loadings - result$loadp <- as.numeric(ropls_x@loadingMN) - result$loado <- as.numeric(ropls_x@orthoLoadingMN) + result$loadp <- as.numeric(ropls_x@loadingMN)[!nas] + result$loado <- as.numeric(ropls_x@orthoLoadingMN)[!nas] # get the level names level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) fctr_lvl_1 <- level_names[1] fctr_lvl_2 <- level_names[2] - feature_count <- length(ropls_x@vipVn) - result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count) - result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count) + result$level1 <- rep.int(x = fctr_lvl_1, times = n_features) + result$level2 <- rep.int(x = fctr_lvl_2, times = n_features) greaterLevel <- sapply( X = result$correlation - , FUN = function(my_corr) + , FUN = function(my_corr) { tryCatch({ - if ( is.nan( my_corr ) ) { - NA + if ( is.na(my_corr) || ! is.numeric( my_corr ) ) { + NA } else { if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 } - }, error = function(e) { + } + , error = function(e) { + print(my_corr) x_progress( sprintf( "cor_vs_cov -> sapply: error - substituting NA - %s" @@ -992,16 +1035,11 @@ ) ) NA - }) + } + ) + } ) - # begin fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1 - featureID <- names(ropls_x@vipVn) - greaterLevel <- greaterLevel[featureID] - result$correlation <- result$correlation[featureID] - result$covariance <- result$covariance[featureID] - # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1 - # build a data frame to hold the content for the tab-separated values file tsv1 <- data.frame( featureID = featureID @@ -1016,8 +1054,8 @@ , loadp = result$loadp , loado = result$loado , cor_p_val_raw = result$p_value_raw - , cor_p_value = p.adjust(p = result$p_value_raw, method = "BY") - , cor_ci_lower = result$ci_lower + , cor_p_value = result$p_value + , cor_ci_lower = result$ci_lower , cor_ci_upper = result$ci_upper ) rownames(tsv1) <- tsv1$featureID @@ -1039,7 +1077,7 @@ tsv1 <- tsv1[!is.na(tsv1$covariance),] superresult$tsv1 <- tsv1 - # # I did not include these but left them commentd out in case future + # # I did not include these but left them commentd out in case future # # consumers of this routine want to use it in currently unanticipated ways # result$superresult <- superresult # result$oplsda <- ropls_x @@ -1059,22 +1097,28 @@ # which follows # https://en.wikipedia.org/wiki/Fisher_transformation#Definition -correl.ci <- function(r, n, a = 0.05, rho = 0) { - ## r is the calculated correlation coefficient for n pairs +correl.ci <- function(r, n_obs, n_vars, a = 0.05, rho = 0) { + ## r is the calculated correlation coefficient for n_obs pairs of observations of one variable ## a is the significance level ## rho is the hypothesised correlation zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho zh1 <- atanh(r) # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1 - se <- (1 - r^2)/sqrt(n - 3) ## standard error for Fisher's z-transformation of Ho + se <- (1 - r^2)/sqrt(n_obs - 3) ## standard error for Fisher's z-transformation of Ho test <- (zh1 - zh0)/se ### test statistic pvalue <- 2*(1 - pnorm(abs(test))) ## p-value - zL <- zh1 - qnorm(1 - a/2)*se - zH <- zh1 + qnorm(1 - a/2)*se - fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit - fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit - CI <- c(fishL, fishH) - names(CI) <- c('lower', 'upper') - list(correlation = r, p.value = pvalue, CI = CI) + pvalue.adj <- p.adjust(p = pvalue, method = "BY", n = n_vars) + z_L <- zh1 - qnorm(1 - a/2)*se + z_h <- zh1 + qnorm(1 - a/2)*se + fish_l <- tanh(z_L) # (exp(2*z_l)-1)/(exp(2*z_l)+1), i.e., lower confidence limit + fish_h <- tanh(z_h) # (exp(2*z_h)-1)/(exp(2*z_h)+1), i.e., upper confidence limit + ci <- c(fish_l, fish_h) + names(ci) <- c("lower", "upper") + list( + correlation = r + , p.value.raw = pvalue + , p.value = pvalue.adj + , CI = ci + ) } -# vim: sw=2 ts=2 et : +# vim: sw=2 ts=2 et ai :