w4mcorcov: w4mcorcov_calc.R comparison

comparison w4mcorcov_calc.R @ 13:2ae2d26e3270 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4

author	eschen42
date	Wed, 12 Dec 2018 09:20:02 -0500
parents	ddaf84e15d06
children	90708fdbc22d

comparison

equal deleted inserted replaced

-:ddaf84e15d06
+:2ae2d26e3270
-# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/
+# compute and output detail plots
-center_colmeans <- function(x) {
-xcenter = colMeans(x)
-x - rep(xcenter, rep.int(nrow(x), ncol(x)))
-}
-#### OPLS-DA
-algoC <- "nipals"
 do_detail_plot <- function(
 x_dataMatrix
 , x_predictor
 , x_is_match
 , x_algorithm
 , x_crossval_i
 ) {
 off <- function(x) if (x_show_labels == "0") 0 else x
 if ( x_is_match
 && ncol(x_dataMatrix) > 0
-&& length(unique(x_predictor))> 1
+&& length(unique(x_predictor)) > 1
 && x_crossval_i < nrow(x_dataMatrix)
 ) {
 my_oplsda <- opls(
 x      = x_dataMatrix
 , y      = x_predictor
 , plotL  = FALSE
 , crossvalI = x_crossval_i
 , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2.
 )
 # strip out variables having negligible variance
-x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE]
+x_dataMatrix <- x_dataMatrix[ , names(my_oplsda@vipVn), drop = FALSE]
 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
 do_s_plot <- function(
 x_env
 , predictor_projection_x   = TRUE
 , cplot_x      = FALSE
 , cor_vs_cov_x = NULL
-)
+) {
-{
 if (cplot_x) {
 cplot_y_correlation <- (x_env$cplot_y == "correlation")
+default_lim_x <- 10
+} else {
+default_lim_x <- 1.2
 }
 if (is.null(cor_vs_cov_x)) {
 my_cor_vs_cov <- cor_vs_cov(
 matrix_x   = x_dataMatrix
 , ropls_x    = my_oplsda
 , predictor_projection_x = predictor_projection_x
 , x_progress
+, x_env
 )
 } else {
 my_cor_vs_cov <- cor_vs_cov_x
 }
-# str(my_cor_vs_cov)
 if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) {
 if (is.null(cor_vs_cov_x)) {
-x_progress("No cor_vs_cov data produced")
+x_progress(
+sprintf("No cor_vs_cov data produced for level %s versus %s", fctr_lvl_1, fctr_lvl_2)
+)
 }
 plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
 text(x=1, y=1, labels="too few covariance data")
 return(my_cor_vs_cov)
 }
 my_cor_vs_cov
 , {
 min_x <- min(covariance, na.rm = TRUE)
 max_x <- max(covariance, na.rm = TRUE)
 lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
-covariance <- covariance / lim_x
-lim_x <- 1.2
+# Regarding using VIP as a guide to selecting a biomarker:
-# "It is generally accepted that a variable should be selected if vj>1, [27–29],
+#   "It is generally accepted that a variable should be selected if vj>1, [27–29],
 #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
 #   (Mehmood 2012 doi:10.1186/1748-7188-6-27)
 plus_cor <- correlation
 plus_cov <- covariance
-cex <- 0.65
+if (length(plus_cor) != 0 && length(plus_cor) != 0) {
-which_projection <- if (projection == 1) "t1" else "o1"
+cex <- 0.65
-which_loading <- if (projection == 1) "parallel" else "orthogonal"
+if (projection == 1) {
-if (projection == 1) { # predictor-projection
+# predictor-projection
-vipcp <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
+vipcp <- pmax(0, pmin(1, (vip4p - 0.83) / (1.21 - 0.83)))
-if (!cplot_x) { # S-plot predictor-projection
+if (!cplot_x) {
-my_xlab <- "relative covariance(feature,t1)"
+# S-plot predictor-projection
-my_x <- plus_cov
+my_xlab <- "covariance(feature,t1)"
-my_ylab <- "correlation(feature,t1)"
+my_x <- plus_cov
-my_y <- plus_cor
-} else { # C-plot predictor-projection
-my_xlab <- "variable importance in predictor-projection"
-my_x <- vip4p
-if (cplot_y_correlation) {
 my_ylab <- "correlation(feature,t1)"
 my_y <- plus_cor
+# X,Y limits for S-PLOT
+my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3))
+my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
 } else {
-my_ylab <- "relative covariance(feature,t1)"
+# C-plot predictor-projection
-my_y <- plus_cov
+my_xlab <- "variable importance in predictor-projection"
+my_x <- vip4p
+if (cplot_y_correlation) {
+my_ylab <- "correlation(feature,t1)"
+my_y <- plus_cor
+my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
+} else {
+my_ylab <- "covariance(feature,t1)"
+my_y <- plus_cov
+my_ylim <- max(abs(plus_cov))
+my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) )
+}
+# X,Y limits for C-plot
+lim_x <- max(my_x, na.rm = TRUE) * 1.1
+lim_x <- min(lim_x, default_lim_x)
+my_xlim <- c( 0, lim_x ) # + off(0.2) )
 }
-}
+my_load_distal <- loadp
-if (cplot_x) {
+my_load_proximal <- loado
-lim_x <- max(my_x, na.rm = TRUE) * 1.1
+red  <- as.numeric(correlation > 0) * vipcp
-my_xlim <- c( 0, lim_x + off(0.2) )
+blue <- as.numeric(correlation < 0) * vipcp
+alpha <- 0.1 + 0.4 * vipcp
+red[is.na(red)] <- 0
+blue[is.na(blue)] <- 0
+alpha[is.na(alpha)] <- 0
+my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha)
+main_label <- sprintf("%s for level %s versus %s"
+, x_prefix, fctr_lvl_1, fctr_lvl_2)
 } else {
-my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
+# orthogonal projection
-}
+vipco <- pmax(0, pmin(1, (vip4o - 0.83) / (1.21 - 0.83)))
-my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
+if (!cplot_x) {
-my_load_distal <- loadp
+# S-PLOT orthogonal-projection
-my_load_proximal <- loado
+my_xlab <- "covariance(feature,to1)"
-red  <- as.numeric(correlation > 0) * vipcp
+my_x <- -plus_cov
-blue <- as.numeric(correlation < 0) * vipcp
+# X,Y limits for S-PLOT
-alpha <- 0.1 + 0.4 * vipcp
+my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3))
-red[is.na(red)] <- 0
-blue[is.na(blue)] <- 0
-alpha[is.na(alpha)] <- 0
-my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha)
-main_label <- sprintf("%s for level %s versus %s"
-, x_prefix, fctr_lvl_1, fctr_lvl_2)
-} else { # orthogonal projection
-vipco <- pmax(0, pmin(1,(vip4o-0.83)/(1.21-0.83)))
-if (!cplot_x) {
-my_xlab <- "relative covariance(feature,to1)"
-my_x <- -plus_cov
-} else {
-my_xlab <- "variable importance in orthogonal projection"
-my_x <- vip4o
-}
-if (!cplot_x) { # S-plot orthogonal projection
-my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
-my_ylab <- "correlation(feature,to1)"
-my_y <- plus_cor
-} else { # C-plot orthogonal projection
-lim_x <- max(my_x, na.rm = TRUE) * 1.1
-my_xlim <- c( 0, lim_x + off(0.2) )
-if (cplot_y_correlation) {
 my_ylab <- "correlation(feature,to1)"
 my_y <- plus_cor
+my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
 } else {
-my_ylab <- "relative covariance(feature,to1)"
+# C-plot orthogonal-projection
-my_y <- plus_cov
+my_xlab <- "variable importance in orthogonal projection"
+my_x <- vip4o
+# C-plot orthogonal projection
+# X,Y limits for C-plot
+lim_x <- max(my_x, na.rm = TRUE) * 1.1
+my_xlim <- c( 0, lim_x ) # + off(0.2) )
+if (cplot_y_correlation) {
+my_ylab <- "correlation(feature,to1)"
+my_y <- plus_cor
+my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
+} else {
+my_ylab <- "covariance(feature,to1)"
+my_y <- plus_cov
+my_ylim <- max(abs(plus_cov))
+my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) )
+}
 }
+my_load_distal <- loado
+my_load_proximal <- loadp
+alpha <- 0.1 + 0.4 * vipco
+alpha[is.na(alpha)] <- 0
+my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha)
+main_label <- sprintf(
+"Features influencing orthogonal projection for %s versus %s"
+, fctr_lvl_1, fctr_lvl_2)
 }
-my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
+main_cex <- min(1.0, 46.0/nchar(main_label))
-my_load_distal <- loado
+my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
-my_load_proximal <- loadp
+my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
-alpha <- 0.1 + 0.4 * vipco
+if ( sum(is.infinite(my_xlim)) == 0 ) {
-alpha[is.na(alpha)] <- 0
+plot(
-my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha)
+y = my_y
-main_label <- sprintf(
+, x = my_x
-"Features influencing orthogonal projection for %s versus %s"
+, type = "p"
-, fctr_lvl_1, fctr_lvl_2)
+, xlim = my_xlim
-}
+, ylim = my_ylim
-main_cex <- min(1.0, 46.0/nchar(main_label))
+, xlab = my_xlab
-my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
+, ylab = my_ylab
-my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
+, main = main_label
-plot(
+, cex.main = main_cex
-y = my_y
+, cex = cex
-, x = my_x
+, pch = my_pch
-, type = "p"
+, col = my_col
-, xlim = my_xlim
-, ylim = my_ylim
-, xlab = my_xlab
-, ylab = my_ylab
-, main = main_label
-, cex.main = main_cex
-, cex = cex
-, pch = my_pch
-, col = my_col
-)
-low_x <- -0.7 * lim_x
-high_x <- 0.7 * lim_x
-if (projection == 1 && !cplot_x) {
-text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
-text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
-}
-if ( x_show_labels != "0" ) {
-names(my_load_distal) <- tsv1$featureID
-names(my_load_proximal) <- tsv1$featureID
-if ( x_show_labels == "ALL" ) {
-n_labels <- length(my_load_distal)
-} else {
-n_labels <- as.numeric(x_show_labels)
-}
-n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 )
-labels_to_show <- c(
-names(head(sort(my_load_distal),n = n_labels))
-, names(tail(sort(my_load_distal),n = n_labels))
-)
-labels <- unname(
-sapply(
-X = tsv1$featureID
-, FUN = function(x) if( x %in% labels_to_show ) x else ""
 )
-)
+low_x <- -0.7 * lim_x
-x_text_offset <- 0.024
+high_x <- 0.7 * lim_x
-y_text_off <- 0.017
+if (projection == 1 && !cplot_x) {
-if (!cplot_x) { # S-plot
+text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
-y_text_offset <- if (projection == 1) -y_text_off else y_text_off
+text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
-} else { # C-plot
+}
-y_text_offset <-
+if ( x_show_labels != "0" ) {
-sapply(
+names(my_load_distal) <- tsv1$featureID
-X = (my_y > 0)
+names(my_load_proximal) <- tsv1$featureID
-, FUN = function(x) { if (x) y_text_off else -y_text_off }
+if ( x_show_labels == "ALL" ) {
+n_labels <- length(my_load_distal)
+} else {
+n_labels <- as.numeric(x_show_labels)
+}
+n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 )
+labels_to_show <- c(
+names(head(sort(my_load_distal), n = n_labels))
+, names(tail(sort(my_load_distal), n = n_labels))
 )
-}
+labels <- unname(
-label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
+sapply(
-if (length(labels_arg) > 0) {
+X = tsv1$featureID
-unique_slant <- unique(slant_arg)
+, FUN = function(x) if ( x %in% labels_to_show ) x else ""
-if (length(unique_slant) == 1) {
-text(
-y = y_arg
-, x = x_arg + x_text_offset
-, cex = 0.4
-, labels = labels_arg
-, col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
-, srt = slant_arg
-, adj = 0   # left-justified
 )
+)
+x_text_offset <- 0.024
+y_text_off <- 0.017
+if (!cplot_x) {
+# S-plot
+y_text_offset <- if (projection == 1) -y_text_off else y_text_off
 } else {
-for (slant in unique_slant) {
+# C-plot
-text(
+y_text_offset <-
-y = y_arg[slant_arg == slant]
+sapply(
-, x = x_arg[slant_arg == slant] + x_text_offset
+X = (my_y > 0)
-, cex = 0.4
+, FUN = function(x) {
-, labels = labels_arg[slant_arg == slant]
+if (x) y_text_off else -y_text_off
-, col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+}
-, srt = slant
+)
-, adj = 0   # left-justified
+}
+label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
+if (length(labels_arg) > 0) {
+unique_slant <- unique(slant_arg)
+if (length(unique_slant) == 1) {
+text(
+y = y_arg
+, x = x_arg + x_text_offset
+, cex = 0.4
+, labels = labels_arg
+, col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+, srt = slant_arg
+, adj = 0   # left-justified
+)
+} else {
+for (slant in unique_slant) {
+text(
+y = y_arg[slant_arg == slant]
+, x = x_arg[slant_arg == slant] + x_text_offset
+, cex = 0.4
+, labels = labels_arg[slant_arg == slant]
+, col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+, srt = slant
+, adj = 0   # left-justified
+)
+}
+}
+}
+}
+if (!cplot_x) {
+my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant
+} else {
+my_slant <- sapply(
+X = (my_y > 0)
+, FUN = function(x) if (x) 2 else -2
+) * my_feature_label_slant
+}
+if (length(my_x) > 1) {
+label_features(
+x_arg      = my_x  [my_x > 0]
+, y_arg      = my_y  [my_x > 0] - y_text_offset
+, labels_arg = labels[my_x > 0]
+, slant_arg = (if (!cplot_x) -my_slant else (my_slant))
+)
+if (!cplot_x) {
+label_features(
+x_arg      = my_x  [my_x < 0]
+, y_arg      = my_y  [my_x < 0] + y_text_offset
+, labels_arg = labels[my_x < 0]
+, slant_arg = my_slant
 )
 }
-}
+} else {
-}
+if (!cplot_x) {
-}
+my_slant <- (if (my_x > 1) -1 else 1) * my_slant
-if (!cplot_x) {
+my_y_arg <- my_y + (if (my_x > 1) -1 else 1) * y_text_offset
-my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant
+} else {
+my_slant <- (if (my_y > 1) -1 else 1) * my_slant
+my_y_arg <- my_y + (if (my_y > 1) -1 else 1) * y_text_offset
+}
+label_features(
+x_arg = my_x
+, y_arg = my_y_arg
+, labels_arg = labels
+, slant_arg = my_slant
+)
+} # end if (length(my_x) > 1)
+} # end if ( x_show_labels != "0" )
 } else {
-my_slant <- sapply(
+plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
-X = (my_y > 0)
+text(x=1, y=1, labels="no S-plot is possible")
-, FUN = function(x) if (x) 2 else -2
+} # end if (sum(is.infinte(my_xlim))==0)
-) * my_feature_label_slant
+} else {
-}
+plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
-if (length(my_x) > 1) {
+text(x=1, y=1, labels="no S-plot is possible")
-label_features(
+} # end if (length(plus_cor) != 0 && length(plus_cor) != 0)
-x_arg      = my_x  [my_x > 0]
+} # end action
-, y_arg      = my_y  [my_x > 0] - y_text_offset
+) # end with( my_cor_vs_cov, action )
-, labels_arg = labels[my_x > 0]
-, slant_arg = (if (!cplot_x) -my_slant else (my_slant))
-)
-if (!cplot_x) {
-label_features(
-x_arg      = my_x  [my_x < 0]
-, y_arg      = my_y  [my_x < 0] + y_text_offset
-, labels_arg = labels[my_x < 0]
-, slant_arg = my_slant
-)
-}
-} else {
-if (!cplot_x) {
-my_slant <- (if (my_x > 1) -1 else 1) * my_slant
-my_y_arg = my_y + (if (my_x > 1) -1 else 1) * y_text_offset
-} else {
-my_slant <- (if (my_y > 1) -1 else 1) * my_slant
-my_y_arg = my_y + (if (my_y > 1) -1 else 1) * y_text_offset
-}
-label_features(
-x_arg = my_x
-, y_arg = my_y_arg
-, labels_arg = labels
-, slant_arg = my_slant
-)
-}
-}
-}
-)
 return (my_cor_vs_cov)
-}
+} # end function do_s_plot
 my_cor_vs_cov <- do_s_plot(
 x_env = x_env
 , predictor_projection_x = TRUE
 , cplot_x = FALSE
 )
-typeVc <- c("correlation",      # 1
+typevc <- c("correlation",      # 1
 "outlier",          # 2
 "overview",         # 3
 "permutation",      # 4
 "predict-train",    # 5
 "predict-test",     # 6
 "x-variance",       # 10
 "xy-score",         # 11
 "xy-weight"         # 12
 )                    # [c(3,8,9)] # [c(4,3,8,9)]
 if ( length(my_oplsda@orthoVipVn) > 0 ) {
-my_typevc <- typeVc[c(9,3,8)]
+my_typevc <- typevc[c(9,3,8)]
 } else {
 my_typevc <- c("(dummy)","overview","(dummy)")
 }
 my_ortho_cor_vs_cov_exists <- FALSE
 for (my_type in my_typevc) {
-if (my_type %in% typeVc) {
+if (my_type %in% typevc) {
 tryCatch({
 if ( my_type != "x-loading" ) {
 plot(
 x            = my_oplsda
 , typeVc       = my_type
 , parCexN      = 0.4
 , parDevNewL   = FALSE
 , parLayL      = TRUE
 , parEllipsesL = TRUE
 )
 if (my_type == "overview") {
 sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2)
 title(sub = sub_label)
 }
 } else {
 my_ortho_cor_vs_cov <- do_s_plot(
 x_env = x_env
 , predictor_projection_x = FALSE
 , cplot_x = FALSE
 )
 my_ortho_cor_vs_cov_exists <- TRUE
+}
 }
-}, error = function(e) {
+, error = function(e) {
 x_progress(
 sprintf(
 "factor level %s or %s may have only one sample - %s"
 , fctr_lvl_1
 , fctr_lvl_2
 }
 cplot_p <- x_env$cplot_p
 cplot_o <- x_env$cplot_o
 if (cplot_p || cplot_o) {
 if (cplot_p) {
-do_s_plot(
+if (!is.null(my_cor_vs_cov)) {
-x_env = x_env
+do_s_plot(
-, predictor_projection_x = TRUE
+x_env = x_env
-, cplot_x = TRUE
+, predictor_projection_x = TRUE
-, cor_vs_cov_x = my_cor_vs_cov
+, cplot_x = TRUE
-)
+, cor_vs_cov_x = my_cor_vs_cov
+)
+} else {
+plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
+text(x=1, y=1, labels="no predictor projection is possible")
+}
 did_plots <- 1
 } else {
 did_plots <- 0
 }
 if (cplot_o) {
 return ( FALSE )
 }
 # extract parameters from the environment
 vrbl_metadata <- calc_env$vrbl_metadata
-vrbl_metadata_names <- vrbl_metadata[,1]
+vrbl_metadata_names <- vrbl_metadata[, 1]
 smpl_metadata <- calc_env$smpl_metadata
 data_matrix <- calc_env$data_matrix
-pairSigFeatOnly <- calc_env$pairSigFeatOnly
+pair_significant_features_only <- calc_env$pairSigFeatOnly
 facC <- calc_env$facC
 tesC <- calc_env$tesC
 # extract the levels from the environment
 originalLevCSV <- levCSV <- calc_env$levCSV
 # matchingC is one of { "none", "wildcard", "regex" }
 rt             <- vrbl_metadata$rt
 names(rt)      <- vrbl_metadata$variableMetadata
 rt_lookup      <- function(feature) unname(rt[feature])
-# calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv)
+# calculate salience_df as data.frame(feature, max_level, max_median, salience_rcv, mean_median, salience, salience_rcv)
 salience_df <- calc_env$salience_df <- w4msalience(
 data_matrix    = data_matrix
 , sample_class   = smpl_metadata[,facC]
 , failure_action = failure_action
 )
 salience_tsv_action({
-my_df <- data.frame(
+with (
-featureID    = salience_df$feature
+salience_df
-, salientLevel = salience_df$max_level
+, {
-, salientRCV   = salience_df$salient_rcv
+my_df <<- data.frame(
-, salience     = salience_df$salience
+featureID       = feature
-, mz           = mz_lookup(salience_df$feature)
+, salientLevel    = max_level
-, rt           = rt_lookup(salience_df$feature)
+, salienceRCV     = salience_rcv
-)
+, relativeSalientDistance = relative_salient_distance
-my_df[order(-my_df$salience),]
+, salience        = salience
+, mz              = mz_lookup(feature)
+, rt              = rt_lookup(feature)
+)
+})
+my_df[order(-my_df$relativeSalientDistance),]
 })
 # transform wildcards to regexen
 if (matchingC == "wildcard") {
 # strsplit(x = "hello,wild,world", split = ",")
 )
 my_cor_cov <- do_detail_plot(
 x_dataMatrix  = my_matrix
 , x_predictor   = predictor
 , x_is_match    = TRUE
-, x_algorithm   = algoC
+, x_algorithm   = "nipals"
-, x_prefix      = if (pairSigFeatOnly) {
+, x_prefix      = if (pair_significant_features_only) {
 "Significantly contrasting features"
 } else {
 "Significant features"
 }
 , x_show_labels = labelFeatures
 } else {
 1
 }
 )
 col_selector <- vrbl_metadata_names[
-if ( pairSigFeatOnly ) fully_significant else overall_significant
+if (pair_significant_features_only) fully_significant else overall_significant
 ]
 my_matrix <- tdm[ chosen_samples, col_selector, drop = FALSE ]
 my_cor_cov <- do_detail_plot(
 x_dataMatrix  = my_matrix
 , x_predictor   = predictor
 , x_is_match    = is_match
-, x_algorithm   = algoC
+, x_algorithm   = "nipals"
-, x_prefix      = if (pairSigFeatOnly) {
+, x_prefix      = if (pair_significant_features_only) {
 "Significantly contrasting features"
 } else {
 "Significant features"
 }
 , x_show_labels = labelFeatures
 )
 }
 }
 }
 }
-} else { # tesC == "none"
+} else {
+# tesC == "none"
 # find all the levels for factor facC in sampleMetadata
 level_union <- unique(sort(smpl_metadata_facC))
 # identify the selected levels for factor facC from sampleMetadata
 level_include <- sapply(X = level_union, FUN = isLevelSelected)
 # discard the non-selected levels for factor facC
 , FUN = function(x) {
 fctr_lvl_1 <- x[1]
 fctr_lvl_2 <- {
 if ( fctr_lvl_1 %in% completed )
 return("DUMMY")
-# strF(completed)
 completed <<- c(completed, fctr_lvl_1)
 setdiff(level_union, fctr_lvl_1)
 }
 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
 fctr_lvl_2 <- "other"
 )
 my_cor_cov <- do_detail_plot(
 x_dataMatrix  = my_matrix
 , x_predictor   = predictor
 , x_is_match    = is_match
-, x_algorithm   = algoC
+, x_algorithm   = "nipals"
 , x_prefix      = "Features"
 , x_show_labels = labelFeatures
 , x_progress    = progress_action
 , x_crossval_i  = min(7, length(chosen_samples))
 , x_env         = calc_env
 )
 my_cor_cov <- do_detail_plot(
 x_dataMatrix  = my_matrix
 , x_predictor   = predictor
 , x_is_match    = is_match
-, x_algorithm   = algoC
+, x_algorithm   = "nipals"
 , x_prefix      = "Features"
 , x_show_labels = labelFeatures
 , x_progress    = progress_action
 , x_crossval_i  = min(7, length(chosen_samples))
 , x_env         = calc_env
 }
 }
 if (!did_plot) {
 failure_action(
 sprintf(
-"bad parameter!  sampleMetadata must have at least two levels of factor '%s' matching '%s'"
+"Did not plot. Does sampleMetadata have at least two levels of factor '%s' matching '%s' ?"
 , facC, originalLevCSV))
 return ( FALSE )
 }
 return ( TRUE )
 }
 cor_vs_cov <- function(
 matrix_x
 , ropls_x
 , predictor_projection_x = TRUE
 , x_progress = print
+, x_env
 ) {
 tryCatch({
 return(
-cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress)
+cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress, x_env)
 )
-}, error = function(e) {
+}
+, error = function(e) {
 x_progress(
 sprintf(
 "cor_vs_cov fatal error - %s"
 , as.character(e) # e$message
 )
 cor_vs_cov_try <- function(
 matrix_x                      # rows are samples; columns, features
 , ropls_x                       # an instance of ropls::opls
 , predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection
 , x_progress = print            # function to produce progress and error messages
+, x_env
 ) {
+my_matrix_x <- matrix_x
+my_matrix_x[my_matrix_x==0] <- NA
+fdr_features <- x_env$fdr_features
 x_class <- class(ropls_x)
 if ( !( as.character(x_class) == "opls" ) ) {
 stop(
 paste(
 "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class "
 result <- list()
 result$projection <- projection <- if (predictor_projection_x) 1 else 2
 # I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
 #   (and not from the supplement despite the statement that, for the NIPALS algorithm,
 #   the equations from the supplement should be used) because of the definition of the
 #   Pearson/Galton coefficient of correlation is defined as
 #   $$
 #      \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y}
 #   $$
 #   as described (among other places) on Wikipedia at
 #     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population
 # The equations in the supplement said to use, for the predictive component t1,
 #      \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))}
 # but the results that I got were dramatically different from published results for S-PLOTs;
 # perhaps my data are not centered exactly the same way that theirs were.
 # The correlations calculated here are in agreement with those calculated with the code from
 #   page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf
-# I did transform covariance to "relative covariance" (relative to the maximum value)
-#   to keep the figures consistent with one another.
+# count the features/variables (one column for each sample)
-# count the features (one column for each sample)
+# count the features/variables (one column for each sample)
-Nfeatures <- ncol(matrix_x)
+n_features <- ncol(my_matrix_x)
-# count the samples (one row for each sample)
+all_n_features <- x_env$fdr_features
-Nobservations <- nrow(matrix_x)
+if (length(grep("^[0-9][0-9]*$", all_n_features)) > 0) {
-# a one-dimensional magnitude function (i.e., take the vector norm)
+all_n_features <- as.integer(all_n_features)
-vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
+} else {
-# calculate the standard deviation for each feature
+all_n_features <- n_features
-sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x]))
+}
+fdr_n_features <- max(n_features, all_n_features)
+# print("n_features")
+# print(n_features)
+# count the samples/observations (one row for each sample)
+n_observations <- nrow(my_matrix_x)
 # choose whether to plot the predictive score vector or orthogonal score vector
 if (predictor_projection_x)
-score_matrix <- ropls_x@scoreMN
+score_vector <- ropls_x@scoreMN
 else
-score_matrix <- ropls_x@orthoScoreMN
+score_vector <- ropls_x@orthoScoreMN
-# transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation
-score_matrix_transposed <- t(score_matrix)
+# compute the covariance of each feature with the score vector
-# compute the norm of the vector (i.e., the magnitude)
-score_matrix_magnitude  <- vector_norm(score_matrix)
-# compute the standard deviation of the vector
-score_matrix_sd         <- sd(score_matrix)
-# compute the relative covariance of each feature with the score vector
 result$covariance <-
-score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
+sapply(
+X = 1:n_features
+, FUN = function(i) {
+cov(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs")
+}
+)
+# access covariance by feature name
+names(result$covariance) <- colnames(my_matrix_x)
 # compute the correlation of each feature with the score vector
 result$correlation <-
-score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) )
+sapply(
+X = 1:n_features
-# convert covariance and correlation from one-dimensional matrices to arrays of values,
+, FUN = function(i) {
-#   which are accessed by feature name below
+cor(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs")
-p1     <- result$covariance  <- result$covariance [ 1, , drop = TRUE ]
+}
-# x_progress("strF(p1)")
+)
-# x_progress(strF(p1))
+# access correlation by feature name
+names(result$correlation) <- colnames(my_matrix_x)
-pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ]
-# x_progress("pearson strF(pcorr1)")
+# eliminate NAs in either correlation or covariance
-# x_progress(strF(pcorr1))
+nas <- is.na(result$correlation) | is.na(result$covariance)
-# x_progress(typeof(pcorr1))
+featureID          <- names(ropls_x@vipVn)
-# x_progress(str(pcorr1))
+featureID          <- featureID[!nas]
+result$correlation <- result$correlation[!nas]
-# # this is how to use Spearman correlation instead of pearson
+result$covariance  <- result$covariance[!nas]
-# result$spearcor <- sapply(
+n_features <- length(featureID)
-#   X = 1:Nfeatures
-# , FUN = function(i) {
-#     stats::cor(
-#       x = as.vector(score_matrix)
-#     , y = as.vector(matrix_x[,i])
-#     # , method = "spearman"
-#     , method = "pearson"
-#     )
-#   }
-# )
-# names(result$spearcor) <- names(p1)
-# pcorr1 <- result$spearcor
-# x_progress("spearman strF(pcorr1)")
-# x_progress(strF(pcorr1))
-# x_progress(typeof(pcorr1))
-# x_progress(str(pcorr1))
-# pcorr1 <- result$correlation <- result$spearcor
-# correl.ci(r, n, a = 0.05, rho = 0)
 correl_pci <- lapply(
-X = 1:Nfeatures
+X = 1:n_features
-, FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations)
+, FUN = function(i) {
+correl.ci(
+r = result$correlation[i]
+, n_obs = n_observations
+, n_vars = fdr_n_features
+)
+}
 )
 result$p_value_raw <- sapply(
-X = 1:Nfeatures
+X = 1:n_features
+, FUN = function(i) correl_pci[[i]]$p.value.raw
+)
+result$p_value_raw[is.na(result$p_value_raw)] <- 1.0
+result$p_value <- sapply(
+X = 1:n_features
 , FUN = function(i) correl_pci[[i]]$p.value
 )
-result$p_value_raw[is.na(result$p_value_raw)] <- 0.0
+result$p_value[is.na(result$p_value)] <- 1.0
 result$ci_lower <- sapply(
-X = 1:Nfeatures
+X = 1:n_features
-, FUN = function(i) correl_pci[[i]]$CI['lower']
+, FUN = function(i) correl_pci[[i]]$CI["lower"]
 )
 result$ci_upper <- sapply(
-X = 1:Nfeatures
+X = 1:n_features
-, FUN = function(i) correl_pci[[i]]$CI['upper']
+, FUN = function(i) correl_pci[[i]]$CI["upper"]
 )
 # extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627)
 #    Length = number of features; labels = feature identifiers.  (The same is true for $correlation and $covariance.)
-result$vip4p     <- as.numeric(ropls_x@vipVn)
+result$vip4p     <- as.numeric(ropls_x@vipVn)[!nas]
-result$vip4o     <- as.numeric(ropls_x@orthoVipVn)
+result$vip4o     <- as.numeric(ropls_x@orthoVipVn)[!nas]
 if (length(result$vip4o) == 0) result$vip4o <- NA
 # extract the loadings
-result$loadp     <- as.numeric(ropls_x@loadingMN)
+result$loadp     <- as.numeric(ropls_x@loadingMN)[!nas]
-result$loado     <- as.numeric(ropls_x@orthoLoadingMN)
+result$loado     <- as.numeric(ropls_x@orthoLoadingMN)[!nas]
 # get the level names
 level_names      <- sort(levels(as.factor(ropls_x@suppLs$y)))
 fctr_lvl_1       <- level_names[1]
 fctr_lvl_2       <- level_names[2]
-feature_count    <- length(ropls_x@vipVn)
+result$level1    <- rep.int(x = fctr_lvl_1, times = n_features)
-result$level1    <- rep.int(x = fctr_lvl_1, times = feature_count)
+result$level2    <- rep.int(x = fctr_lvl_2, times = n_features)
-result$level2    <- rep.int(x = fctr_lvl_2, times = feature_count)
 greaterLevel <- sapply(
 X = result$correlation
-, FUN = function(my_corr)
+, FUN = function(my_corr) {
 tryCatch({
-if ( is.nan( my_corr ) ) {
+if ( is.na(my_corr) || ! is.numeric( my_corr ) ) {
 NA
 } else {
 if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2
 }
-}, error = function(e) {
+}
+, error = function(e) {
+print(my_corr)
 x_progress(
 sprintf(
 "cor_vs_cov -> sapply:  error - substituting NA - %s"
 , as.character(e)
 )
 )
 NA
-})
+}
+)
+}
 )
-# begin fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
-featureID          <- names(ropls_x@vipVn)
-greaterLevel       <- greaterLevel[featureID]
-result$correlation <- result$correlation[featureID]
-result$covariance  <- result$covariance[featureID]
-# end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
 # build a data frame to hold the content for the tab-separated values file
 tsv1 <- data.frame(
 featureID     = featureID
 , factorLevel1  = result$level1
 , vip4p         = result$vip4p
 , vip4o         = result$vip4o
 , loadp         = result$loadp
 , loado         = result$loado
 , cor_p_val_raw = result$p_value_raw
-, cor_p_value   = p.adjust(p = result$p_value_raw, method = "BY")
+, cor_p_value   = result$p_value
 , cor_ci_lower  = result$ci_lower
 , cor_ci_upper  = result$ci_upper
 )
 rownames(tsv1) <- tsv1$featureID
 # build the superresult, i.e., the result returned by this function
 # remove any rows having NA for covariance or correlation
 tsv1 <- tsv1[!is.na(tsv1$correlation),]
 tsv1 <- tsv1[!is.na(tsv1$covariance),]
 superresult$tsv1 <- tsv1
 # # I did not include these but left them commentd out in case future
 # #   consumers of this routine want to use it in currently unanticipated ways
 # result$superresult <- superresult
 # result$oplsda    <- ropls_x
 # result$predictor <- ropls_x@suppLs$y
 #     title = {Multivariate data analysis in R}
 #   }
 # which follows
 #   https://en.wikipedia.org/wiki/Fisher_transformation#Definition
-correl.ci <- function(r, n, a = 0.05, rho = 0) {
+correl.ci <- function(r, n_obs, n_vars, a = 0.05, rho = 0) {
-## r is the calculated correlation coefficient for n pairs
+## r is the calculated correlation coefficient for n_obs pairs of observations of one variable
 ## a is the significance level
 ## rho is the hypothesised correlation
 zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho
 zh1 <- atanh(r)   # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1
-se <- (1 - r^2)/sqrt(n - 3) ## standard error for Fisher's z-transformation of Ho
+se <- (1 - r^2)/sqrt(n_obs - 3) ## standard error for Fisher's z-transformation of Ho
 test <- (zh1 - zh0)/se ### test statistic
 pvalue <- 2*(1 - pnorm(abs(test))) ## p-value
-zL <- zh1 - qnorm(1 - a/2)*se
+pvalue.adj <- p.adjust(p = pvalue, method = "BY", n = n_vars)
-zH <- zh1 + qnorm(1 - a/2)*se
+z_L <- zh1 - qnorm(1 - a/2)*se
-fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit
+z_h <- zh1 + qnorm(1 - a/2)*se
-fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit
+fish_l <- tanh(z_L) # (exp(2*z_l)-1)/(exp(2*z_l)+1), i.e., lower confidence limit
-CI <- c(fishL, fishH)
+fish_h <- tanh(z_h) # (exp(2*z_h)-1)/(exp(2*z_h)+1), i.e., upper confidence limit
-names(CI) <- c('lower', 'upper')
+ci <- c(fish_l, fish_h)
-list(correlation = r, p.value = pvalue, CI = CI)
+names(ci) <- c("lower", "upper")
+list(
+correlation = r
+, p.value.raw = pvalue
+, p.value = pvalue.adj
+, CI = ci
+)
 }
-# vim: sw=2 ts=2 et :
+# vim: sw=2 ts=2 et ai :

Mercurial > repos > eschen42 > w4mcorcov

comparison w4mcorcov_calc.R @ 13:2ae2d26e3270 draft