diff w4mcorcov_calc.R @ 5:50f60f94c034 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit aff1790e25523d038a1e9528de748191c096132f
author eschen42
date Fri, 30 Mar 2018 14:59:19 -0400
parents 8bba31f628da
children 7bd523ca1f9a
line wrap: on
line diff
--- a/w4mcorcov_calc.R	Sun Mar 04 14:51:42 2018 -0500
+++ b/w4mcorcov_calc.R	Fri Mar 30 14:59:19 2018 -0400
@@ -7,7 +7,7 @@
 #### OPLS-DA
 algoC <- "nipals"
 
-do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_show_loado_labels, x_progress = print, x_env, x_crossval_i) {
+do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env, x_crossval_i) {
   off <- function(x) if (x_show_labels == "0") 0 else x
   if ( x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1 && x_crossval_i < nrow(x_dataMatrix) ) {
     my_oplsda <- opls(
@@ -23,83 +23,140 @@
     my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
     fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
     fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
-    my_cor_vs_cov <- cor_vs_cov(
-        matrix_x = x_dataMatrix
-      , ropls_x  = my_oplsda
-      )
-    with(
-      my_cor_vs_cov
-    , {
-        min_x <- min(covariance)
-        max_x <- max(covariance)
-        lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
-        covariance <- covariance / lim_x
-        lim_x <- 1.2
-        main_label <- sprintf("%s for level %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
-        main_cex <- min(1.0, 46.0/nchar(main_label))
-        # "It is generally accepted that a variable should be selected if vj>1, [27–29],
-        #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
-        #   (Mehmood 2012 doi:10.1186/1748-7188-6-27)
-        vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
-        alpha <- 0.1 + 0.4 * vipco
-        red  <- as.numeric(correlation > 0) * vipco
-        blue <- as.numeric(correlation < 0) * vipco
-        plus_cor <- correlation
-        plus_cov <- covariance
-        cex <- 0.75
-        plot(
-          y = plus_cor
-        , x = plus_cov
-        , type="p"
-        , xlim=c( -lim_x - off(0.2), lim_x + off(0.2) )
-        , ylim=c( -1.0   - off(0.2), 1.0   + off(0.2) )
-        , xlab = sprintf("relative covariance(feature,t1)")
-        , ylab = sprintf("correlation(feature,t1)")
-        , main = main_label
-        , cex.main = main_cex
-        , cex = cex
-        , pch = 16
-        , col = rgb(blue = blue, red = red, green = 0, alpha = alpha)
+    do_s_plot <- function(parallel_x) {
+      my_cor_vs_cov <- cor_vs_cov(
+          matrix_x   = x_dataMatrix
+        , ropls_x    = my_oplsda
+        , parallel_x = parallel_x
         )
-        low_x <- -0.7 * lim_x
-        high_x <- 0.7 * lim_x
-        text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
-        text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
-        if ( x_show_labels != "0" ) {
-          my_loadp <- loadp
-          my_loado <- loado
-          names(my_loadp) <- tsv1$featureID
-          names(my_loado) <- tsv1$featureID
-          if ( x_show_labels == "ALL" ) {
-            n_labels <- length(loadp)
+      with(
+        my_cor_vs_cov
+      , {
+          min_x <- min(covariance)
+          max_x <- max(covariance)
+          lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
+          covariance <- covariance / lim_x
+          lim_x <- 1.2
+          # "It is generally accepted that a variable should be selected if vj>1, [27–29],
+          #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
+          #   (Mehmood 2012 doi:10.1186/1748-7188-6-27)
+          plus_cor <- correlation
+          plus_cov <- covariance
+          cex <- 0.75
+          which_projection <- if (projection == 1) "t1" else "o1"
+          which_loading <- if (projection == 1) "parallel" else "orthogonal"
+          if (projection == 1) {
+            my_xlab <- "relative covariance(feature,t1)"
+            my_x <- plus_cov
+            my_ylab <- "correlation(feature,t1) [~ parallel loading]"
+            my_y <- plus_cor
+            my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
+            my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
+            my_load_distal <- loadp
+            my_load_proximal <- loado
+            vipcp <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
+            red  <- as.numeric(correlation > 0) * vipcp
+            blue <- as.numeric(correlation < 0) * vipcp
+            alpha <- 0.1 + 0.4 * vipcp
+            my_col = rgb(blue = blue, red = red, green = 0, alpha = alpha)
+            main_label <- sprintf("%s for level %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
           } else {
-            n_labels <- as.numeric(x_show_labels)
+            my_xlab <- "relative covariance(feature,to1)"
+            my_x <- -plus_cov
+            my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
+            my_ylab <- "correlation(feature,to1) [~ orthogonal loading]"
+            my_y <- plus_cor
+            my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
+            my_load_distal <- loado
+            my_load_proximal <- loadp
+            vipco <- pmax(0, pmin(1,(vip4o-0.83)/(1.21-0.83)))
+            alpha <- 0.1 + 0.4 * vipco
+            my_col = rgb(blue = 0, red = 0, green = 0, alpha = alpha)
+            main_label <- sprintf("Features influencing orthogonal projection for level %s versus %s", fctr_lvl_1, fctr_lvl_2)
           }
-          n_labels <- min( n_labels, (1 + length(loadp)) / 2 )
-          labels_to_show <- c(
-            names(head(sort(my_loadp),n = n_labels))
-          , names(tail(sort(my_loadp),n = n_labels))
+          main_cex <- min(1.0, 46.0/nchar(main_label))
+          my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
+          plot(
+            y = my_y
+          , x = my_x
+          , type = "p"
+          , xlim = my_xlim
+          , ylim = my_ylim
+          , xlab = my_xlab
+          , ylab = my_ylab
+          , main = main_label
+          , cex.main = main_cex
+          , cex = cex
+          , pch = 16
+          , col = my_col
           )
-          if ( x_show_loado_labels ) {
-            labels_to_show <- c(
-              labels_to_show
-            , names(head(sort(my_loado),n = n_labels))
-            , names(tail(sort(my_loado),n = n_labels))
-            )
+          low_x <- -0.7 * lim_x
+          high_x <- 0.7 * lim_x
+          if (projection == 1) {
+            text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
+            text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
           }
-          labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" ))
-          text(
-            y = plus_cor - 0.013
-          , x = plus_cov + 0.020
-          , cex = 0.4
-          , labels = labels
-          , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha)
-          , srt = -30 # slant 30 degrees downward
-          , adj = 0   # left-justified
-          )
+          if ( x_show_labels != "0" ) {
+            names(my_load_distal) <- tsv1$featureID
+            names(my_load_proximal) <- tsv1$featureID
+            if ( x_show_labels == "ALL" ) {
+              n_labels <- length(my_load_distal)
+            } else {
+              n_labels <- as.numeric(x_show_labels)
+            }
+            n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 )
+            labels_to_show <- c(
+              names(head(sort(my_load_distal),n = n_labels))
+            , names(tail(sort(my_load_distal),n = n_labels))
+            )
+            labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" ))
+            x_text_offset <- 0.024
+            y_text_offset <- (if (projection == 1) 1 else -1) * -0.017
+            label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
+              print("str(x_arg)")
+              print(str(x_arg))
+              print("str(y_arg)")
+              print(str(y_arg))
+              print("str(labels_arg)")
+              print(str(labels_arg))
+              text(
+                y = y_arg
+              , x = x_arg + x_text_offset
+              , cex = 0.4
+              , labels = labels_arg
+              , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+              , srt = slant_arg
+              , adj = 0   # left-justified
+              )
+            }
+            my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant
+            if (length(my_x) > 1) {
+              label_features( 
+                x_arg      = my_x  [my_x > 0]
+              , y_arg      = my_y  [my_x > 0] - y_text_offset
+              , labels_arg = labels[my_x > 0]
+              , slant_arg = -my_slant
+              )
+              label_features( 
+                x_arg      = my_x  [my_x < 0]
+              , y_arg      = my_y  [my_x < 0] + y_text_offset
+              , labels_arg = labels[my_x < 0]
+              , slant_arg = my_slant
+              )
+            } else {
+              label_features( 
+                x_arg = my_x
+              , y_arg = my_y + (if (my_x > 1) -1 else 1) * y_text_offset
+              , labels_arg = labels
+              , slant_arg = (if (my_x > 1) -1 else 1) * my_slant
+              )
+            }
+          }
         }
-      }
-    )
+      )
+      return (my_cor_vs_cov)
+    }
+    my_cor_vs_cov <- do_s_plot( parallel_x = TRUE )
     typeVc <- c("correlation",      # 1
                 "outlier",          # 2
                 "overview",         # 3
@@ -120,28 +177,29 @@
     }
     for (my_type in my_typevc) {
       if (my_type %in% typeVc) {
-        # print(sprintf("plotting type %s", my_type))
         tryCatch({
-          plot(
-            x            = my_oplsda
-          , typeVc       = my_type
-          , parCexN      = 0.4
-          , parDevNewL   = FALSE
-          , parLayL      = TRUE
-          , parEllipsesL = TRUE
-        )
+          if ( my_type != "x-loading" ) {
+             plot(
+               x            = my_oplsda
+             , typeVc       = my_type
+             , parCexN      = 0.4
+             , parDevNewL   = FALSE
+             , parLayL      = TRUE
+             , parEllipsesL = TRUE
+             )
+          } else {
+            do_s_plot( parallel_x = FALSE )
+          }
         }, error = function(e) {
-          x_progress(sprintf("factor level %s or %s may have only one sample", fctr_lvl_1, fctr_lvl_2))
+          x_progress( sprintf( "factor level %s or %s may have only one sample - %s", fctr_lvl_1, fctr_lvl_2, e$message ) )
         })
       } else {
-        # print("plotting dummy graph")
         plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
         text(x=1, y=1, labels="no orthogonal projection is possible")
       }
     }
     return (my_cor_vs_cov)
   } else {
-    # x_progress(sprintf("x_is_match = %s, ncol(x_dataMatrix) = %d, length(unique(x_predictor)) = %d",x_is_match, ncol(x_dataMatrix), length(unique(x_predictor))))
     return (NULL)
   }
 }
@@ -174,7 +232,6 @@
   # matchingC is one of { "none", "wildcard", "regex" }
   matchingC <- calc_env$matchingC
   labelFeatures <- calc_env$labelFeatures
-  labelOrthoFeatures <- calc_env$labelOrthoFeatures
 
   # arg/env checking
   if (!(facC %in% names(smpl_metadata))) {
@@ -308,7 +365,6 @@
         , x_algorithm   = algoC
         , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
         , x_show_labels = labelFeatures
-        , x_show_loado_labels = labelOrthoFeatures
         , x_progress    = progress_action
         , x_crossval_i  = min(7, length(chosen_samples))
         , x_env         = calc_env
@@ -349,8 +405,6 @@
           # only process this column if both factors are members of lvlCSV
           is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2)
           progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
-          # TODO delete next line displaying currently-processed column
-          # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match))
           # choose only samples with one of the two factors for this column
           chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
           predictor <- smpl_metadata_facC[chosen_samples]
@@ -365,7 +419,6 @@
           , x_algorithm   = algoC
           , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
           , x_show_labels = labelFeatures
-          , x_show_loado_labels = labelOrthoFeatures
           , x_progress    = progress_action
           , x_crossval_i  = min(7, length(chosen_samples))
           , x_env         = calc_env
@@ -402,7 +455,6 @@
             }
             chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
             fctr_lvl_2 <- "other"
-            # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) )
             progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
             if (length(unique(chosen_samples)) < 1) {
               progress_action("NOTHING TO PLOT...")
@@ -419,7 +471,6 @@
               , x_algorithm   = algoC
               , x_prefix      = "Features"
               , x_show_labels = labelFeatures
-              , x_show_loado_labels = labelOrthoFeatures
               , x_progress    = progress_action
               , x_crossval_i  = min(7, length(chosen_samples))
               , x_env         = calc_env
@@ -434,7 +485,6 @@
                 did_plot <<- TRUE
               }
             }
-            #print("baz")
             "dummy" # need to return a value; otherwise combn fails with an error
           }
         )
@@ -448,7 +498,6 @@
           fctr_lvl_1 <- x[1]
           fctr_lvl_2 <- x[2]
           chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
-          # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) )
           progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
           if (length(unique(chosen_samples)) < 1) {
             progress_action("NOTHING TO PLOT...")
@@ -465,7 +514,6 @@
             , x_algorithm   = algoC
             , x_prefix      = "Features"
             , x_show_labels = labelFeatures
-            , x_show_loado_labels = labelOrthoFeatures
             , x_progress    = progress_action
             , x_crossval_i  = min(7, length(chosen_samples))
             , x_env         = calc_env
@@ -480,7 +528,6 @@
               did_plot <<- TRUE
             }
           }
-          #print("baz")
           "dummy" # need to return a value; otherwise combn fails with an error
         }
       )
@@ -500,18 +547,22 @@
 #     Wiklund_2008 doi:10.1021/ac0713510
 #     Galindo_Prieto_2014 doi:10.1002/cem.2627
 #     https://github.com/HegemanLab/extra_tools/blob/master/generic_PCA.R
-cor_vs_cov <- function(matrix_x, ropls_x) {
+cor_vs_cov <- function(matrix_x, ropls_x, parallel_x = TRUE) {
   x_class <- class(ropls_x)
   if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) ) 
     stop( "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class ", as.character(x_class) )
   }
   result <- list()
+  result$projection <- projection <- if (parallel_x) 1 else 2
   # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS
   if ( ropls_x@suppLs$algoC == "nipals") {
     # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510
     mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
     mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x]))
-    score_matrix <- ropls_x@scoreMN
+    if (parallel_x)
+       score_matrix <- ropls_x@scoreMN
+    else
+       score_matrix <- ropls_x@orthoScoreMN
     score_matrix_transposed <- t(score_matrix)
     score_matrix_magnitude <- mag(score_matrix)
     result$covariance <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
@@ -520,7 +571,10 @@
     # WARNING - untested code - I don't have test data to exercise this branch
     # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
     # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F
-    score_matrix <- ropls_x@scoreMN
+    if (parallel_x)
+       score_matrix <- ropls_x@scoreMN
+    else
+       score_matrix <- ropls_x@orthoScoreMN
     score_matrix_transposed <- t(score_matrix)
     cov_divisor <- nrow(matrix_x) - 1
     result$covariance <- sapply(
@@ -540,8 +594,8 @@
       }
     )
   }
-  result$correlation <- result$correlation[1,,drop = TRUE]
-  result$covariance  <- result$covariance[1,,drop = TRUE]
+  result$correlation <- result$correlation[ 1, , drop = TRUE ]
+  result$covariance  <- result$covariance [ 1, , drop = TRUE ]
 
   # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014
   #    Length = number of features; labels = feature identifiers.  (The same is true for $correlation and $covariance.)
@@ -556,7 +610,6 @@
   feature_count    <- length(ropls_x@vipVn)
   result$level1    <- rep.int(x = fctr_lvl_1, times = feature_count)
   result$level2    <- rep.int(x = fctr_lvl_2, times = feature_count)
-  # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation)))
   superresult <- list()
   if (length(result$vip4o) == 0) result$vip4o <- NA
   greaterLevel <- sapply( X = result$correlation, FUN = function(my_corr) if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 )
@@ -565,6 +618,7 @@
   , factorLevel1        = result$level1
   , factorLevel2        = result$level2
   , greaterLevel        = greaterLevel
+  , projection          = result$projection
   , correlation         = result$correlation
   , covariance          = result$covariance
   , vip4p               = result$vip4p
@@ -574,6 +628,7 @@
   , row.names           = NULL
   )
   rownames(superresult$tsv1) <- superresult$tsv1$featureID
+  superresult$projection <- result$projection
   superresult$covariance <- result$covariance
   superresult$correlation <- result$correlation
   superresult$vip4p <- result$vip4p
@@ -581,7 +636,6 @@
   superresult$loadp <- result$loadp
   superresult$loado <- result$loado
   superresult$details <- result
-  # #print(superresult$tsv1)
   result$superresult <- superresult
   # Include thise in case future consumers of this routine want to use it in currently unanticipated ways
   result$oplsda    <- ropls_x          
@@ -589,4 +643,4 @@
   return (superresult)
 }
 
-
+# vim: sw=2 ts=2 et :