diff w4mcorcov_calc.R @ 1:0c2ad44b6c9c draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 01d4a951cf09e7b88fcec96b8043bc7568cc5c92
author eschen42
date Sun, 22 Oct 2017 18:47:57 -0400
parents 23f9fad4edfc
children e03582f26617
line wrap: on
line diff
--- a/w4mcorcov_calc.R	Mon Oct 16 14:56:52 2017 -0400
+++ b/w4mcorcov_calc.R	Sun Oct 22 18:47:57 2017 -0400
@@ -8,11 +8,7 @@
 algoC <- "nipals"
 
 do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) {
-  off <- function(x) if (x_show_labels) x else 0
-  salience_lookup <- x_env$salience_lookup
-  salient_rcv_lookup <- x_env$salient_rcv_lookup
-  # x_progress("head(salience_df): ", head(salience_df))
-  # x_progress("head(salience): ", head(salience))
+  off <- function(x) if (x_show_labels == "0") x else 0
   if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) {
     my_oplsda <- opls(
         x      = x_dataMatrix
@@ -39,24 +35,20 @@
         covariance <- covariance / lim_x
         lim_x <- 1.2
         main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
-        # print("main_label")
-        # print(main_label)
         main_cex <- min(1.0, 46.0/nchar(main_label))
         # "It is generally accepted that a variable should be selected if vj>1, [27–29],
         #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
         #   (Mehmood 2012 doi:10.1186/1748-7188-6-27)
         vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
         alpha <- 0.1 + 0.4 * vipco
-        red  <- as.numeric(correlation < 0) * vipco
-        blue <- as.numeric(correlation > 0) * vipco
-        minus_cor <- -correlation
-        minus_cov <- -covariance
-        # cex <- salience_lookup(feature = names(minus_cor))
-        # cex <- 0.25 + (1.25 * cex / max(cex))
+        red  <- as.numeric(correlation > 0) * vipco
+        blue <- as.numeric(correlation < 0) * vipco
+        plus_cor <- correlation
+        plus_cov <- covariance
         cex <- 0.75
         plot(
-          y = minus_cor
-        , x = minus_cov
+          y = plus_cor
+        , x = plus_cov
         , type="p"
         , xlim=c(-lim_x, lim_x + off(0.1))
         , ylim=c(-1.0 - off(0.1), 1.0)
@@ -70,14 +62,31 @@
         )
         low_x <- -0.7 * lim_x
         high_x <- 0.7 * lim_x
-        text(x = low_x, y = -0.15, labels =  fctr_lvl_1)
-        text(x = high_x, y = 0.15, labels =  fctr_lvl_2)
-        if (x_show_labels) {
+        text(x = low_x, y = -0.05, labels =  fctr_lvl_1)
+        text(x = high_x, y = 0.05, labels =  fctr_lvl_2)
+        if ( x_show_labels != "0" ) {
+          my_loadp <- loadp
+          my_loado <- loado
+          names(my_loadp) <- tsv1$featureID
+          names(my_loado) <- tsv1$featureID
+          if ( x_show_labels == "ALL" ) {
+            n_labels <- length(loadp)
+          } else {
+            n_labels <- as.numeric(x_show_labels)
+          }
+          n_labels <- min( n_labels, (1 + length(loadp)) / 2 )
+          labels_to_show <- c(
+            names(head(sort(my_loadp),n = n_labels))
+          , names(head(sort(my_loado),n = n_labels))
+          , names(tail(sort(my_loadp),n = n_labels))
+          , names(tail(sort(my_loado),n = n_labels))
+          )
+          labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" ))
           text(
-            y = minus_cor - 0.013
-          , x = minus_cov + 0.020
+            y = plus_cor - 0.013
+          , x = plus_cov + 0.020
           , cex = 0.3
-          , labels = tsv1$featureID
+          , labels = labels
           , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha)
           , srt = -30 # slant 30 degrees downward
           , adj = 0   # left-justified
@@ -162,6 +171,14 @@
     return ( FALSE )
   }
 
+  mz             <- vrbl_metadata$mz
+  names(mz)      <- vrbl_metadata$variableMetadata
+  mz_lookup      <- function(feature) unname(mz[feature])
+  
+  rt             <- vrbl_metadata$rt
+  names(rt)      <- vrbl_metadata$variableMetadata
+  rt_lookup      <- function(feature) unname(rt[feature])
+  
   # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv)
   salience_df <- calc_env$salience_df <- w4msalience(
     data_matrix    = data_matrix
@@ -174,19 +191,12 @@
     , salientLevel = salience_df$max_level
     , salientRCV   = salience_df$salient_rcv
     , salience     = salience_df$salience
+    , mz           = mz_lookup(salience_df$feature)
+    , rt           = rt_lookup(salience_df$feature)
     )
     my_df[order(-my_df$salience),]
   })
-  salience             <- salience_df$salience
-  names(salience)      <- salience_df$feature
-  salience_lookup      <- calc_env$salience_lookup <- function(feature) unname(salience[feature])
-  salient_rcv          <- salience_df$salient_rcv
-  names(salient_rcv)   <- salience_df$feature
-  salient_rcv_lookup   <- calc_env$salient_rcv_lookup <- function(feature) unname(salient_rcv[feature])
-  salient_level        <- salience_df$max_level
-  names(salient_level) <- salience_df$feature
-  salient_level_lookup <- calc_env$salient_level_lookup <- function(feature) unname(salient_level[feature])
-  
+
   # transform wildcards to regexen
   if (matchingC == "wildcard") {
     # strsplit(x = "hello,wild,world", split = ",")
@@ -271,14 +281,13 @@
       }
     }
     level_union <- unique(sort(level_union))
-    overall_significant <- 1 == vrbl_metadata[,intersample_sig_col]
-    if ( length(level_union) > 1 ) {
+    overall_significant <- 1 == ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE )
+    if ( length(level_union) > 2 ) {
       chosen_samples <- smpl_metadata_facC %in% level_union
       chosen_facC <- as.character(smpl_metadata_facC[chosen_samples])
       col_selector <- vrbl_metadata_names[ overall_significant ]
       my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ]
-      fctr_lvl_2 <- "other"
-      for ( fctr_lvl_1 in level_union[1:length(level_union)] ) {
+      plot_action <- function(fctr_lvl_1, fctr_lvl_2) {
         progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
         predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 )
         my_cor_cov <- do_detail_plot(
@@ -294,115 +303,125 @@
         if ( is.null(my_cor_cov) ) {
           progress_action("NOTHING TO PLOT.")
         } else {
-          tsv <- my_cor_cov$tsv1
-          # tsv$salientLevel <- salient_level_lookup(tsv$featureID)
-          # tsv$salientRCV   <- salient_rcv_lookup(tsv$featureID)
-          # tsv$salience     <- salience_lookup(tsv$featureID)
-          tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] 
+          my_tsv <- my_cor_cov$tsv1
+          my_tsv$mz <- mz_lookup(my_tsv$featureID)
+          my_tsv$rt <- rt_lookup(my_tsv$featureID)
+          my_tsv["level1Level2Sig"] <- vrbl_metadata[ match(my_tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] 
+          tsv <<- my_tsv
           corcov_tsv_action(tsv)
-          did_plot <- TRUE
+          did_plot <<- TRUE
         }
       }
+      if ( length(level_union) != 2 ) {
+        fctr_lvl_2 <- "other"
+        for ( fctr_lvl_1 in level_union[1:length(level_union)] ) {
+          plot_action(fctr_lvl_1, fctr_lvl_2)
+        }
+      } else {
+        plot_action(fctr_lvl_1 = level_union[1], fctr_lvl_2 = level_union[2])
+      }
     }
 
-    ## next, contrast each selected level with each of the other levels individually ##
-    # process columns matching the pattern
-    for (i in 1:length(col_matches)) {
-      # for each potential match of the pattern
-      col_match <- col_matches[[i]]
-      if (length(col_match) > 0) {
-        # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig
-        vrbl_metadata_col <- col_match[1]               # ^^^^^^^^^^^^^^^^^^^^^  # Column name
-        fctr_lvl_1        <- col_match[2]               #             ^^         # Factor-level 1
-        fctr_lvl_2        <- col_match[3]               #                ^^      # Factor-level 2
-        # only process this column if both factors are members of lvlCSV
-        is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2)
-        progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
-        # TODO delete next line displaying currently-processed column
-        # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match))
-        # choose only samples with one of the two factors for this column
-        chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
-        predictor <- smpl_metadata_facC[chosen_samples]
-        # extract only the significantly-varying features and the chosen samples
-        fully_significant   <- 1 == vrbl_metadata[,vrbl_metadata_col] * vrbl_metadata[,intersample_sig_col]
-        col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ]
-        my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ]
-        my_cor_cov <- do_detail_plot(
-          x_dataMatrix  = my_matrix
-        , x_predictor   = predictor
-        , x_is_match    = is_match
-        , x_algorithm   = algoC
-        , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
-        , x_show_labels = labelFeatures
-        , x_progress    = progress_action
-        , x_env         = calc_env
-        )
-        if ( is.null(my_cor_cov) ) {
-          progress_action("NOTHING TO PLOT.")
-        } else {
-          tsv <- my_cor_cov$tsv1
-          # tsv$salientLevel <- salient_level_lookup(tsv$featureID)
-          # tsv$salientRCV   <- salient_rcv_lookup(tsv$featureID)
-          # tsv$salience     <- salience_lookup(tsv$featureID)
-          tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] 
-          corcov_tsv_action(tsv)
-          did_plot <- TRUE
+    if ( length(level_union) > 1 ) {
+      ## next, contrast each selected level with each of the other levels individually ##
+      # process columns matching the pattern
+      for (i in 1:length(col_matches)) {
+        # for each potential match of the pattern
+        col_match <- col_matches[[i]]
+        if (length(col_match) > 0) {
+          # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig
+          vrbl_metadata_col <- col_match[1]               # ^^^^^^^^^^^^^^^^^^^^^  # Column name
+          fctr_lvl_1        <- col_match[2]               #             ^^         # Factor-level 1
+          fctr_lvl_2        <- col_match[3]               #                ^^      # Factor-level 2
+          # only process this column if both factors are members of lvlCSV
+          is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2)
+          progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
+          # TODO delete next line displaying currently-processed column
+          # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match))
+          # choose only samples with one of the two factors for this column
+          chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
+          predictor <- smpl_metadata_facC[chosen_samples]
+          # extract only the significantly-varying features and the chosen samples
+          fully_significant   <- 1 == vrbl_metadata[,vrbl_metadata_col] * ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE )
+          col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ]
+          my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ]
+          my_cor_cov <- do_detail_plot(
+            x_dataMatrix  = my_matrix
+          , x_predictor   = predictor
+          , x_is_match    = is_match
+          , x_algorithm   = algoC
+          , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
+          , x_show_labels = labelFeatures
+          , x_progress    = progress_action
+          , x_env         = calc_env
+          )
+          if ( is.null(my_cor_cov) ) {
+            progress_action("NOTHING TO PLOT.")
+          } else {
+            tsv <- my_cor_cov$tsv1
+            tsv$mz <- mz_lookup(tsv$featureID)
+            tsv$rt <- rt_lookup(tsv$featureID)
+            tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] 
+            corcov_tsv_action(tsv)
+            did_plot <- TRUE
+          }
         }
       }
     }
   } else { # tesC == "none"
     level_union <- unique(sort(smpl_metadata_facC))
     if ( length(level_union) > 1 ) {
-      ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ##
-      completed <- c()
-      lapply(
-        X = level_union
-      , FUN = function(x) { 
-          fctr_lvl_1 <- x[1]
-          fctr_lvl_2 <- {
-            if ( fctr_lvl_1 %in% completed )
-              return("DUMMY")
-            # strF(completed)
-            completed <<- c(completed, fctr_lvl_1)
-            setdiff(level_union, fctr_lvl_1)
+      if ( length(level_union) > 2 ) {
+        ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ##
+        completed <- c()
+        lapply(
+          X = level_union
+        , FUN = function(x) { 
+            fctr_lvl_1 <- x[1]
+            fctr_lvl_2 <- {
+              if ( fctr_lvl_1 %in% completed )
+                return("DUMMY")
+              # strF(completed)
+              completed <<- c(completed, fctr_lvl_1)
+              setdiff(level_union, fctr_lvl_1)
+            }
+            chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
+            fctr_lvl_2 <- "other"
+            # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) )
+            progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
+            if (length(unique(chosen_samples)) < 1) {
+              progress_action("NOTHING TO PLOT...")
+            } else {
+              chosen_facC <- as.character(smpl_metadata_facC[chosen_samples])
+              predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 )
+              my_matrix <- scdm[ chosen_samples, , drop = FALSE ]
+              # only process this column if both factors are members of lvlCSV
+              is_match <- isLevelSelected(fctr_lvl_1)
+              my_cor_cov <- do_detail_plot(
+                x_dataMatrix  = my_matrix
+              , x_predictor   = predictor
+              , x_is_match    = is_match
+              , x_algorithm   = algoC
+              , x_prefix      = "Features"
+              , x_show_labels = labelFeatures
+              , x_progress    = progress_action
+              , x_env         = calc_env
+              )
+              if ( is.null(my_cor_cov) ) {
+                progress_action("NOTHING TO PLOT")
+              } else {
+                tsv <- my_cor_cov$tsv1
+                tsv$mz <- mz_lookup(tsv$featureID)
+                tsv$rt <- rt_lookup(tsv$featureID)
+                corcov_tsv_action(tsv)
+                did_plot <<- TRUE
+              }
+            }
+            #print("baz")
+            "dummy" # need to return a value; otherwise combn fails with an error
           }
-          chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
-          fctr_lvl_2 <- "other"
-          # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) )
-          progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
-          if (length(unique(chosen_samples)) < 1) {
-            progress_action("NOTHING TO PLOT...")
-          } else {
-            chosen_facC <- as.character(smpl_metadata_facC[chosen_samples])
-            predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 )
-            my_matrix <- scdm[ chosen_samples, , drop = FALSE ]
-            # only process this column if both factors are members of lvlCSV
-            is_match <- isLevelSelected(fctr_lvl_1)
-            my_cor_cov <- do_detail_plot(
-              x_dataMatrix  = my_matrix
-            , x_predictor   = predictor
-            , x_is_match    = is_match
-            , x_algorithm   = algoC
-            , x_prefix      = "Features"
-            , x_show_labels = labelFeatures
-            , x_progress    = progress_action
-            , x_env         = calc_env
-            )
-            if ( is.null(my_cor_cov) ) {
-              progress_action("NOTHING TO PLOT")
-            } else {
-              tsv <- my_cor_cov$tsv1
-              # tsv$salientLevel <- salient_level_lookup(tsv$featureID)
-              # tsv$salientRCV   <- salient_rcv_lookup(tsv$featureID)
-              # tsv$salience     <- salience_lookup(tsv$featureID)
-              corcov_tsv_action(tsv)
-              did_plot <<- TRUE
-            }
-          }
-          #print("baz")
-          "dummy" # need to return a value; otherwise combn fails with an error
-        }
-      )
+        )
+      }
       ## pass 2 - contrast each selected level with each of the other levels individually ##
       completed <- c()
       utils::combn(
@@ -436,9 +455,8 @@
               progress_action("NOTHING TO PLOT")
             } else {
               tsv <- my_cor_cov$tsv1
-              # tsv$salientLevel <- salient_level_lookup(tsv$featureID)
-              # tsv$salientRCV   <- salient_rcv_lookup(tsv$featureID)
-              # tsv$salience     <- salience_lookup(tsv$featureID)
+              tsv$mz <- mz_lookup(tsv$featureID)
+              tsv$rt <- rt_lookup(tsv$featureID)
               corcov_tsv_action(tsv)
               did_plot <<- TRUE
             }
@@ -510,22 +528,30 @@
   #    Length = number of features; labels = feature identifiers.  (The same is true for $correlation and $covariance.)
   result$vip4p     <- as.numeric(ropls_x@vipVn)
   result$vip4o     <- as.numeric(ropls_x@orthoVipVn)
+  result$loadp     <- as.numeric(ropls_x@loadingMN)
+  result$loado     <- as.numeric(ropls_x@orthoLoadingMN)
   # get the level names
   level_names      <- sort(levels(as.factor(ropls_x@suppLs$y)))
+  fctr_lvl_1       <- level_names[1]
+  fctr_lvl_2       <- level_names[2]
   feature_count    <- length(ropls_x@vipVn)
-  result$level1    <- rep.int(x = level_names[1], times = feature_count)
-  result$level2    <- rep.int(x = level_names[2], times = feature_count)
+  result$level1    <- rep.int(x = fctr_lvl_1, times = feature_count)
+  result$level2    <- rep.int(x = fctr_lvl_2, times = feature_count)
   # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation)))
   superresult <- list()
   if (length(result$vip4o) == 0) result$vip4o <- NA
+  greaterLevel <- sapply( X = result$correlation, FUN = function(my_corr) if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 )
   superresult$tsv1 <- data.frame(
     featureID           = names(ropls_x@vipVn)
   , factorLevel1        = result$level1
   , factorLevel2        = result$level2
+  , greaterLevel        = greaterLevel
   , correlation         = result$correlation
   , covariance          = result$covariance
   , vip4p               = result$vip4p
   , vip4o               = result$vip4o
+  , loadp               = result$loadp
+  , loado               = result$loado
   , row.names           = NULL
   )
   rownames(superresult$tsv1) <- superresult$tsv1$featureID
@@ -533,6 +559,8 @@
   superresult$correlation <- result$correlation
   superresult$vip4p <- result$vip4p
   superresult$vip4o <- result$vip4o
+  superresult$loadp <- result$loadp
+  superresult$loado <- result$loado
   superresult$details <- result
   # #print(superresult$tsv1)
   result$superresult <- superresult