Mercurial > repos > eschen42 > w4mcorcov
diff w4mcorcov_calc.R @ 1:0c2ad44b6c9c draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 01d4a951cf09e7b88fcec96b8043bc7568cc5c92
author | eschen42 |
---|---|
date | Sun, 22 Oct 2017 18:47:57 -0400 |
parents | 23f9fad4edfc |
children | e03582f26617 |
line wrap: on
line diff
--- a/w4mcorcov_calc.R Mon Oct 16 14:56:52 2017 -0400 +++ b/w4mcorcov_calc.R Sun Oct 22 18:47:57 2017 -0400 @@ -8,11 +8,7 @@ algoC <- "nipals" do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) { - off <- function(x) if (x_show_labels) x else 0 - salience_lookup <- x_env$salience_lookup - salient_rcv_lookup <- x_env$salient_rcv_lookup - # x_progress("head(salience_df): ", head(salience_df)) - # x_progress("head(salience): ", head(salience)) + off <- function(x) if (x_show_labels == "0") x else 0 if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) { my_oplsda <- opls( x = x_dataMatrix @@ -39,24 +35,20 @@ covariance <- covariance / lim_x lim_x <- 1.2 main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2) - # print("main_label") - # print(main_label) main_cex <- min(1.0, 46.0/nchar(main_label)) # "It is generally accepted that a variable should be selected if vj>1, [27–29], # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." # (Mehmood 2012 doi:10.1186/1748-7188-6-27) vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) alpha <- 0.1 + 0.4 * vipco - red <- as.numeric(correlation < 0) * vipco - blue <- as.numeric(correlation > 0) * vipco - minus_cor <- -correlation - minus_cov <- -covariance - # cex <- salience_lookup(feature = names(minus_cor)) - # cex <- 0.25 + (1.25 * cex / max(cex)) + red <- as.numeric(correlation > 0) * vipco + blue <- as.numeric(correlation < 0) * vipco + plus_cor <- correlation + plus_cov <- covariance cex <- 0.75 plot( - y = minus_cor - , x = minus_cov + y = plus_cor + , x = plus_cov , type="p" , xlim=c(-lim_x, lim_x + off(0.1)) , ylim=c(-1.0 - off(0.1), 1.0) @@ -70,14 +62,31 @@ ) low_x <- -0.7 * lim_x high_x <- 0.7 * lim_x - text(x = low_x, y = -0.15, labels = fctr_lvl_1) - text(x = high_x, y = 0.15, labels = fctr_lvl_2) - if (x_show_labels) { + text(x = low_x, y = -0.05, labels = fctr_lvl_1) + text(x = high_x, y = 0.05, labels = fctr_lvl_2) + if ( x_show_labels != "0" ) { + my_loadp <- loadp + my_loado <- loado + names(my_loadp) <- tsv1$featureID + names(my_loado) <- tsv1$featureID + if ( x_show_labels == "ALL" ) { + n_labels <- length(loadp) + } else { + n_labels <- as.numeric(x_show_labels) + } + n_labels <- min( n_labels, (1 + length(loadp)) / 2 ) + labels_to_show <- c( + names(head(sort(my_loadp),n = n_labels)) + , names(head(sort(my_loado),n = n_labels)) + , names(tail(sort(my_loadp),n = n_labels)) + , names(tail(sort(my_loado),n = n_labels)) + ) + labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" )) text( - y = minus_cor - 0.013 - , x = minus_cov + 0.020 + y = plus_cor - 0.013 + , x = plus_cov + 0.020 , cex = 0.3 - , labels = tsv1$featureID + , labels = labels , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha) , srt = -30 # slant 30 degrees downward , adj = 0 # left-justified @@ -162,6 +171,14 @@ return ( FALSE ) } + mz <- vrbl_metadata$mz + names(mz) <- vrbl_metadata$variableMetadata + mz_lookup <- function(feature) unname(mz[feature]) + + rt <- vrbl_metadata$rt + names(rt) <- vrbl_metadata$variableMetadata + rt_lookup <- function(feature) unname(rt[feature]) + # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) salience_df <- calc_env$salience_df <- w4msalience( data_matrix = data_matrix @@ -174,19 +191,12 @@ , salientLevel = salience_df$max_level , salientRCV = salience_df$salient_rcv , salience = salience_df$salience + , mz = mz_lookup(salience_df$feature) + , rt = rt_lookup(salience_df$feature) ) my_df[order(-my_df$salience),] }) - salience <- salience_df$salience - names(salience) <- salience_df$feature - salience_lookup <- calc_env$salience_lookup <- function(feature) unname(salience[feature]) - salient_rcv <- salience_df$salient_rcv - names(salient_rcv) <- salience_df$feature - salient_rcv_lookup <- calc_env$salient_rcv_lookup <- function(feature) unname(salient_rcv[feature]) - salient_level <- salience_df$max_level - names(salient_level) <- salience_df$feature - salient_level_lookup <- calc_env$salient_level_lookup <- function(feature) unname(salient_level[feature]) - + # transform wildcards to regexen if (matchingC == "wildcard") { # strsplit(x = "hello,wild,world", split = ",") @@ -271,14 +281,13 @@ } } level_union <- unique(sort(level_union)) - overall_significant <- 1 == vrbl_metadata[,intersample_sig_col] - if ( length(level_union) > 1 ) { + overall_significant <- 1 == ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE ) + if ( length(level_union) > 2 ) { chosen_samples <- smpl_metadata_facC %in% level_union chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) col_selector <- vrbl_metadata_names[ overall_significant ] my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] - fctr_lvl_2 <- "other" - for ( fctr_lvl_1 in level_union[1:length(level_union)] ) { + plot_action <- function(fctr_lvl_1, fctr_lvl_2) { progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) my_cor_cov <- do_detail_plot( @@ -294,115 +303,125 @@ if ( is.null(my_cor_cov) ) { progress_action("NOTHING TO PLOT.") } else { - tsv <- my_cor_cov$tsv1 - # tsv$salientLevel <- salient_level_lookup(tsv$featureID) - # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) - # tsv$salience <- salience_lookup(tsv$featureID) - tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] + my_tsv <- my_cor_cov$tsv1 + my_tsv$mz <- mz_lookup(my_tsv$featureID) + my_tsv$rt <- rt_lookup(my_tsv$featureID) + my_tsv["level1Level2Sig"] <- vrbl_metadata[ match(my_tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] + tsv <<- my_tsv corcov_tsv_action(tsv) - did_plot <- TRUE + did_plot <<- TRUE } } + if ( length(level_union) != 2 ) { + fctr_lvl_2 <- "other" + for ( fctr_lvl_1 in level_union[1:length(level_union)] ) { + plot_action(fctr_lvl_1, fctr_lvl_2) + } + } else { + plot_action(fctr_lvl_1 = level_union[1], fctr_lvl_2 = level_union[2]) + } } - ## next, contrast each selected level with each of the other levels individually ## - # process columns matching the pattern - for (i in 1:length(col_matches)) { - # for each potential match of the pattern - col_match <- col_matches[[i]] - if (length(col_match) > 0) { - # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig - vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name - fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 - fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 - # only process this column if both factors are members of lvlCSV - is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) - progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) - # TODO delete next line displaying currently-processed column - # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match)) - # choose only samples with one of the two factors for this column - chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) - predictor <- smpl_metadata_facC[chosen_samples] - # extract only the significantly-varying features and the chosen samples - fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * vrbl_metadata[,intersample_sig_col] - col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ] - my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] - my_cor_cov <- do_detail_plot( - x_dataMatrix = my_matrix - , x_predictor = predictor - , x_is_match = is_match - , x_algorithm = algoC - , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" - , x_show_labels = labelFeatures - , x_progress = progress_action - , x_env = calc_env - ) - if ( is.null(my_cor_cov) ) { - progress_action("NOTHING TO PLOT.") - } else { - tsv <- my_cor_cov$tsv1 - # tsv$salientLevel <- salient_level_lookup(tsv$featureID) - # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) - # tsv$salience <- salience_lookup(tsv$featureID) - tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] - corcov_tsv_action(tsv) - did_plot <- TRUE + if ( length(level_union) > 1 ) { + ## next, contrast each selected level with each of the other levels individually ## + # process columns matching the pattern + for (i in 1:length(col_matches)) { + # for each potential match of the pattern + col_match <- col_matches[[i]] + if (length(col_match) > 0) { + # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig + vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name + fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1 + fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2 + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2) + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + # TODO delete next line displaying currently-processed column + # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match)) + # choose only samples with one of the two factors for this column + chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) + predictor <- smpl_metadata_facC[chosen_samples] + # extract only the significantly-varying features and the chosen samples + fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * ( if (intersample_sig_col %in% colnames(vrbl_metadata)) vrbl_metadata[,intersample_sig_col] else TRUE ) + col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ] + my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ] + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT.") + } else { + tsv <- my_cor_cov$tsv1 + tsv$mz <- mz_lookup(tsv$featureID) + tsv$rt <- rt_lookup(tsv$featureID) + tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ] + corcov_tsv_action(tsv) + did_plot <- TRUE + } } } } } else { # tesC == "none" level_union <- unique(sort(smpl_metadata_facC)) if ( length(level_union) > 1 ) { - ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ## - completed <- c() - lapply( - X = level_union - , FUN = function(x) { - fctr_lvl_1 <- x[1] - fctr_lvl_2 <- { - if ( fctr_lvl_1 %in% completed ) - return("DUMMY") - # strF(completed) - completed <<- c(completed, fctr_lvl_1) - setdiff(level_union, fctr_lvl_1) + if ( length(level_union) > 2 ) { + ## pass 1 - contrast each selected level with all other levels combined into one "super-level" ## + completed <- c() + lapply( + X = level_union + , FUN = function(x) { + fctr_lvl_1 <- x[1] + fctr_lvl_2 <- { + if ( fctr_lvl_1 %in% completed ) + return("DUMMY") + # strF(completed) + completed <<- c(completed, fctr_lvl_1) + setdiff(level_union, fctr_lvl_1) + } + chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) + fctr_lvl_2 <- "other" + # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) + progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) + if (length(unique(chosen_samples)) < 1) { + progress_action("NOTHING TO PLOT...") + } else { + chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) + predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) + my_matrix <- scdm[ chosen_samples, , drop = FALSE ] + # only process this column if both factors are members of lvlCSV + is_match <- isLevelSelected(fctr_lvl_1) + my_cor_cov <- do_detail_plot( + x_dataMatrix = my_matrix + , x_predictor = predictor + , x_is_match = is_match + , x_algorithm = algoC + , x_prefix = "Features" + , x_show_labels = labelFeatures + , x_progress = progress_action + , x_env = calc_env + ) + if ( is.null(my_cor_cov) ) { + progress_action("NOTHING TO PLOT") + } else { + tsv <- my_cor_cov$tsv1 + tsv$mz <- mz_lookup(tsv$featureID) + tsv$rt <- rt_lookup(tsv$featureID) + corcov_tsv_action(tsv) + did_plot <<- TRUE + } + } + #print("baz") + "dummy" # need to return a value; otherwise combn fails with an error } - chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2) - fctr_lvl_2 <- "other" - # print( sprintf("sum(chosen_samples) %d, factor_level_2 %s", sum(chosen_samples), fctr_lvl_2) ) - progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2)) - if (length(unique(chosen_samples)) < 1) { - progress_action("NOTHING TO PLOT...") - } else { - chosen_facC <- as.character(smpl_metadata_facC[chosen_samples]) - predictor <- sapply( X = chosen_facC, FUN = function(fac) if ( fac == fctr_lvl_1 ) fctr_lvl_1 else fctr_lvl_2 ) - my_matrix <- scdm[ chosen_samples, , drop = FALSE ] - # only process this column if both factors are members of lvlCSV - is_match <- isLevelSelected(fctr_lvl_1) - my_cor_cov <- do_detail_plot( - x_dataMatrix = my_matrix - , x_predictor = predictor - , x_is_match = is_match - , x_algorithm = algoC - , x_prefix = "Features" - , x_show_labels = labelFeatures - , x_progress = progress_action - , x_env = calc_env - ) - if ( is.null(my_cor_cov) ) { - progress_action("NOTHING TO PLOT") - } else { - tsv <- my_cor_cov$tsv1 - # tsv$salientLevel <- salient_level_lookup(tsv$featureID) - # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) - # tsv$salience <- salience_lookup(tsv$featureID) - corcov_tsv_action(tsv) - did_plot <<- TRUE - } - } - #print("baz") - "dummy" # need to return a value; otherwise combn fails with an error - } - ) + ) + } ## pass 2 - contrast each selected level with each of the other levels individually ## completed <- c() utils::combn( @@ -436,9 +455,8 @@ progress_action("NOTHING TO PLOT") } else { tsv <- my_cor_cov$tsv1 - # tsv$salientLevel <- salient_level_lookup(tsv$featureID) - # tsv$salientRCV <- salient_rcv_lookup(tsv$featureID) - # tsv$salience <- salience_lookup(tsv$featureID) + tsv$mz <- mz_lookup(tsv$featureID) + tsv$rt <- rt_lookup(tsv$featureID) corcov_tsv_action(tsv) did_plot <<- TRUE } @@ -510,22 +528,30 @@ # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) result$vip4p <- as.numeric(ropls_x@vipVn) result$vip4o <- as.numeric(ropls_x@orthoVipVn) + result$loadp <- as.numeric(ropls_x@loadingMN) + result$loado <- as.numeric(ropls_x@orthoLoadingMN) # get the level names level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) + fctr_lvl_1 <- level_names[1] + fctr_lvl_2 <- level_names[2] feature_count <- length(ropls_x@vipVn) - result$level1 <- rep.int(x = level_names[1], times = feature_count) - result$level2 <- rep.int(x = level_names[2], times = feature_count) + result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count) + result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count) # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation))) superresult <- list() if (length(result$vip4o) == 0) result$vip4o <- NA + greaterLevel <- sapply( X = result$correlation, FUN = function(my_corr) if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 ) superresult$tsv1 <- data.frame( featureID = names(ropls_x@vipVn) , factorLevel1 = result$level1 , factorLevel2 = result$level2 + , greaterLevel = greaterLevel , correlation = result$correlation , covariance = result$covariance , vip4p = result$vip4p , vip4o = result$vip4o + , loadp = result$loadp + , loado = result$loado , row.names = NULL ) rownames(superresult$tsv1) <- superresult$tsv1$featureID @@ -533,6 +559,8 @@ superresult$correlation <- result$correlation superresult$vip4p <- result$vip4p superresult$vip4o <- result$vip4o + superresult$loadp <- result$loadp + superresult$loado <- result$loado superresult$details <- result # #print(superresult$tsv1) result$superresult <- superresult