Mercurial > repos > bgruening > music_construct_eset
diff scripts/compare.R @ 3:7ffaa0968da3 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author | bgruening |
---|---|
date | Thu, 10 Feb 2022 12:53:22 +0000 |
parents | |
children | 282819d09a4f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/compare.R Thu Feb 10 12:53:22 2022 +0000 @@ -0,0 +1,440 @@ +suppressWarnings(suppressPackageStartupMessages(library(xbioc))) +suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) +suppressWarnings(suppressPackageStartupMessages(library(reshape2))) +suppressWarnings(suppressPackageStartupMessages(library(cowplot))) +## We use this script to estimate the effectiveness of proportion methods + +## Load Conf +args <- commandArgs(trailingOnly = TRUE) +source(args[1]) + +method_key <- list("MuSiC" = "est_music", + "NNLS" = "est_nnls")[[est_method]] + + +scale_yaxes <- function(gplot, value) { + if (is.na(value)) { + gplot + } else { + gplot + scale_y_continuous(lim = c(0, value)) + } +} + + +set_factor_data <- function(bulk_data, factor_name = NULL) { + if (is.null(factor_name)) { + factor_name <- "None" ## change to something plottable + } + pdat <- pData(bulk_data) + sam_fact <- NULL + if (factor_name %in% colnames(pdat)) { + sam_fact <- cbind(rownames(pdat), + as.character(pdat[[factor_name]])) + cat(paste0(" - factor: ", factor_name, + " found in phenotypes\n")) + } else { + ## We assign this as the factor for the entire dataset + sam_fact <- cbind(rownames(pdat), + factor_name) + cat(paste0(" - factor: assigning \"", factor_name, + "\" to whole dataset\n")) + } + colnames(sam_fact) <- c("Samples", "Factors") + return(as.data.frame(sam_fact)) +} + +## Due to limiting sizes, we need to load and unload +## possibly very large datasets. +process_pair <- function(sc_data, bulk_data, + ctypes_label, samples_label, ctypes, + factor_group) { + ## - Generate + est_prop <- music_prop( + bulk.eset = bulk_data, sc.eset = sc_data, + clusters = ctypes_label, + samples = samples_label, select.ct = ctypes, verbose = T) + ## - + estimated_music_props <- est_prop$Est.prop.weighted + estimated_nnls_props <- est_prop$Est.prop.allgene + ## - + fact_data <- set_factor_data(bulk_data, factor_group) + ## - + return(list(est_music = estimated_music_props, + est_nnls = estimated_nnls_props, + bulk_sample_totals = colSums(exprs(bulk_data)), + plot_groups = fact_data)) +} + +music_on_all <- function(files) { + results <- list() + for (sc_name in names(files)) { + cat(paste0("sc-group:", sc_name, "\n")) + scgroup <- files[[sc_name]] + ## - sc Data + sc_est <- readRDS(scgroup$dataset) + ## - params + celltypes_label <- scgroup$label_cell + samples_label <- scgroup$label_sample + celltypes <- scgroup$celltype + + results[[sc_name]] <- list() + for (bulk_name in names(scgroup$bulk)) { + cat(paste0(" - bulk-group:", bulk_name, "\n")) + bulkgroup <- scgroup$bulk[[bulk_name]] + ## - bulk Data + bulk_est <- readRDS(bulkgroup$dataset) + ## - bulk params + pheno_facts <- bulkgroup$pheno_facts + pheno_excl <- bulkgroup$pheno_excl + ## + results[[sc_name]][[bulk_name]] <- process_pair( + sc_est, bulk_est, + celltypes_label, samples_label, + celltypes, bulkgroup$factor_group) + ## + rm(bulk_est) ## unload + } + rm(sc_est) ## unload + } + return(results) +} + +plot_all_individual_heatmaps <- function(results) { + pdf(out_heatmulti_pdf, width = 8, height = 8) + for (sc_name in names(results)) { + for (bk_name in names(results[[sc_name]])) { + res <- results[[sc_name]][[bk_name]] + plot_hmap <- Prop_heat_Est( + data.matrix(res[[method_key]]), method.name = est_method) + + ggtitle(paste0("[", est_method, "Cell type ", + "proportions in ", + bk_name, " (Bulk) based on ", + sc_name, " (scRNA)")) + + xlab("Cell Types (scRNA)") + + ylab("Samples (Bulk)") + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + print(plot_hmap) + } + } + dev.off() +} + +merge_factors_spread <- function(grudat_spread, factor_groups) { + ## Generated + merge_it <- function(matr, plot_groups, valname) { + ren <- melt(lapply(matr, function(mat) { + mat["ct"] <- rownames(mat); return(mat)})) + ## - Grab factors and merge into list + ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") + colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") + return(ren_new) + } + tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), + merge_it(grudat$spread$scale, factor_groups, "value.scale"), + by = c("Sample", "Cell", "Bulk", "Factors")) + return(tab) +} + + +plot_grouped_heatmaps <- function(results) { + pdf(out_heatmulti_pdf, width = 8, height = 8) + for (sc_name in names(results)) { + named_list <- sapply( + names(results[[sc_name]]), + function(n) { + ## We transpose the data here, because + ## the plotting function omits by default + ## the Y-axis which are the samples. + ## Since the celltypes are the common factor + ## these should be the Y-axis instead. + t(data.matrix(results[[sc_name]][[n]][[method_key]])) + }, simplify = F, USE.NAMES = T) + named_methods <- names(results[[sc_name]]) + ## + plot_hmap <- Prop_heat_Est( + named_list, + method.name = named_methods) + + ggtitle(paste0("[", est_method, "] Cell type ", + "proportions of ", + "Bulk Datasets based on ", + sc_name, " (scRNA)")) + + xlab("Samples (Bulk)") + + ylab("Cell Types (scRNA)") + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + print(plot_hmap) + } + dev.off() +} + +## Desired plots +## 1. Pie chart: +## - Per Bulk dataset (using just normalised proportions) +## - Per Bulk dataset (multiplying proportions by nreads) + +unlist_names <- function(results, method, prepend_bkname=FALSE) { + unique(sort( + unlist(lapply(names(results), function(scname) { + lapply(names(results[[scname]]), function(bkname) { + res <- get(method)(results[[scname]][[bkname]][[method_key]]) + if (prepend_bkname) { + ## We *do not* assume unique bulk sample names + ## across different bulk datasets. + res <- paste0(bkname, "::", res) + } + return(res) + }) + })) + )) +} + +summarized_matrix <- function(results) { # nolint + ## We assume that cell types MUST be unique, but that sample + ## names do not need to be. For this reason, we must prepend + ## the bulk dataset name to the individual sample names. + all_celltypes <- unlist_names(results, "colnames") + all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) + + ## Iterate through all possible samples and populate a table. + ddff <- data.frame() + ddff_scale <- data.frame() + for (cell in all_celltypes) { + for (sample in all_samples) { + group_sname <- unlist(strsplit(sample, split = "::")) + bulk <- group_sname[1] + id_sample <- group_sname[2] + for (scgroup in names(results)) { + if (bulk %in% names(results[[scgroup]])) { + mat_prop <- results[[scgroup]][[bulk]][[method_key]] + vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals + ## - We use sample instead of id_sample because we need to + ## extract bulk sets from the complete matrix later. It's + ## messy, yes. + if (cell %in% colnames(mat_prop)) { + ddff[cell, sample] <- mat_prop[id_sample, cell] + ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint + } else { + ddff[cell, sample] <- 0 + ddff_scale[cell, sample] <- 0 + } + } + } + } + } + return(list(prop = ddff, scaled = ddff_scale)) +} + +flatten_factor_list <- function(results) { + ## Get a 2d DF of all factors across all bulk samples. + res <- c() + for (scgroup in names(results)) { + for (bulkgroup in names(results[[scgroup]])) { + dat <- results[[scgroup]][[bulkgroup]]$plot_groups + dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint + res <- rbind(res, dat) + } + } + return(res) +} + +group_by_dataset <- function(summat) { + bulk_names <- unlist( + lapply(names(files), function(x) names(files[[x]]$bulk))) + mat_names <- colnames(summat$prop) + bd <- list() + bd_scale <- list() + bd_spread_scale <- list() + bd_spread_prop <- list() + for (bname in bulk_names) { + subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] + ## - + bd[[bname]] <- rowSums(summat$prop[, subs]) + bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) + bd_spread_scale[[bname]] <- summat$scaled[, subs] + bd_spread_prop[[bname]] <- summat$prop[, subs] + } + return(list(prop = as.data.frame(bd), + scaled = as.data.frame(bd_scale), + spread = list(scale = bd_spread_scale, + prop = bd_spread_prop))) +} + +summarize_heatmaps <- function(grudat_spread_melt, do_factors) { + ## - + do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, + ylabs = element_blank(), xlabs = element_blank(), + use_log = TRUE, size = 11) { + ## Convert from matrix to long format + melted <- grudat_melted ## copy? + if (use_log) { + melted[[fillval]] <- log10(melted[[fillval]] + 1) + } + return(ggplot(melted) + + geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), + colour = "white") + + scale_fill_gradient2(low = "steelblue", high = "red", + mid = "white", name = element_blank()) + + theme(axis.text.x = element_text(angle = -90, hjust = 0, + size = size)) + + ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) + } + + do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { + do_logged <- (plot %in% c("log", "both")) + do_normal <- (plot %in% c("normal", "both")) + plist <- list() + if (do_logged) { + plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.scale", "Reads (log10+1)", + size = size) + plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.prop", "Sample (log10+1)", + size = size) + } + if (do_normal) { + plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.scale", "Reads", use_log = F, + size = size) + plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.prop", "Sample", use_log = F, + size = size) + } + return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), + plot_grid(plotlist = plist, ncol = ncol), + ncol = 1, rel_heights = c(0.05, 0.95))) + + } + p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) + p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, + size = 8) + p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, + size = 8) + p3 <- ggplot + theme_void() + if (do_factors) { + p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") + } + return(list(bulk = p1, + samples = list(log = p2b, normal = p2a), + factors = p3)) +} + +summarize_boxplots <- function(grudat_spread, do_factors) { + common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + + xlab(element_blank()) + ylab(element_blank()) + common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + + xlab(element_blank()) + ylab(element_blank()) + + A <- B <- list() #nolint + ## Cell type by sample + A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) + A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) + ## Sample by Cell type + B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + + ylab("Bulk Dataset") + B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + + ylab("Bulk Dataset") + ## -- Factor plots are optional + A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() + + if (do_factors) { + A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) + A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) + B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + + ylab("Bulk Dataset") + B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + + ylab("Bulk Dataset") + } + + title_a <- "Cell Types against Bulk" + title_b <- "Bulk Datasets against Cells" + if (do_factors) { + title_a <- paste0(title_a, " and Factors") + title_b <- paste0(title_b, " and Factors") + } + + a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), + plot_grid(plotlist = A, ncol = 2), + ncol = 1, rel_heights = c(0.05, 0.95)) + b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), + plot_grid(plotlist = B, ncol = 2), + ncol = 1, rel_heights = c(0.05, 0.95)) + return(list(cell = a_all, bulk = b_all)) +} + +filter_output <- function(grudat_spread_melt, out_filt) { + print_red <- function(comment, red_list) { + cat(paste(comment, paste(red_list, collapse = ", "), "\n")) + } + grudat_filt <- grudat_spread_melt + print_red("Total Cell types:", unique(grudat_filt$Cell)) + if (!is.null(out_filt$cells)) { + grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] + print_red(" - selecting:", out_filt$cells) + } + print_red("Total Factors:", unique(grudat_spread_melt$Factors)) + if (!is.null(out_filt$facts)) { + grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] + print_red(" - selecting:", out_filt$facts) + } + return(grudat_filt) +} + + +results <- music_on_all(files) + +if (heat_grouped_p) { + plot_grouped_heatmaps(results) +} else { + plot_all_individual_heatmaps(results) +} + +save.image("/tmp/sesh.rds") + +summat <- summarized_matrix(results) +grudat <- group_by_dataset(summat) +grudat_spread_melt <- merge_factors_spread(grudat$spread, + flatten_factor_list(results)) + + + +## The output filters ONLY apply to boxplots, since these take +do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) + +grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) + +heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) +box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) + +pdf(out_heatsumm_pdf, width = 14, height = 14) +print(heat_maps) +print(box_plots) +dev.off() + +## Generate output tables +stats_prop <- lapply(grudat$spread$prop, function(x) { + t(apply(x, 1, summary))}) +stats_scale <- lapply(grudat$spread$scale, function(x) { + t(apply(x, 1, summary))}) + +writable2 <- function(obj, prefix, title) { + write.table(obj, + file = paste0("report_data/", prefix, "_", + title, ".tabular"), + quote = F, sep = "\t", col.names = NA) +} +## Make the value table printable +grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint +colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", + "CT Prop in Sample", "Number of Reads") + +writable2(grudat_spread_melt, "values", "Data Table") +writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") +writable2({ + aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa +}, "values", "Matrix of Cell Type Read Counts") + +for (bname in names(stats_prop)) { + writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) + writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) +}