Mercurial > repos > bgruening > music_deconvolution
view scripts/compare.R @ 4:56371b5a2da9 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author | bgruening |
---|---|
date | Thu, 10 Feb 2022 12:52:31 +0000 |
parents | |
children | 2ba99a52bd44 |
line wrap: on
line source
suppressWarnings(suppressPackageStartupMessages(library(xbioc))) suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) suppressWarnings(suppressPackageStartupMessages(library(reshape2))) suppressWarnings(suppressPackageStartupMessages(library(cowplot))) ## We use this script to estimate the effectiveness of proportion methods ## Load Conf args <- commandArgs(trailingOnly = TRUE) source(args[1]) method_key <- list("MuSiC" = "est_music", "NNLS" = "est_nnls")[[est_method]] scale_yaxes <- function(gplot, value) { if (is.na(value)) { gplot } else { gplot + scale_y_continuous(lim = c(0, value)) } } set_factor_data <- function(bulk_data, factor_name = NULL) { if (is.null(factor_name)) { factor_name <- "None" ## change to something plottable } pdat <- pData(bulk_data) sam_fact <- NULL if (factor_name %in% colnames(pdat)) { sam_fact <- cbind(rownames(pdat), as.character(pdat[[factor_name]])) cat(paste0(" - factor: ", factor_name, " found in phenotypes\n")) } else { ## We assign this as the factor for the entire dataset sam_fact <- cbind(rownames(pdat), factor_name) cat(paste0(" - factor: assigning \"", factor_name, "\" to whole dataset\n")) } colnames(sam_fact) <- c("Samples", "Factors") return(as.data.frame(sam_fact)) } ## Due to limiting sizes, we need to load and unload ## possibly very large datasets. process_pair <- function(sc_data, bulk_data, ctypes_label, samples_label, ctypes, factor_group) { ## - Generate est_prop <- music_prop( bulk.eset = bulk_data, sc.eset = sc_data, clusters = ctypes_label, samples = samples_label, select.ct = ctypes, verbose = T) ## - estimated_music_props <- est_prop$Est.prop.weighted estimated_nnls_props <- est_prop$Est.prop.allgene ## - fact_data <- set_factor_data(bulk_data, factor_group) ## - return(list(est_music = estimated_music_props, est_nnls = estimated_nnls_props, bulk_sample_totals = colSums(exprs(bulk_data)), plot_groups = fact_data)) } music_on_all <- function(files) { results <- list() for (sc_name in names(files)) { cat(paste0("sc-group:", sc_name, "\n")) scgroup <- files[[sc_name]] ## - sc Data sc_est <- readRDS(scgroup$dataset) ## - params celltypes_label <- scgroup$label_cell samples_label <- scgroup$label_sample celltypes <- scgroup$celltype results[[sc_name]] <- list() for (bulk_name in names(scgroup$bulk)) { cat(paste0(" - bulk-group:", bulk_name, "\n")) bulkgroup <- scgroup$bulk[[bulk_name]] ## - bulk Data bulk_est <- readRDS(bulkgroup$dataset) ## - bulk params pheno_facts <- bulkgroup$pheno_facts pheno_excl <- bulkgroup$pheno_excl ## results[[sc_name]][[bulk_name]] <- process_pair( sc_est, bulk_est, celltypes_label, samples_label, celltypes, bulkgroup$factor_group) ## rm(bulk_est) ## unload } rm(sc_est) ## unload } return(results) } plot_all_individual_heatmaps <- function(results) { pdf(out_heatmulti_pdf, width = 8, height = 8) for (sc_name in names(results)) { for (bk_name in names(results[[sc_name]])) { res <- results[[sc_name]][[bk_name]] plot_hmap <- Prop_heat_Est( data.matrix(res[[method_key]]), method.name = est_method) + ggtitle(paste0("[", est_method, "Cell type ", "proportions in ", bk_name, " (Bulk) based on ", sc_name, " (scRNA)")) + xlab("Cell Types (scRNA)") + ylab("Samples (Bulk)") + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 6)) print(plot_hmap) } } dev.off() } merge_factors_spread <- function(grudat_spread, factor_groups) { ## Generated merge_it <- function(matr, plot_groups, valname) { ren <- melt(lapply(matr, function(mat) { mat["ct"] <- rownames(mat); return(mat)})) ## - Grab factors and merge into list ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") return(ren_new) } tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), merge_it(grudat$spread$scale, factor_groups, "value.scale"), by = c("Sample", "Cell", "Bulk", "Factors")) return(tab) } plot_grouped_heatmaps <- function(results) { pdf(out_heatmulti_pdf, width = 8, height = 8) for (sc_name in names(results)) { named_list <- sapply( names(results[[sc_name]]), function(n) { ## We transpose the data here, because ## the plotting function omits by default ## the Y-axis which are the samples. ## Since the celltypes are the common factor ## these should be the Y-axis instead. t(data.matrix(results[[sc_name]][[n]][[method_key]])) }, simplify = F, USE.NAMES = T) named_methods <- names(results[[sc_name]]) ## plot_hmap <- Prop_heat_Est( named_list, method.name = named_methods) + ggtitle(paste0("[", est_method, "] Cell type ", "proportions of ", "Bulk Datasets based on ", sc_name, " (scRNA)")) + xlab("Samples (Bulk)") + ylab("Cell Types (scRNA)") + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 6)) print(plot_hmap) } dev.off() } ## Desired plots ## 1. Pie chart: ## - Per Bulk dataset (using just normalised proportions) ## - Per Bulk dataset (multiplying proportions by nreads) unlist_names <- function(results, method, prepend_bkname=FALSE) { unique(sort( unlist(lapply(names(results), function(scname) { lapply(names(results[[scname]]), function(bkname) { res <- get(method)(results[[scname]][[bkname]][[method_key]]) if (prepend_bkname) { ## We *do not* assume unique bulk sample names ## across different bulk datasets. res <- paste0(bkname, "::", res) } return(res) }) })) )) } summarized_matrix <- function(results) { # nolint ## We assume that cell types MUST be unique, but that sample ## names do not need to be. For this reason, we must prepend ## the bulk dataset name to the individual sample names. all_celltypes <- unlist_names(results, "colnames") all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) ## Iterate through all possible samples and populate a table. ddff <- data.frame() ddff_scale <- data.frame() for (cell in all_celltypes) { for (sample in all_samples) { group_sname <- unlist(strsplit(sample, split = "::")) bulk <- group_sname[1] id_sample <- group_sname[2] for (scgroup in names(results)) { if (bulk %in% names(results[[scgroup]])) { mat_prop <- results[[scgroup]][[bulk]][[method_key]] vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals ## - We use sample instead of id_sample because we need to ## extract bulk sets from the complete matrix later. It's ## messy, yes. if (cell %in% colnames(mat_prop)) { ddff[cell, sample] <- mat_prop[id_sample, cell] ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint } else { ddff[cell, sample] <- 0 ddff_scale[cell, sample] <- 0 } } } } } return(list(prop = ddff, scaled = ddff_scale)) } flatten_factor_list <- function(results) { ## Get a 2d DF of all factors across all bulk samples. res <- c() for (scgroup in names(results)) { for (bulkgroup in names(results[[scgroup]])) { dat <- results[[scgroup]][[bulkgroup]]$plot_groups dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint res <- rbind(res, dat) } } return(res) } group_by_dataset <- function(summat) { bulk_names <- unlist( lapply(names(files), function(x) names(files[[x]]$bulk))) mat_names <- colnames(summat$prop) bd <- list() bd_scale <- list() bd_spread_scale <- list() bd_spread_prop <- list() for (bname in bulk_names) { subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] ## - bd[[bname]] <- rowSums(summat$prop[, subs]) bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) bd_spread_scale[[bname]] <- summat$scaled[, subs] bd_spread_prop[[bname]] <- summat$prop[, subs] } return(list(prop = as.data.frame(bd), scaled = as.data.frame(bd_scale), spread = list(scale = bd_spread_scale, prop = bd_spread_prop))) } summarize_heatmaps <- function(grudat_spread_melt, do_factors) { ## - do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, ylabs = element_blank(), xlabs = element_blank(), use_log = TRUE, size = 11) { ## Convert from matrix to long format melted <- grudat_melted ## copy? if (use_log) { melted[[fillval]] <- log10(melted[[fillval]] + 1) } return(ggplot(melted) + geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), colour = "white") + scale_fill_gradient2(low = "steelblue", high = "red", mid = "white", name = element_blank()) + theme(axis.text.x = element_text(angle = -90, hjust = 0, size = size)) + ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) } do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { do_logged <- (plot %in% c("log", "both")) do_normal <- (plot %in% c("normal", "both")) plist <- list() if (do_logged) { plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, "value.scale", "Reads (log10+1)", size = size) plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, "value.prop", "Sample (log10+1)", size = size) } if (do_normal) { plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, "value.scale", "Reads", use_log = F, size = size) plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, "value.prop", "Sample", use_log = F, size = size) } return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), plot_grid(plotlist = plist, ncol = ncol), ncol = 1, rel_heights = c(0.05, 0.95))) } p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, size = 8) p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, size = 8) p3 <- ggplot + theme_void() if (do_factors) { p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") } return(list(bulk = p1, samples = list(log = p2b, normal = p2a), factors = p3)) } summarize_boxplots <- function(grudat_spread, do_factors) { common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + xlab(element_blank()) + ylab(element_blank()) common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + xlab(element_blank()) + ylab(element_blank()) A <- B <- list() #nolint ## Cell type by sample A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) ## Sample by Cell type B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + ylab("Bulk Dataset") B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + ylab("Bulk Dataset") ## -- Factor plots are optional A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() if (do_factors) { A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + ylab("Bulk Dataset") B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + ylab("Bulk Dataset") } title_a <- "Cell Types against Bulk" title_b <- "Bulk Datasets against Cells" if (do_factors) { title_a <- paste0(title_a, " and Factors") title_b <- paste0(title_b, " and Factors") } a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), plot_grid(plotlist = A, ncol = 2), ncol = 1, rel_heights = c(0.05, 0.95)) b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), plot_grid(plotlist = B, ncol = 2), ncol = 1, rel_heights = c(0.05, 0.95)) return(list(cell = a_all, bulk = b_all)) } filter_output <- function(grudat_spread_melt, out_filt) { print_red <- function(comment, red_list) { cat(paste(comment, paste(red_list, collapse = ", "), "\n")) } grudat_filt <- grudat_spread_melt print_red("Total Cell types:", unique(grudat_filt$Cell)) if (!is.null(out_filt$cells)) { grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] print_red(" - selecting:", out_filt$cells) } print_red("Total Factors:", unique(grudat_spread_melt$Factors)) if (!is.null(out_filt$facts)) { grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] print_red(" - selecting:", out_filt$facts) } return(grudat_filt) } results <- music_on_all(files) if (heat_grouped_p) { plot_grouped_heatmaps(results) } else { plot_all_individual_heatmaps(results) } save.image("/tmp/sesh.rds") summat <- summarized_matrix(results) grudat <- group_by_dataset(summat) grudat_spread_melt <- merge_factors_spread(grudat$spread, flatten_factor_list(results)) ## The output filters ONLY apply to boxplots, since these take do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) pdf(out_heatsumm_pdf, width = 14, height = 14) print(heat_maps) print(box_plots) dev.off() ## Generate output tables stats_prop <- lapply(grudat$spread$prop, function(x) { t(apply(x, 1, summary))}) stats_scale <- lapply(grudat$spread$scale, function(x) { t(apply(x, 1, summary))}) writable2 <- function(obj, prefix, title) { write.table(obj, file = paste0("report_data/", prefix, "_", title, ".tabular"), quote = F, sep = "\t", col.names = NA) } ## Make the value table printable grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", "CT Prop in Sample", "Number of Reads") writable2(grudat_spread_melt, "values", "Data Table") writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") writable2({ aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa }, "values", "Matrix of Cell Type Read Counts") for (bname in names(stats_prop)) { writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) }