music_manipulate_eset: scripts/compare.R comparison

comparison scripts/compare.R @ 0:22232092be53 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"

author	bgruening
date	Mon, 02 May 2022 09:59:18 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:22232092be53
+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
+## We use this script to estimate the effectiveness of proportion methods
+## Load Conf
+args <- commandArgs(trailingOnly = TRUE)
+source(args[1])
+method_key <- list("MuSiC" = "est_music",
+"NNLS" = "est_nnls")[[est_method]]
+delim <- "::" ## separator bulk datasets and their samples
+scale_yaxes <- function(gplot, value) {
+if (is.na(value)) {
+gplot
+} else {
+gplot + scale_y_continuous(lim = c(0, value))
+}
+}
+set_factor_data <- function(bulk_data, factor_name = NULL) {
+if (is.null(factor_name)) {
+factor_name <- "None" ## change to something plottable
+}
+pdat <- pData(bulk_data)
+sam_fact <- NULL
+if (factor_name %in% colnames(pdat)) {
+sam_fact <- cbind(rownames(pdat),
+as.character(pdat[[factor_name]]))
+cat(paste0("   - factor: ", factor_name,
+" found in phenotypes\n"))
+} else {
+## We assign this as the factor for the entire dataset
+sam_fact <- cbind(rownames(pdat),
+factor_name)
+cat(paste0("   - factor: assigning \"", factor_name,
+"\" to whole dataset\n"))
+}
+colnames(sam_fact) <- c("Samples", "Factors")
+return(as.data.frame(sam_fact))
+}
+## Due to limiting sizes, we need to load and unload
+## possibly very large datasets.
+process_pair <- function(sc_data, bulk_data,
+ctypes_label, samples_label, ctypes,
+factor_group) {
+## - Generate
+est_prop <- music_prop(
+bulk.eset = bulk_data, sc.eset = sc_data,
+clusters = ctypes_label,
+samples = samples_label, select.ct = ctypes, verbose = T)
+## -
+estimated_music_props <- est_prop$Est.prop.weighted
+estimated_nnls_props <- est_prop$Est.prop.allgene
+## -
+fact_data <- set_factor_data(bulk_data, factor_group)
+## -
+return(list(est_music = estimated_music_props,
+est_nnls = estimated_nnls_props,
+bulk_sample_totals = colSums(exprs(bulk_data)),
+plot_groups = fact_data))
+}
+music_on_all <- function(files) {
+results <- list()
+for (sc_name in names(files)) {
+cat(paste0("sc-group:", sc_name, "\n"))
+scgroup <- files[[sc_name]]
+## - sc Data
+sc_est <- readRDS(scgroup$dataset)
+## - params
+celltypes_label <- scgroup$label_cell
+samples_label <- scgroup$label_sample
+celltypes <- scgroup$celltype
+results[[sc_name]] <- list()
+for (bulk_name in names(scgroup$bulk)) {
+cat(paste0(" - bulk-group:", bulk_name, "\n"))
+bulkgroup <- scgroup$bulk[[bulk_name]]
+## - bulk Data
+bulk_est <- readRDS(bulkgroup$dataset)
+## - bulk params
+pheno_facts <- bulkgroup$pheno_facts
+pheno_excl <- bulkgroup$pheno_excl
+##
+results[[sc_name]][[bulk_name]] <- process_pair(
+sc_est, bulk_est,
+celltypes_label, samples_label,
+celltypes, bulkgroup$factor_group)
+##
+rm(bulk_est) ## unload
+}
+rm(sc_est) ## unload
+}
+return(results)
+}
+plot_all_individual_heatmaps <- function(results) {
+pdf(out_heatmulti_pdf, width = 8, height = 8)
+for (sc_name in names(results)) {
+for (bk_name in names(results[[sc_name]])) {
+res <- results[[sc_name]][[bk_name]]
+plot_hmap <- Prop_heat_Est(
+data.matrix(res[[method_key]]), method.name = est_method) +
+ggtitle(paste0("[", est_method, "Cell type ",
+"proportions in ",
+bk_name, " (Bulk) based on ",
+sc_name, " (scRNA)")) +
+xlab("Cell Types (scRNA)") +
+ylab("Samples (Bulk)") +
+theme(axis.text.x = element_text(angle = -90),
+axis.text.y = element_text(size = 6))
+print(plot_hmap)
+}
+}
+dev.off()
+}
+merge_factors_spread <- function(grudat_spread, factor_groups) {
+## Generated
+merge_it <- function(matr, plot_groups, valname) {
+ren <- melt(lapply(matr, function(mat) {
+mat["ct"] <- rownames(mat); return(mat)}))
+## - Grab factors and merge into list
+ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
+colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
+return(ren_new)
+}
+tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
+merge_it(grudat$spread$scale, factor_groups, "value.scale"),
+by = c("Sample", "Cell", "Bulk", "Factors"))
+return(tab)
+}
+unlist_names <- function(results, method, prepend_bkname=FALSE) {
+unique(sort(
+unlist(lapply(names(results), function(scname) {
+lapply(names(results[[scname]]), function(bkname) {
+res <- get(method)(results[[scname]][[bkname]][[method_key]])
+if (prepend_bkname) {
+## We *do not* assume unique bulk sample names
+## across different bulk datasets.
+res <- paste0(bkname, delim, res)
+}
+return(res)
+})
+}))
+))
+}
+summarized_matrix <- function(results) {  # nolint
+## We assume that cell types MUST be unique, but that sample
+## names do not need to be. For this reason, we must prepend
+## the bulk dataset name to the individual sample names.
+all_celltypes <- unlist_names(results, "colnames")
+all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)
+## Iterate through all possible samples and populate a table.
+ddff <- data.frame()
+ddff_scale <- data.frame()
+for (cell in all_celltypes) {
+for (sample in all_samples) {
+group_sname <- unlist(strsplit(sample, split = delim))
+bulk <- group_sname[1]
+id_sample <- group_sname[2]
+for (scgroup in names(results)) {
+if (bulk %in% names(results[[scgroup]])) {
+mat_prop <- results[[scgroup]][[bulk]][[method_key]]
+vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
+## - We use sample instead of id_sample because we need to
+##   extract bulk sets from the complete matrix later. It's
+##   messy, yes.
+if (cell %in% colnames(mat_prop)) {
+ddff[cell, sample] <- mat_prop[id_sample, cell]
+ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
+} else {
+ddff[cell, sample] <- 0
+ddff_scale[cell, sample] <- 0
+}
+}
+}
+}
+}
+return(list(prop = ddff, scaled = ddff_scale))
+}
+flatten_factor_list <- function(results) {
+## Get a 2d DF of all factors across all bulk samples.
+res <- c()
+for (scgroup in names(results)) {
+for (bulkgroup in names(results[[scgroup]])) {
+dat <- results[[scgroup]][[bulkgroup]]$plot_groups
+dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint
+res <- rbind(res, dat)
+}
+}
+return(res)
+}
+group_by_dataset <- function(summat) {
+bulk_names <- unlist(
+lapply(names(files), function(x) names(files[[x]]$bulk)))
+mat_names <- colnames(summat$prop)
+bd <- list()
+bd_scale <- list()
+bd_spread_scale <- list()
+bd_spread_prop <- list()
+for (bname in bulk_names) {
+subs <- mat_names[startsWith(mat_names, paste0(bname, delim))]
+## -
+bd[[bname]] <- rowSums(summat$prop[, subs])
+bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
+bd_spread_scale[[bname]] <- summat$scaled[, subs]
+bd_spread_prop[[bname]] <- summat$prop[, subs]
+}
+return(list(prop = as.data.frame(bd),
+scaled = as.data.frame(bd_scale),
+spread = list(scale = bd_spread_scale,
+prop = bd_spread_prop)))
+}
+do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name,
+xlabs="", ylabs="", titled="",
+order_col=T, order_row=T, size=11) {
+data_m <- grudat_spread_melt
+data_matrix <- {
+tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name)
+rownames(tmp) <- tmp[[yaxis]]
+tmp[[yaxis]] <- NULL
+tmp
+}
+dist_method <- "euclidean"
+clust_method <- "complete"
+if (order_row) {
+dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method))
+row_ord <- order.dendrogram(dd_row)
+ordered_row_names <- row.names(data_matrix[row_ord, ])
+data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names)
+}
+if (order_col) {
+dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method),
+method = clust_method))
+col_ord <- order.dendrogram(dd_col)
+ordered_col_names <- colnames(data_matrix[, col_ord])
+data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names)
+}
+heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) +
+geom_tile(colour = "white") +
+scale_fill_gradient2(low = "steelblue", high = "red", mid = "white",
+name = element_blank()) +
+scale_y_discrete(position = "right") +
+theme(axis.text.x = element_text(angle = -90, hjust = 0,
+size = size)) +
+ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs)
+## Graphics
+dendro_linesize <- 0.5
+dendro_colunit <- 0.2
+dendro_rowunit <- 0.1
+final_plot <- heat_plot
+if (order_row) {
+dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle")
+dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) +
+ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row),
+ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend),
+size = dendro_linesize) + ggplot2::coord_flip()
+final_plot <- cowplot::insert_yaxis_grob(
+final_plot, dendro_row, grid::unit(dendro_colunit, "null"),
+position = "left")
+}
+if (order_col) {
+dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle")
+dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") +
+ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col),
+ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
+size = dendro_linesize)
+final_plot <- cowplot::insert_xaxis_grob(
+final_plot, dendro_col, grid::unit(dendro_rowunit, "null"),
+position = "top")
+}
+return(cowplot::ggdraw(final_plot))
+}
+summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") {
+## - Cluster is either "Rows", "Cols", "Both", or "None"
+do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
+ylabs = element_blank(), xlabs = element_blank(),
+use_log = TRUE, size = 11) {
+## Convert from matrix to long format
+melted <- grudat_melted ## copy?
+if (use_log) {
+melted[[fillval]] <- log10(melted[[fillval]] + 1)
+}
+if (cluster == "None") {
+return(ggplot(melted) +
+geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
+colour = "white") +
+scale_fill_gradient2(
+low = "steelblue", high = "red", mid = "white",
+name = element_blank()) +
+theme(axis.text.x = element_text(
+angle = -90, hjust = 0, size = size)) +
+ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
+} else {
+return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval,
+xlabs, ylabs, title,
+(cluster %in% c("Cols", "Both")),
+(cluster %in% c("Rows", "Both"))))
+}
+}
+do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
+do_logged <- (plot %in% c("log", "both"))
+do_normal <- (plot %in% c("normal", "both"))
+plist <- list()
+if (do_logged) {
+plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+"value.scale", "Reads (log10+1)",
+size = size)
+plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+"value.prop", "Sample (log10+1)",
+size = size)
+}
+if (do_normal) {
+plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+"value.scale", "Reads", use_log = F,
+size = size)
+plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+"value.prop", "Sample", use_log = F,
+size = size)
+}
+return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
+plot_grid(plotlist = plist, ncol = ncol),
+ncol = 1, rel_heights = c(0.05, 0.95)))
+}
+p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both")
+p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal",
+ncol = 1, size = 8)
+p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log",
+ncol = 1, size = 8)
+p3 <- ggplot + theme_void()
+if (do_factors) {
+p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both")
+}
+return(list(bulk = p1,
+samples = list(log = p2b, normal = p2a),
+factors = p3))
+}
+summarize_boxplots <- function(grudat_spread, do_factors) {
+common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
+xlab(element_blank()) + ylab(element_blank())
+common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
+xlab(element_blank()) + ylab(element_blank())
+A <- B <- list() #nolint
+## Cell type by sample
+A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
+A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
+## Sample by Cell type
+B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+ylab("Bulk Dataset")
+B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+ylab("Bulk Dataset")
+## -- Factor plots are optional
+A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()
+if (do_factors) {
+A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
+A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
+B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+ylab("Bulk Dataset")
+B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+ylab("Bulk Dataset")
+}
+title_a <- "Cell Types vs Bulk Datasets"
+title_b <- "Bulk Datasets vs Cell Types"
+if (do_factors) {
+title_a <- paste0(title_a, " and Factors")
+title_b <- paste0(title_b, " and Factors")
+}
+a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
+plot_grid(plotlist = A, ncol = 2),
+ncol = 1, rel_heights = c(0.05, 0.95))
+b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
+plot_grid(plotlist = B, ncol = 2),
+ncol = 1, rel_heights = c(0.05, 0.95))
+return(list(cell = a_all, bulk = b_all))
+}
+filter_output <- function(grudat_spread_melt, out_filt) {
+print_red <- function(comment, red_list) {
+cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
+}
+grudat_filt <- grudat_spread_melt
+print_red("Total Cell types:", unique(grudat_filt$Cell))
+if (!is.null(out_filt$cells)) {
+grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
+print_red(" - selecting:", out_filt$cells)
+}
+print_red("Total Factors:", unique(grudat_spread_melt$Factors))
+if (!is.null(out_filt$facts)) {
+grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
+print_red(" - selecting:", out_filt$facts)
+}
+return(grudat_filt)
+}
+writable2 <- function(obj, prefix, title) {
+write.table(obj,
+file = paste0("report_data/", prefix, "_",
+title, ".tabular"),
+quote = F, sep = "\t", col.names = NA)
+}
+results <- music_on_all(files)
+summat <- summarized_matrix(results)
+grudat <- group_by_dataset(summat)
+grudat_spread_melt <- merge_factors_spread(grudat$spread,
+flatten_factor_list(results))
+grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
+plot_all_individual_heatmaps(results)
+## The output filters ONLY apply to boxplots, since these take
+do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
+box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
+heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors,
+dendro_setting)
+pdf(out_heatsumm_pdf, width = 14, height = 14)
+print(heat_maps)
+print(box_plots)
+dev.off()
+## Generate output tables
+stats_prop <- lapply(grudat$spread$prop, function(x) {
+t(apply(x, 1, summary))})
+stats_scale <- lapply(grudat$spread$scale, function(x) {
+t(apply(x, 1, summary))})
+## Make the value table printable
+grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
+colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
+"CT Prop in Sample", "Number of Reads")
+writable2(grudat_spread_melt, "values", "Data Table")
+writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
+writable2({
+aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
+}, "values", "Matrix of Cell Type Read Counts")
+for (bname in names(stats_prop)) {
+writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
+writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
+}

Mercurial > repos > bgruening > music_manipulate_eset

comparison scripts/compare.R @ 0:22232092be53 draft