diff scripts/compare.R @ 4:56371b5a2da9 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author bgruening
date Thu, 10 Feb 2022 12:52:31 +0000
parents
children 2ba99a52bd44
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/compare.R	Thu Feb 10 12:52:31 2022 +0000
@@ -0,0 +1,440 @@
+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
+## We use this script to estimate the effectiveness of proportion methods
+
+## Load Conf
+args <- commandArgs(trailingOnly = TRUE)
+source(args[1])
+
+method_key <- list("MuSiC" = "est_music",
+                   "NNLS" = "est_nnls")[[est_method]]
+
+
+scale_yaxes <- function(gplot, value) {
+    if (is.na(value)) {
+        gplot
+    } else {
+        gplot + scale_y_continuous(lim = c(0, value))
+    }
+}
+
+
+set_factor_data <- function(bulk_data, factor_name = NULL) {
+    if (is.null(factor_name)) {
+        factor_name <- "None" ## change to something plottable
+    }
+    pdat <- pData(bulk_data)
+    sam_fact <- NULL
+    if (factor_name %in% colnames(pdat)) {
+        sam_fact <- cbind(rownames(pdat),
+                          as.character(pdat[[factor_name]]))
+        cat(paste0("   - factor: ", factor_name,
+                   " found in phenotypes\n"))
+    } else {
+        ## We assign this as the factor for the entire dataset
+        sam_fact <- cbind(rownames(pdat),
+                          factor_name)
+        cat(paste0("   - factor: assigning \"", factor_name,
+                   "\" to whole dataset\n"))
+    }
+    colnames(sam_fact) <- c("Samples", "Factors")
+    return(as.data.frame(sam_fact))
+}
+
+## Due to limiting sizes, we need to load and unload
+## possibly very large datasets.
+process_pair <- function(sc_data, bulk_data,
+                         ctypes_label, samples_label, ctypes,
+                         factor_group) {
+    ## - Generate
+    est_prop <- music_prop(
+        bulk.eset = bulk_data, sc.eset = sc_data,
+        clusters = ctypes_label,
+        samples = samples_label, select.ct = ctypes, verbose = T)
+    ## -
+    estimated_music_props <- est_prop$Est.prop.weighted
+    estimated_nnls_props <- est_prop$Est.prop.allgene
+    ## -
+    fact_data <- set_factor_data(bulk_data, factor_group)
+    ## -
+    return(list(est_music = estimated_music_props,
+                est_nnls = estimated_nnls_props,
+                bulk_sample_totals = colSums(exprs(bulk_data)),
+                plot_groups = fact_data))
+}
+
+music_on_all <- function(files) {
+    results <- list()
+    for (sc_name in names(files)) {
+        cat(paste0("sc-group:", sc_name, "\n"))
+        scgroup <- files[[sc_name]]
+        ## - sc Data
+        sc_est <- readRDS(scgroup$dataset)
+        ## - params
+        celltypes_label <- scgroup$label_cell
+        samples_label <- scgroup$label_sample
+        celltypes <- scgroup$celltype
+
+        results[[sc_name]] <- list()
+        for (bulk_name in names(scgroup$bulk)) {
+            cat(paste0(" - bulk-group:", bulk_name, "\n"))
+            bulkgroup <- scgroup$bulk[[bulk_name]]
+            ## - bulk Data
+            bulk_est <- readRDS(bulkgroup$dataset)
+            ## - bulk params
+            pheno_facts <- bulkgroup$pheno_facts
+            pheno_excl <- bulkgroup$pheno_excl
+            ##
+            results[[sc_name]][[bulk_name]] <- process_pair(
+                sc_est, bulk_est,
+                celltypes_label, samples_label,
+                celltypes, bulkgroup$factor_group)
+            ##
+            rm(bulk_est) ## unload
+        }
+        rm(sc_est) ## unload
+    }
+    return(results)
+}
+
+plot_all_individual_heatmaps <- function(results) {
+    pdf(out_heatmulti_pdf, width = 8, height = 8)
+    for (sc_name in names(results)) {
+        for (bk_name in names(results[[sc_name]])) {
+            res <- results[[sc_name]][[bk_name]]
+            plot_hmap <- Prop_heat_Est(
+                data.matrix(res[[method_key]]), method.name = est_method) +
+                ggtitle(paste0("[", est_method, "Cell type ",
+                               "proportions in ",
+                               bk_name, " (Bulk) based on ",
+                               sc_name, " (scRNA)")) +
+                xlab("Cell Types (scRNA)") +
+                ylab("Samples (Bulk)") +
+                theme(axis.text.x = element_text(angle = -90),
+                      axis.text.y = element_text(size = 6))
+            print(plot_hmap)
+        }
+    }
+    dev.off()
+}
+
+merge_factors_spread <- function(grudat_spread, factor_groups) {
+    ## Generated
+    merge_it <- function(matr, plot_groups, valname) {
+        ren <- melt(lapply(matr, function(mat) {
+            mat["ct"] <- rownames(mat); return(mat)}))
+        ## - Grab factors and merge into list
+        ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
+        colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
+        return(ren_new)
+    }
+    tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
+                 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
+                 by = c("Sample", "Cell", "Bulk", "Factors"))
+    return(tab)
+}
+
+
+plot_grouped_heatmaps <- function(results) {
+    pdf(out_heatmulti_pdf, width = 8, height = 8)
+    for (sc_name in names(results)) {
+        named_list <- sapply(
+            names(results[[sc_name]]),
+            function(n) {
+                ## We transpose the data here, because
+                ## the plotting function omits by default
+                ## the Y-axis which are the samples.
+                ##  Since the celltypes are the common factor
+                ## these should be the Y-axis instead.
+                t(data.matrix(results[[sc_name]][[n]][[method_key]]))
+            }, simplify = F, USE.NAMES = T)
+        named_methods <- names(results[[sc_name]])
+        ##
+        plot_hmap <- Prop_heat_Est(
+            named_list,
+            method.name = named_methods) +
+            ggtitle(paste0("[", est_method, "] Cell type ",
+                           "proportions of ",
+                           "Bulk Datasets based on ",
+                           sc_name, " (scRNA)")) +
+            xlab("Samples (Bulk)") +
+            ylab("Cell Types (scRNA)") +
+            theme(axis.text.x = element_text(angle = -90),
+                  axis.text.y = element_text(size = 6))
+        print(plot_hmap)
+    }
+    dev.off()
+}
+
+## Desired plots
+## 1. Pie chart:
+##  - Per Bulk dataset (using just normalised proportions)
+##  - Per Bulk dataset (multiplying proportions by nreads)
+
+unlist_names <- function(results, method, prepend_bkname=FALSE) {
+    unique(sort(
+        unlist(lapply(names(results), function(scname) {
+            lapply(names(results[[scname]]), function(bkname) {
+                res <- get(method)(results[[scname]][[bkname]][[method_key]])
+                if (prepend_bkname) {
+                    ## We *do not* assume unique bulk sample names
+                    ## across different bulk datasets.
+                    res <- paste0(bkname, "::", res)
+                }
+                return(res)
+            })
+        }))
+    ))
+}
+
+summarized_matrix <- function(results) {  # nolint
+    ## We assume that cell types MUST be unique, but that sample
+    ## names do not need to be. For this reason, we must prepend
+    ## the bulk dataset name to the individual sample names.
+    all_celltypes <- unlist_names(results, "colnames")
+    all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)
+
+    ## Iterate through all possible samples and populate a table.
+    ddff <- data.frame()
+    ddff_scale <- data.frame()
+    for (cell in all_celltypes) {
+        for (sample in all_samples) {
+            group_sname <- unlist(strsplit(sample, split = "::"))
+            bulk <- group_sname[1]
+            id_sample <- group_sname[2]
+            for (scgroup in names(results)) {
+                if (bulk %in% names(results[[scgroup]])) {
+                    mat_prop <- results[[scgroup]][[bulk]][[method_key]]
+                    vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
+                    ## - We use sample instead of id_sample because we need to
+                    ##   extract bulk sets from the complete matrix later. It's
+                    ##   messy, yes.
+                    if (cell %in% colnames(mat_prop)) {
+                        ddff[cell, sample] <- mat_prop[id_sample, cell]
+                        ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
+                    } else {
+                        ddff[cell, sample] <- 0
+                        ddff_scale[cell, sample] <- 0
+                    }
+                }
+            }
+        }
+    }
+    return(list(prop = ddff, scaled = ddff_scale))
+}
+
+flatten_factor_list <- function(results) {
+    ## Get a 2d DF of all factors across all bulk samples.
+    res <- c()
+    for (scgroup in names(results)) {
+        for (bulkgroup in names(results[[scgroup]])) {
+            dat <- results[[scgroup]][[bulkgroup]]$plot_groups
+            dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint
+            res <- rbind(res, dat)
+        }
+    }
+    return(res)
+}
+
+group_by_dataset <- function(summat) {
+    bulk_names <- unlist(
+        lapply(names(files), function(x) names(files[[x]]$bulk)))
+    mat_names <- colnames(summat$prop)
+    bd <- list()
+    bd_scale <- list()
+    bd_spread_scale <- list()
+    bd_spread_prop <- list()
+    for (bname in bulk_names) {
+        subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))]
+        ## -
+        bd[[bname]] <- rowSums(summat$prop[, subs])
+        bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
+        bd_spread_scale[[bname]] <- summat$scaled[, subs]
+        bd_spread_prop[[bname]] <- summat$prop[, subs]
+    }
+    return(list(prop = as.data.frame(bd),
+                scaled = as.data.frame(bd_scale),
+                spread = list(scale = bd_spread_scale,
+                              prop = bd_spread_prop)))
+}
+
+summarize_heatmaps <- function(grudat_spread_melt, do_factors) {
+    ## -
+    do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
+                          ylabs = element_blank(), xlabs = element_blank(),
+                          use_log = TRUE, size = 11) {
+        ## Convert from matrix to long format
+        melted <- grudat_melted ## copy?
+        if (use_log) {
+            melted[[fillval]] <- log10(melted[[fillval]] + 1)
+        }
+        return(ggplot(melted) +
+               geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
+                         colour = "white") +
+               scale_fill_gradient2(low = "steelblue", high = "red",
+                                    mid = "white", name = element_blank()) +
+               theme(axis.text.x = element_text(angle = -90, hjust = 0,
+                                                size = size)) +
+               ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
+    }
+
+    do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
+        do_logged <- (plot %in% c("log", "both"))
+        do_normal <- (plot %in% c("normal", "both"))
+        plist <- list()
+        if (do_logged) {
+            plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.scale", "Reads (log10+1)",
+                                      size = size)
+            plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.prop", "Sample (log10+1)",
+                                      size = size)
+        }
+        if (do_normal) {
+            plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.scale", "Reads", use_log = F,
+                                      size = size)
+            plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.prop", "Sample", use_log = F,
+                                      size = size)
+        }
+        return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
+                         plot_grid(plotlist = plist, ncol = ncol),
+                         ncol = 1, rel_heights = c(0.05, 0.95)))
+
+    }
+    p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", )
+    p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1,
+                       size = 8)
+    p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1,
+                       size = 8)
+    p3 <- ggplot + theme_void()
+    if (do_factors) {
+        p3 <- do_gridplot("Cell Types against Factors", "Factors", "both")
+    }
+    return(list(bulk = p1,
+                samples = list(log = p2b, normal = p2a),
+                factors = p3))
+}
+
+summarize_boxplots <- function(grudat_spread, do_factors) {
+    common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
+        xlab(element_blank()) + ylab(element_blank())
+    common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
+        xlab(element_blank()) + ylab(element_blank())
+
+    A <- B <- list() #nolint
+    ## Cell type by sample
+    A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
+    A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
+    ## Sample by Cell type
+    B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+        ylab("Bulk Dataset")
+    B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+        ylab("Bulk Dataset")
+    ## -- Factor plots are optional
+    A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()
+
+    if (do_factors) {
+        A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
+        A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
+        B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+            ylab("Bulk Dataset")
+        B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+            ylab("Bulk Dataset")
+    }
+
+    title_a <- "Cell Types against Bulk"
+    title_b <- "Bulk Datasets against Cells"
+    if (do_factors) {
+        title_a <- paste0(title_a, " and Factors")
+        title_b <- paste0(title_b, " and Factors")
+    }
+
+    a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
+                       plot_grid(plotlist = A, ncol = 2),
+                       ncol = 1, rel_heights = c(0.05, 0.95))
+    b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
+                       plot_grid(plotlist = B, ncol = 2),
+                       ncol = 1, rel_heights = c(0.05, 0.95))
+    return(list(cell = a_all, bulk = b_all))
+}
+
+filter_output <- function(grudat_spread_melt, out_filt) {
+    print_red <- function(comment, red_list) {
+        cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
+    }
+    grudat_filt <- grudat_spread_melt
+    print_red("Total Cell types:", unique(grudat_filt$Cell))
+    if (!is.null(out_filt$cells)) {
+        grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
+        print_red(" - selecting:", out_filt$cells)
+    }
+    print_red("Total Factors:", unique(grudat_spread_melt$Factors))
+    if (!is.null(out_filt$facts)) {
+        grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
+        print_red(" - selecting:", out_filt$facts)
+    }
+    return(grudat_filt)
+}
+
+
+results <- music_on_all(files)
+
+if (heat_grouped_p) {
+    plot_grouped_heatmaps(results)
+} else {
+    plot_all_individual_heatmaps(results)
+}
+
+save.image("/tmp/sesh.rds")
+
+summat <- summarized_matrix(results)
+grudat <- group_by_dataset(summat)
+grudat_spread_melt <- merge_factors_spread(grudat$spread,
+                                           flatten_factor_list(results))
+
+
+
+## The output filters ONLY apply to boxplots, since these take
+do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
+
+grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
+
+heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors)
+box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
+
+pdf(out_heatsumm_pdf, width = 14, height = 14)
+print(heat_maps)
+print(box_plots)
+dev.off()
+
+## Generate output tables
+stats_prop <- lapply(grudat$spread$prop, function(x) {
+    t(apply(x, 1, summary))})
+stats_scale <- lapply(grudat$spread$scale, function(x) {
+    t(apply(x, 1, summary))})
+
+writable2 <- function(obj, prefix, title) {
+    write.table(obj,
+                file = paste0("report_data/", prefix, "_",
+                              title, ".tabular"),
+                quote = F, sep = "\t", col.names = NA)
+}
+## Make the value table printable
+grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
+colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
+                                  "CT Prop in Sample", "Number of Reads")
+
+writable2(grudat_spread_melt, "values", "Data Table")
+writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
+writable2({
+    aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
+}, "values", "Matrix of Cell Type Read Counts")
+
+for (bname in names(stats_prop)) {
+    writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
+    writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
+}