diff scripts/compare.R @ 0:22232092be53 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"
author bgruening
date Mon, 02 May 2022 09:59:18 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/compare.R	Mon May 02 09:59:18 2022 +0000
@@ -0,0 +1,470 @@
+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
+## We use this script to estimate the effectiveness of proportion methods
+
+## Load Conf
+args <- commandArgs(trailingOnly = TRUE)
+source(args[1])
+
+method_key <- list("MuSiC" = "est_music",
+                   "NNLS" = "est_nnls")[[est_method]]
+
+delim <- "::" ## separator bulk datasets and their samples
+
+scale_yaxes <- function(gplot, value) {
+    if (is.na(value)) {
+        gplot
+    } else {
+        gplot + scale_y_continuous(lim = c(0, value))
+    }
+}
+
+
+set_factor_data <- function(bulk_data, factor_name = NULL) {
+    if (is.null(factor_name)) {
+        factor_name <- "None" ## change to something plottable
+    }
+    pdat <- pData(bulk_data)
+    sam_fact <- NULL
+    if (factor_name %in% colnames(pdat)) {
+        sam_fact <- cbind(rownames(pdat),
+                          as.character(pdat[[factor_name]]))
+        cat(paste0("   - factor: ", factor_name,
+                   " found in phenotypes\n"))
+    } else {
+        ## We assign this as the factor for the entire dataset
+        sam_fact <- cbind(rownames(pdat),
+                          factor_name)
+        cat(paste0("   - factor: assigning \"", factor_name,
+                   "\" to whole dataset\n"))
+    }
+    colnames(sam_fact) <- c("Samples", "Factors")
+    return(as.data.frame(sam_fact))
+}
+
+## Due to limiting sizes, we need to load and unload
+## possibly very large datasets.
+process_pair <- function(sc_data, bulk_data,
+                         ctypes_label, samples_label, ctypes,
+                         factor_group) {
+    ## - Generate
+    est_prop <- music_prop(
+        bulk.eset = bulk_data, sc.eset = sc_data,
+        clusters = ctypes_label,
+        samples = samples_label, select.ct = ctypes, verbose = T)
+    ## -
+    estimated_music_props <- est_prop$Est.prop.weighted
+    estimated_nnls_props <- est_prop$Est.prop.allgene
+    ## -
+    fact_data <- set_factor_data(bulk_data, factor_group)
+    ## -
+    return(list(est_music = estimated_music_props,
+                est_nnls = estimated_nnls_props,
+                bulk_sample_totals = colSums(exprs(bulk_data)),
+                plot_groups = fact_data))
+}
+
+music_on_all <- function(files) {
+    results <- list()
+    for (sc_name in names(files)) {
+        cat(paste0("sc-group:", sc_name, "\n"))
+        scgroup <- files[[sc_name]]
+        ## - sc Data
+        sc_est <- readRDS(scgroup$dataset)
+        ## - params
+        celltypes_label <- scgroup$label_cell
+        samples_label <- scgroup$label_sample
+        celltypes <- scgroup$celltype
+
+        results[[sc_name]] <- list()
+        for (bulk_name in names(scgroup$bulk)) {
+            cat(paste0(" - bulk-group:", bulk_name, "\n"))
+            bulkgroup <- scgroup$bulk[[bulk_name]]
+            ## - bulk Data
+            bulk_est <- readRDS(bulkgroup$dataset)
+            ## - bulk params
+            pheno_facts <- bulkgroup$pheno_facts
+            pheno_excl <- bulkgroup$pheno_excl
+            ##
+            results[[sc_name]][[bulk_name]] <- process_pair(
+                sc_est, bulk_est,
+                celltypes_label, samples_label,
+                celltypes, bulkgroup$factor_group)
+            ##
+            rm(bulk_est) ## unload
+        }
+        rm(sc_est) ## unload
+    }
+    return(results)
+}
+
+plot_all_individual_heatmaps <- function(results) {
+    pdf(out_heatmulti_pdf, width = 8, height = 8)
+    for (sc_name in names(results)) {
+        for (bk_name in names(results[[sc_name]])) {
+            res <- results[[sc_name]][[bk_name]]
+            plot_hmap <- Prop_heat_Est(
+                data.matrix(res[[method_key]]), method.name = est_method) +
+                ggtitle(paste0("[", est_method, "Cell type ",
+                               "proportions in ",
+                               bk_name, " (Bulk) based on ",
+                               sc_name, " (scRNA)")) +
+                xlab("Cell Types (scRNA)") +
+                ylab("Samples (Bulk)") +
+                theme(axis.text.x = element_text(angle = -90),
+                      axis.text.y = element_text(size = 6))
+            print(plot_hmap)
+        }
+    }
+    dev.off()
+}
+
+merge_factors_spread <- function(grudat_spread, factor_groups) {
+    ## Generated
+    merge_it <- function(matr, plot_groups, valname) {
+        ren <- melt(lapply(matr, function(mat) {
+            mat["ct"] <- rownames(mat); return(mat)}))
+        ## - Grab factors and merge into list
+        ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
+        colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
+        return(ren_new)
+    }
+    tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
+                 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
+                 by = c("Sample", "Cell", "Bulk", "Factors"))
+    return(tab)
+}
+
+unlist_names <- function(results, method, prepend_bkname=FALSE) {
+    unique(sort(
+        unlist(lapply(names(results), function(scname) {
+            lapply(names(results[[scname]]), function(bkname) {
+                res <- get(method)(results[[scname]][[bkname]][[method_key]])
+                if (prepend_bkname) {
+                    ## We *do not* assume unique bulk sample names
+                    ## across different bulk datasets.
+                    res <- paste0(bkname, delim, res)
+                }
+                return(res)
+            })
+        }))
+    ))
+}
+
+summarized_matrix <- function(results) {  # nolint
+    ## We assume that cell types MUST be unique, but that sample
+    ## names do not need to be. For this reason, we must prepend
+    ## the bulk dataset name to the individual sample names.
+    all_celltypes <- unlist_names(results, "colnames")
+    all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)
+
+    ## Iterate through all possible samples and populate a table.
+    ddff <- data.frame()
+    ddff_scale <- data.frame()
+    for (cell in all_celltypes) {
+        for (sample in all_samples) {
+            group_sname <- unlist(strsplit(sample, split = delim))
+            bulk <- group_sname[1]
+            id_sample <- group_sname[2]
+            for (scgroup in names(results)) {
+                if (bulk %in% names(results[[scgroup]])) {
+                    mat_prop <- results[[scgroup]][[bulk]][[method_key]]
+                    vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
+                    ## - We use sample instead of id_sample because we need to
+                    ##   extract bulk sets from the complete matrix later. It's
+                    ##   messy, yes.
+                    if (cell %in% colnames(mat_prop)) {
+                        ddff[cell, sample] <- mat_prop[id_sample, cell]
+                        ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
+                    } else {
+                        ddff[cell, sample] <- 0
+                        ddff_scale[cell, sample] <- 0
+                    }
+                }
+            }
+        }
+    }
+    return(list(prop = ddff, scaled = ddff_scale))
+}
+
+flatten_factor_list <- function(results) {
+    ## Get a 2d DF of all factors across all bulk samples.
+    res <- c()
+    for (scgroup in names(results)) {
+        for (bulkgroup in names(results[[scgroup]])) {
+            dat <- results[[scgroup]][[bulkgroup]]$plot_groups
+            dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint
+            res <- rbind(res, dat)
+        }
+    }
+    return(res)
+}
+
+group_by_dataset <- function(summat) {
+    bulk_names <- unlist(
+        lapply(names(files), function(x) names(files[[x]]$bulk)))
+    mat_names <- colnames(summat$prop)
+    bd <- list()
+    bd_scale <- list()
+    bd_spread_scale <- list()
+    bd_spread_prop <- list()
+    for (bname in bulk_names) {
+        subs <- mat_names[startsWith(mat_names, paste0(bname, delim))]
+        ## -
+        bd[[bname]] <- rowSums(summat$prop[, subs])
+        bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
+        bd_spread_scale[[bname]] <- summat$scaled[, subs]
+        bd_spread_prop[[bname]] <- summat$prop[, subs]
+    }
+    return(list(prop = as.data.frame(bd),
+                scaled = as.data.frame(bd_scale),
+                spread = list(scale = bd_spread_scale,
+                              prop = bd_spread_prop)))
+}
+
+do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name,
+                       xlabs="", ylabs="", titled="",
+                       order_col=T, order_row=T, size=11) {
+
+    data_m <- grudat_spread_melt
+    data_matrix <- {
+        tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name)
+        rownames(tmp) <- tmp[[yaxis]]
+        tmp[[yaxis]] <- NULL
+        tmp
+    }
+    dist_method <- "euclidean"
+    clust_method <- "complete"
+
+    if (order_row) {
+        dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method))
+        row_ord <- order.dendrogram(dd_row)
+        ordered_row_names <- row.names(data_matrix[row_ord, ])
+        data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names)
+    }
+
+    if (order_col) {
+        dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method),
+                                       method = clust_method))
+        col_ord <- order.dendrogram(dd_col)
+        ordered_col_names <- colnames(data_matrix[, col_ord])
+        data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names)
+    }
+
+    heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) +
+        geom_tile(colour = "white") +
+        scale_fill_gradient2(low = "steelblue", high = "red", mid = "white",
+                             name = element_blank()) +
+        scale_y_discrete(position = "right") +
+        theme(axis.text.x = element_text(angle = -90, hjust = 0,
+                                         size = size)) +
+        ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs)
+
+    ## Graphics
+    dendro_linesize <- 0.5
+    dendro_colunit <- 0.2
+    dendro_rowunit <- 0.1
+    final_plot <- heat_plot
+
+    if (order_row) {
+        dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle")
+        dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) +
+            ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row),
+                                  ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend),
+                                  size = dendro_linesize) + ggplot2::coord_flip()
+        final_plot <- cowplot::insert_yaxis_grob(
+                                   final_plot, dendro_row, grid::unit(dendro_colunit, "null"),
+                                   position = "left")
+    }
+    if (order_col) {
+        dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle")
+        dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") +
+            ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col),
+                                  ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
+                                  size = dendro_linesize)
+        final_plot <- cowplot::insert_xaxis_grob(
+                                   final_plot, dendro_col, grid::unit(dendro_rowunit, "null"),
+                                   position = "top")
+    }
+    return(cowplot::ggdraw(final_plot))
+}
+
+summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") {
+    ## - Cluster is either "Rows", "Cols", "Both", or "None"
+    do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
+                          ylabs = element_blank(), xlabs = element_blank(),
+                          use_log = TRUE, size = 11) {
+        ## Convert from matrix to long format
+        melted <- grudat_melted ## copy?
+        if (use_log) {
+            melted[[fillval]] <- log10(melted[[fillval]] + 1)
+        }
+        if (cluster == "None") {
+            return(ggplot(melted) +
+                   geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
+                             colour = "white") +
+                   scale_fill_gradient2(
+                       low = "steelblue", high = "red", mid = "white",
+                       name = element_blank()) +
+                   theme(axis.text.x = element_text(
+                             angle = -90, hjust = 0, size = size)) +
+                   ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
+        } else {
+            return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval,
+                              xlabs, ylabs, title,
+                              (cluster %in% c("Cols", "Both")),
+                              (cluster %in% c("Rows", "Both"))))
+        }
+    }
+
+    do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
+        do_logged <- (plot %in% c("log", "both"))
+        do_normal <- (plot %in% c("normal", "both"))
+        plist <- list()
+        if (do_logged) {
+            plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.scale", "Reads (log10+1)",
+                                      size = size)
+            plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.prop", "Sample (log10+1)",
+                                      size = size)
+        }
+        if (do_normal) {
+            plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.scale", "Reads", use_log = F,
+                                      size = size)
+            plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
+                                      "value.prop", "Sample", use_log = F,
+                                      size = size)
+        }
+        return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
+                         plot_grid(plotlist = plist, ncol = ncol),
+                         ncol = 1, rel_heights = c(0.05, 0.95)))
+    }
+
+    p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both")
+    p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal",
+                       ncol = 1, size = 8)
+    p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log",
+                       ncol = 1, size = 8)
+    p3 <- ggplot + theme_void()
+    if (do_factors) {
+        p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both")
+    }
+    return(list(bulk = p1,
+                samples = list(log = p2b, normal = p2a),
+                factors = p3))
+}
+
+summarize_boxplots <- function(grudat_spread, do_factors) {
+    common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
+        xlab(element_blank()) + ylab(element_blank())
+    common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
+        xlab(element_blank()) + ylab(element_blank())
+
+    A <- B <- list() #nolint
+    ## Cell type by sample
+    A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
+    A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
+    ## Sample by Cell type
+    B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+        ylab("Bulk Dataset")
+    B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
+        ylab("Bulk Dataset")
+    ## -- Factor plots are optional
+    A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()
+
+    if (do_factors) {
+        A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
+        A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
+        B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+            ylab("Bulk Dataset")
+        B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
+            ylab("Bulk Dataset")
+    }
+
+    title_a <- "Cell Types vs Bulk Datasets"
+    title_b <- "Bulk Datasets vs Cell Types"
+    if (do_factors) {
+        title_a <- paste0(title_a, " and Factors")
+        title_b <- paste0(title_b, " and Factors")
+    }
+
+    a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
+                       plot_grid(plotlist = A, ncol = 2),
+                       ncol = 1, rel_heights = c(0.05, 0.95))
+    b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
+                       plot_grid(plotlist = B, ncol = 2),
+                       ncol = 1, rel_heights = c(0.05, 0.95))
+    return(list(cell = a_all, bulk = b_all))
+}
+
+filter_output <- function(grudat_spread_melt, out_filt) {
+    print_red <- function(comment, red_list) {
+        cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
+    }
+    grudat_filt <- grudat_spread_melt
+    print_red("Total Cell types:", unique(grudat_filt$Cell))
+    if (!is.null(out_filt$cells)) {
+        grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
+        print_red(" - selecting:", out_filt$cells)
+    }
+    print_red("Total Factors:", unique(grudat_spread_melt$Factors))
+    if (!is.null(out_filt$facts)) {
+        grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
+        print_red(" - selecting:", out_filt$facts)
+    }
+    return(grudat_filt)
+}
+
+writable2 <- function(obj, prefix, title) {
+    write.table(obj,
+                file = paste0("report_data/", prefix, "_",
+                              title, ".tabular"),
+                quote = F, sep = "\t", col.names = NA)
+}
+
+
+results <- music_on_all(files)
+summat <- summarized_matrix(results)
+grudat <- group_by_dataset(summat)
+grudat_spread_melt <- merge_factors_spread(grudat$spread,
+                                           flatten_factor_list(results))
+grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
+
+plot_all_individual_heatmaps(results)
+
+## The output filters ONLY apply to boxplots, since these take
+do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
+box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
+heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors,
+                                dendro_setting)
+
+pdf(out_heatsumm_pdf, width = 14, height = 14)
+print(heat_maps)
+print(box_plots)
+dev.off()
+
+## Generate output tables
+stats_prop <- lapply(grudat$spread$prop, function(x) {
+    t(apply(x, 1, summary))})
+stats_scale <- lapply(grudat$spread$scale, function(x) {
+    t(apply(x, 1, summary))})
+
+## Make the value table printable
+grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
+colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
+                                  "CT Prop in Sample", "Number of Reads")
+
+writable2(grudat_spread_melt, "values", "Data Table")
+writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
+writable2({
+    aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
+}, "values", "Matrix of Cell Type Read Counts")
+
+for (bname in names(stats_prop)) {
+    writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
+    writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
+}