Mercurial > repos > bgruening > music_compare
view scripts/estimateprops.R.orig @ 2:6c75960f6e56 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d5c7ca22af1d4f0eaa7a607886554bebb95e8c50
author | bgruening |
---|---|
date | Mon, 28 Oct 2024 17:32:54 +0000 |
parents | |
children |
line wrap: on
line source
suppressWarnings(suppressPackageStartupMessages(library(xbioc))) suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) suppressWarnings(suppressPackageStartupMessages(library(reshape2))) suppressWarnings(suppressPackageStartupMessages(library(cowplot))) ## We use this script to estimate the effectiveness of proportion methods ## Load Conf args <- commandArgs(trailingOnly = TRUE) source(args[1]) ## Estimate cell type proportions est_prop <- music_prop( bulk.eset = bulk_eset, sc.eset = scrna_eset, clusters = celltypes_label, samples = samples_label, select.ct = celltypes, verbose = T) estimated_music_props <- est_prop$Est.prop.weighted estimated_nnls_props <- est_prop$Est.prop.allgene ## estimated_music_props_flat <- melt(estimated_music_props) estimated_nnls_props_flat <- melt(estimated_nnls_props) scale_yaxes <- function(gplot, value) { if (is.na(value)) { gplot } else { gplot + scale_y_continuous(lim = c(0, value)) } } sieve_data <- function(func, music_data, nnls_data) { if (func == "list") { res <- list(if ("MuSiC" %in% methods) music_data else NULL, if ("NNLS" %in% methods) nnls_data else NULL) res[lengths(res) > 0] ## filter out NULL elements } else if (func == "rbind") { rbind(if ("MuSiC" %in% methods) music_data else NULL, if ("NNLS" %in% methods) nnls_data else NULL) } else if (func == "c") { c(if ("MuSiC" %in% methods) music_data else NULL, if ("NNLS" %in% methods) nnls_data else NULL) } } ## Show different in estimation methods ## Jitter plot of estimated cell type proportions jitter_fig <- scale_yaxes(Jitter_Est( sieve_data("list", data.matrix(estimated_music_props), data.matrix(estimated_nnls_props)), method.name = methods, title = "Jitter plot of Est Proportions", size = 2, alpha = 0.7) + theme_minimal(), maxyscale) ## Make a Plot ## A more sophisticated jitter plot is provided as below. We separated ## the T2D subjects and normal subjects by their disease factor levels. m_prop <- sieve_data("rbind", estimated_music_props_flat, estimated_nnls_props_flat) colnames(m_prop) <- c("Sub", "CellType", "Prop") if (is.null(celltypes)) { celltypes <- levels(m_prop$CellType) message("No celltypes declared, using:") message(celltypes) } if (is.null(phenotype_factors)) { phenotype_factors <- colnames(pData(bulk_eset)) } ## filter out unwanted factors like "sampleID" and "subjectName" phenotype_factors <- phenotype_factors[ !(phenotype_factors %in% phenotype_factors_always_exclude)] message("Phenotype Factors to use:") message(paste0(phenotype_factors, collapse = ", ")) m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint levels = methods) if (use_disease_factor) { if (phenotype_target_threshold == -99) { phenotype_target_threshold <- -Inf message("phenotype target threshold set to -Inf") } ## the "2" here is to do with the sample groups, not number of methods m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 sample_groups <- c("Normal", sample_disease_group) m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint levels = sample_groups) ## Binary to scale: e.g. TRUE / 5 = 0.2 m_prop$D <- (m_prop$Disease == # nolint sample_disease_group) / sample_disease_group_scale ## NA's are not included in the comparison below m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), subset(m_prop, Disease == sample_disease_group)) jitter_new <- scale_yaxes( ggplot(m_prop, aes(Method, Prop)) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7, position = position_jitter(width = 0.25, height = 0)) + facet_wrap(~ CellType, scales = "free") + scale_colour_manual(values = c("white", "gray20")) + scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale) } if (use_disease_factor) { ## Plot to compare method effectiveness ## Create dataframe for beta cell proportions and Disease_factor levels ## - Ugly code. Essentially, doubles the cell type proportions for each ## set of MuSiC and NNLS methods m_prop_ana <- data.frame( pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint phenotype_factors], ## get proportions of target cell type ct.prop = sieve_data("c", estimated_music_props[, phenotype_scrna_target], estimated_nnls_props[, phenotype_scrna_target]), ## Method = factor(rep(methods, each = nrow(estimated_music_props)), levels = methods)) ## - fix headers colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint ## - drop NA for target phenotype (e.g. hba1c) m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) m_prop_ana$Disease <- factor( # nolint ## - Here we set Normal/Disease assignments across the methods sample_groups[( m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 ], sample_groups) ## - Then we scale this binary assignment to a plotable factor m_prop_ana$D <- (m_prop_ana$Disease == # nolint sample_disease_group) / sample_disease_group_scale jitt_compare <- scale_yaxes( ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7) + facet_wrap(~ Method) + ggtitle(paste0(toupper(phenotype_target), " vs. ", toupper(phenotype_scrna_target), " Cell Type Proportion")) + theme_minimal() + ylab(paste0("Proportion of ", phenotype_scrna_target, " cells")) + xlab(paste0("Level of bulk factor (", phenotype_target, ")")) + scale_colour_manual(values = c("white", "gray20")) + scale_shape_manual(values = c(21, 24)), maxyscale) } ## BoxPlot plot_box <- scale_yaxes(Boxplot_Est( sieve_data("list", data.matrix(estimated_music_props), data.matrix(estimated_nnls_props)), method.name = methods) + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 8)) + ggtitle(element_blank()) + theme_minimal(), maxyscale) ## Heatmap plot_hmap <- Prop_heat_Est( sieve_data( "list", data.matrix(estimated_music_props), data.matrix(estimated_nnls_props)), method.name = methods) + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 6)) pdf(file = outfile_pdf, width = 8, height = 8) if (length(celltypes) <= 8) { plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) } else { print(jitter_fig) plot_box } if (use_disease_factor) { plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) } plot_hmap message(dev.off()) writable <- function(obj, prefix, title) { write.table(obj, file = paste0("report_data/", prefix, "_", title, ".tabular"), quote = F, sep = "\t", col.names = NA) } ## Output Proportions if ("NNLS" %in% methods) { writable(est_prop$Est.prop.allgene, "prop", "NNLS Estimated Proportions of Cell Types") } if ("MuSiC" %in% methods) { writable(est_prop$Est.prop.weighted, "prop", "Music Estimated Proportions of Cell Types") writable(est_prop$Weight.gene, "weightgene", "Music Estimated Proportions of Cell Types (by Gene)") writable(est_prop$r.squared.full, "rsquared", "Music R-sqr Estimated Proportions of Each Subject") writable(est_prop$Var.prop, "varprop", "Matrix of Variance of MuSiC Estimates") } <<<<<<< HEAD ======= write.table(est_prop$Est.prop.weighted, file = paste0("report_data/prop_", "Music Estimated Proportions of Cell Types", ".tabular"), quote = F, sep = "\t", col.names = NA) write.table(est_prop$Est.prop.allgene, file = paste0("report_data/prop_", "NNLS Estimated Proportions of Cell Types", ".tabular"), quote = F, sep = "\t", col.names = NA) write.table(est_prop$Weight.gene, file = paste0("report_data/weightgene_", "Music Estimated Proportions of Cell Types (by Gene)", ".tabular"), quote = F, sep = "\t", col.names = NA) write.table(est_prop$r.squared.full, file = paste0("report_data/rsquared_", "Music R-sqr Estimated Proportions of Each Subject", ".tabular"), quote = F, sep = "\t", col.names = NA) write.table(est_prop$Var.prop, file = paste0("report_data/varprop_", "Matrix of Variance of MuSiC Estimates", ".tabular"), quote = F, sep = "\t", col.names = NA) >>>>>>> 7a416140 (fitting summaries only apply when disease factor is used) if (use_disease_factor) { ## Summary table of linear regressions of disease factors for (meth in methods) { ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = sub_data <- subset(m_prop_ana, Method == meth) ## We can only do regression where there are more than 1 factors ## so we must find and exclude the ones which are not gt1_facts <- sapply(phenotype_factors, function(facname) { return(length(unique(sort(sub_data[[facname]]))) == 1) }) form_factors <- phenotype_factors exclude_facts <- names(gt1_facts)[gt1_facts] if (length(exclude_facts) > 0) { message("Factors with only one level will be excluded:") message(exclude_facts) form_factors <- phenotype_factors[ !(phenotype_factors %in% exclude_facts)] } lm_beta_meth <- lm(as.formula( paste("ct.prop", paste(form_factors, collapse = " + "), sep = " ~ ")), data = sub_data) message(paste0("Summary: ", meth)) capture.output(summary(lm_beta_meth), file = paste0("report_data/summ_Log of ", meth, " fitting.txt")) } }