Mercurial > repos > bgruening > music_construct_eset
diff scripts/estimateprops.R.orig @ 5:ada0d6224015 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d5c7ca22af1d4f0eaa7a607886554bebb95e8c50
author | bgruening |
---|---|
date | Mon, 28 Oct 2024 17:32:30 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/estimateprops.R.orig Mon Oct 28 17:32:30 2024 +0000 @@ -0,0 +1,281 @@ +suppressWarnings(suppressPackageStartupMessages(library(xbioc))) +suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) +suppressWarnings(suppressPackageStartupMessages(library(reshape2))) +suppressWarnings(suppressPackageStartupMessages(library(cowplot))) +## We use this script to estimate the effectiveness of proportion methods + +## Load Conf +args <- commandArgs(trailingOnly = TRUE) +source(args[1]) + +## Estimate cell type proportions +est_prop <- music_prop( + bulk.eset = bulk_eset, sc.eset = scrna_eset, + clusters = celltypes_label, + samples = samples_label, select.ct = celltypes, verbose = T) + + +estimated_music_props <- est_prop$Est.prop.weighted +estimated_nnls_props <- est_prop$Est.prop.allgene +## +estimated_music_props_flat <- melt(estimated_music_props) +estimated_nnls_props_flat <- melt(estimated_nnls_props) + +scale_yaxes <- function(gplot, value) { + if (is.na(value)) { + gplot + } else { + gplot + scale_y_continuous(lim = c(0, value)) + } +} + +sieve_data <- function(func, music_data, nnls_data) { + if (func == "list") { + res <- list(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + res[lengths(res) > 0] ## filter out NULL elements + } else if (func == "rbind") { + rbind(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } else if (func == "c") { + c(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } +} + + +## Show different in estimation methods +## Jitter plot of estimated cell type proportions +jitter_fig <- scale_yaxes(Jitter_Est( + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + theme_minimal(), maxyscale) + +## Make a Plot +## A more sophisticated jitter plot is provided as below. We separated +## the T2D subjects and normal subjects by their disease factor levels. +m_prop <- sieve_data("rbind", + estimated_music_props_flat, + estimated_nnls_props_flat) +colnames(m_prop) <- c("Sub", "CellType", "Prop") + +if (is.null(celltypes)) { + celltypes <- levels(m_prop$CellType) + message("No celltypes declared, using:") + message(celltypes) +} + +if (is.null(phenotype_factors)) { + phenotype_factors <- colnames(pData(bulk_eset)) +} +## filter out unwanted factors like "sampleID" and "subjectName" +phenotype_factors <- phenotype_factors[ + !(phenotype_factors %in% phenotype_factors_always_exclude)] +message("Phenotype Factors to use:") +message(paste0(phenotype_factors, collapse = ", ")) + +m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint +m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint + levels = methods) + +if (use_disease_factor) { + + if (phenotype_target_threshold == -99) { + phenotype_target_threshold <- -Inf + message("phenotype target threshold set to -Inf") + } + ## the "2" here is to do with the sample groups, not number of methods + m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint + m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] + ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 + sample_groups <- c("Normal", sample_disease_group) + m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint + levels = sample_groups) + + ## Binary to scale: e.g. TRUE / 5 = 0.2 + m_prop$D <- (m_prop$Disease == # nolint + sample_disease_group) / sample_disease_group_scale + ## NA's are not included in the comparison below + m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), + subset(m_prop, Disease == sample_disease_group)) + + jitter_new <- scale_yaxes( + ggplot(m_prop, aes(Method, Prop)) + + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), + size = 2, alpha = 0.7, + position = position_jitter(width = 0.25, height = 0)) + + facet_wrap(~ CellType, scales = "free") + + scale_colour_manual(values = c("white", "gray20")) + + scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale) + +} + +if (use_disease_factor) { + + ## Plot to compare method effectiveness + ## Create dataframe for beta cell proportions and Disease_factor levels + ## - Ugly code. Essentially, doubles the cell type proportions for each + ## set of MuSiC and NNLS methods + m_prop_ana <- data.frame( + pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint + phenotype_factors], + ## get proportions of target cell type + ct.prop = sieve_data("c", + estimated_music_props[, phenotype_scrna_target], + estimated_nnls_props[, phenotype_scrna_target]), + ## + Method = factor(rep(methods, + each = nrow(estimated_music_props)), + levels = methods)) + ## - fix headers + colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint + ## - drop NA for target phenotype (e.g. hba1c) + m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) + m_prop_ana$Disease <- factor( # nolint + ## - Here we set Normal/Disease assignments across the methods + sample_groups[( + m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 + ], + sample_groups) + ## - Then we scale this binary assignment to a plotable factor + m_prop_ana$D <- (m_prop_ana$Disease == # nolint + sample_disease_group) / sample_disease_group_scale + + jitt_compare <- scale_yaxes( + ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), + size = 2, alpha = 0.7) + facet_wrap(~ Method) + + ggtitle(paste0(toupper(phenotype_target), " vs. ", + toupper(phenotype_scrna_target), + " Cell Type Proportion")) + + theme_minimal() + + ylab(paste0("Proportion of ", + phenotype_scrna_target, " cells")) + + xlab(paste0("Level of bulk factor (", phenotype_target, ")")) + + scale_colour_manual(values = c("white", "gray20")) + + scale_shape_manual(values = c(21, 24)), maxyscale) +} + +## BoxPlot +plot_box <- scale_yaxes(Boxplot_Est( + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 8)) + + ggtitle(element_blank()) + theme_minimal(), maxyscale) + +## Heatmap +plot_hmap <- Prop_heat_Est( + sieve_data( + "list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + +pdf(file = outfile_pdf, width = 8, height = 8) +if (length(celltypes) <= 8) { + plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) +} else { + print(jitter_fig) + plot_box +} +if (use_disease_factor) { + plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) +} +plot_hmap +message(dev.off()) + +writable <- function(obj, prefix, title) { + write.table(obj, + file = paste0("report_data/", prefix, "_", + title, ".tabular"), + quote = F, sep = "\t", col.names = NA) +} + +## Output Proportions +if ("NNLS" %in% methods) { + writable(est_prop$Est.prop.allgene, "prop", + "NNLS Estimated Proportions of Cell Types") +} + +if ("MuSiC" %in% methods) { + writable(est_prop$Est.prop.weighted, "prop", + "Music Estimated Proportions of Cell Types") + writable(est_prop$Weight.gene, "weightgene", + "Music Estimated Proportions of Cell Types (by Gene)") + writable(est_prop$r.squared.full, "rsquared", + "Music R-sqr Estimated Proportions of Each Subject") + writable(est_prop$Var.prop, "varprop", + "Matrix of Variance of MuSiC Estimates") +} + + +<<<<<<< HEAD +======= +write.table(est_prop$Est.prop.weighted, + file = paste0("report_data/prop_", + "Music Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Est.prop.allgene, + file = paste0("report_data/prop_", + "NNLS Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Weight.gene, + file = paste0("report_data/weightgene_", + "Music Estimated Proportions of Cell Types (by Gene)", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$r.squared.full, + file = paste0("report_data/rsquared_", + "Music R-sqr Estimated Proportions of Each Subject", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Var.prop, + file = paste0("report_data/varprop_", + "Matrix of Variance of MuSiC Estimates", + ".tabular"), + quote = F, sep = "\t", col.names = NA) + + +>>>>>>> 7a416140 (fitting summaries only apply when disease factor is used) +if (use_disease_factor) { + ## Summary table of linear regressions of disease factors + for (meth in methods) { + ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = + sub_data <- subset(m_prop_ana, Method == meth) + + ## We can only do regression where there are more than 1 factors + ## so we must find and exclude the ones which are not + gt1_facts <- sapply(phenotype_factors, function(facname) { + return(length(unique(sort(sub_data[[facname]]))) == 1) + }) + form_factors <- phenotype_factors + exclude_facts <- names(gt1_facts)[gt1_facts] + if (length(exclude_facts) > 0) { + message("Factors with only one level will be excluded:") + message(exclude_facts) + form_factors <- phenotype_factors[ + !(phenotype_factors %in% exclude_facts)] + } + lm_beta_meth <- lm(as.formula( + paste("ct.prop", paste(form_factors, collapse = " + "), + sep = " ~ ")), data = sub_data) + message(paste0("Summary: ", meth)) + capture.output(summary(lm_beta_meth), + file = paste0("report_data/summ_Log of ", + meth, + " fitting.txt")) + } +} +