comparison scripts/estimateprops.R @ 0:10bf0f89035f draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author bgruening
date Thu, 10 Feb 2022 12:53:49 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:10bf0f89035f
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
5 ## We use this script to estimate the effectiveness of proportion methods
6
7 ## Load Conf
8 args <- commandArgs(trailingOnly = TRUE)
9 source(args[1])
10
11 ## Estimate cell type proportions
12 est_prop <- music_prop(
13 bulk.eset = bulk_eset, sc.eset = scrna_eset,
14 clusters = celltypes_label,
15 samples = samples_label, select.ct = celltypes, verbose = T)
16
17
18 estimated_music_props <- est_prop$Est.prop.weighted
19 estimated_nnls_props <- est_prop$Est.prop.allgene
20 ##
21 estimated_music_props_flat <- melt(estimated_music_props)
22 estimated_nnls_props_flat <- melt(estimated_nnls_props)
23
24 scale_yaxes <- function(gplot, value) {
25 if (is.na(value)) {
26 gplot
27 } else {
28 gplot + scale_y_continuous(lim = c(0, value))
29 }
30 }
31
32 sieve_data <- function(func, music_data, nnls_data) {
33 if (func == "list") {
34 res <- list(if ("MuSiC" %in% methods) music_data else NULL,
35 if ("NNLS" %in% methods) nnls_data else NULL)
36 res[lengths(res) > 0] ## filter out NULL elements
37 } else if (func == "rbind") {
38 rbind(if ("MuSiC" %in% methods) music_data else NULL,
39 if ("NNLS" %in% methods) nnls_data else NULL)
40 } else if (func == "c") {
41 c(if ("MuSiC" %in% methods) music_data else NULL,
42 if ("NNLS" %in% methods) nnls_data else NULL)
43 }
44 }
45
46
47 ## Show different in estimation methods
48 ## Jitter plot of estimated cell type proportions
49 jitter_fig <- scale_yaxes(Jitter_Est(
50 sieve_data("list",
51 data.matrix(estimated_music_props),
52 data.matrix(estimated_nnls_props)),
53 method.name = methods, title = "Jitter plot of Est Proportions",
54 size = 2, alpha = 0.7) + theme_minimal(), maxyscale)
55
56 ## Make a Plot
57 ## A more sophisticated jitter plot is provided as below. We separated
58 ## the T2D subjects and normal subjects by their disease factor levels.
59 m_prop <- sieve_data("rbind",
60 estimated_music_props_flat,
61 estimated_nnls_props_flat)
62 colnames(m_prop) <- c("Sub", "CellType", "Prop")
63
64 if (is.null(celltypes)) {
65 celltypes <- levels(m_prop$CellType)
66 message("No celltypes declared, using:")
67 message(celltypes)
68 }
69
70 if (is.null(phenotype_factors)) {
71 phenotype_factors <- colnames(pData(bulk_eset))
72 }
73 ## filter out unwanted factors like "sampleID" and "subjectName"
74 phenotype_factors <- phenotype_factors[
75 !(phenotype_factors %in% phenotype_factors_always_exclude)]
76 message("Phenotype Factors to use:")
77 message(paste0(phenotype_factors, collapse = ", "))
78
79 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint
80 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint
81 levels = methods)
82
83 if (use_disease_factor) {
84
85 if (phenotype_target_threshold == -99) {
86 phenotype_target_threshold <- -Inf
87 message("phenotype target threshold set to -Inf")
88 }
89 ## the "2" here is to do with the sample groups, not number of methods
90 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint
91 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]
92 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2
93 sample_groups <- c("Normal", sample_disease_group)
94 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint
95 levels = sample_groups)
96
97 ## Binary to scale: e.g. TRUE / 5 = 0.2
98 m_prop$D <- (m_prop$Disease == # nolint
99 sample_disease_group) / sample_disease_group_scale
100 ## NA's are not included in the comparison below
101 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),
102 subset(m_prop, Disease == sample_disease_group))
103
104 jitter_new <- scale_yaxes(
105 ggplot(m_prop, aes(Method, Prop)) +
106 geom_point(aes(fill = Method, color = Disease,
107 stroke = D, shape = Disease),
108 size = 2, alpha = 0.7,
109 position = position_jitter(width = 0.25, height = 0)) +
110 facet_wrap(~ CellType, scales = "free") +
111 scale_colour_manual(values = c("white", "gray20")) +
112 scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale)
113
114 }
115
116 if (use_disease_factor) {
117
118 ## Plot to compare method effectiveness
119 ## Create dataframe for beta cell proportions and Disease_factor levels
120 ## - Ugly code. Essentially, doubles the cell type proportions for each
121 ## set of MuSiC and NNLS methods
122 m_prop_ana <- data.frame(
123 pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint
124 phenotype_factors],
125 ## get proportions of target cell type
126 ct.prop = sieve_data("c",
127 estimated_music_props[, phenotype_scrna_target],
128 estimated_nnls_props[, phenotype_scrna_target]),
129 ##
130 Method = factor(rep(methods,
131 each = nrow(estimated_music_props)),
132 levels = methods))
133 ## - fix headers
134 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint
135 ## - drop NA for target phenotype (e.g. hba1c)
136 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target]))
137 m_prop_ana$Disease <- factor( # nolint
138 ## - Here we set Normal/Disease assignments across the methods
139 sample_groups[(
140 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1
141 ],
142 sample_groups)
143 ## - Then we scale this binary assignment to a plotable factor
144 m_prop_ana$D <- (m_prop_ana$Disease == # nolint
145 sample_disease_group) / sample_disease_group_scale
146
147 jitt_compare <- scale_yaxes(
148 ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) +
149 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) +
150 geom_point(aes(fill = Method, color = Disease,
151 stroke = D, shape = Disease),
152 size = 2, alpha = 0.7) + facet_wrap(~ Method) +
153 ggtitle(paste0(toupper(phenotype_target), " vs. ",
154 toupper(phenotype_scrna_target),
155 " Cell Type Proportion")) +
156 theme_minimal() +
157 ylab(paste0("Proportion of ",
158 phenotype_scrna_target, " cells")) +
159 xlab(paste0("Level of bulk factor (", phenotype_target, ")")) +
160 scale_colour_manual(values = c("white", "gray20")) +
161 scale_shape_manual(values = c(21, 24)), maxyscale)
162 }
163
164 ## BoxPlot
165 plot_box <- scale_yaxes(Boxplot_Est(
166 sieve_data("list",
167 data.matrix(estimated_music_props),
168 data.matrix(estimated_nnls_props)),
169 method.name = methods) +
170 theme(axis.text.x = element_text(angle = -90),
171 axis.text.y = element_text(size = 8)) +
172 ggtitle(element_blank()) + theme_minimal(), maxyscale)
173
174 ## Heatmap
175 plot_hmap <- Prop_heat_Est(
176 sieve_data(
177 "list",
178 data.matrix(estimated_music_props),
179 data.matrix(estimated_nnls_props)),
180 method.name = methods) +
181 theme(axis.text.x = element_text(angle = -90),
182 axis.text.y = element_text(size = 6))
183
184 pdf(file = outfile_pdf, width = 8, height = 8)
185 if (length(celltypes) <= 8) {
186 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)
187 } else {
188 print(jitter_fig)
189 plot_box
190 }
191 if (use_disease_factor) {
192 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)
193 }
194 plot_hmap
195 message(dev.off())
196
197 writable <- function(obj, prefix, title) {
198 write.table(obj,
199 file = paste0("report_data/", prefix, "_",
200 title, ".tabular"),
201 quote = F, sep = "\t", col.names = NA)
202 }
203
204 ## Output Proportions
205 if ("NNLS" %in% methods) {
206 writable(est_prop$Est.prop.allgene, "prop",
207 "NNLS Estimated Proportions of Cell Types")
208 }
209
210 if ("MuSiC" %in% methods) {
211 writable(est_prop$Est.prop.weighted, "prop",
212 "Music Estimated Proportions of Cell Types")
213 writable(est_prop$Weight.gene, "weightgene",
214 "Music Estimated Proportions of Cell Types (by Gene)")
215 writable(est_prop$r.squared.full, "rsquared",
216 "Music R-sqr Estimated Proportions of Each Subject")
217 writable(est_prop$Var.prop, "varprop",
218 "Matrix of Variance of MuSiC Estimates")
219 }
220
221
222 if (use_disease_factor) {
223 ## Summary table of linear regressions of disease factors
224 for (meth in methods) {
225 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =
226 sub_data <- subset(m_prop_ana, Method == meth)
227
228 ## We can only do regression where there are more than 1 factors
229 ## so we must find and exclude the ones which are not
230 gt1_facts <- sapply(phenotype_factors, function(facname) {
231 return(length(unique(sort(sub_data[[facname]]))) == 1)
232 })
233 form_factors <- phenotype_factors
234 exclude_facts <- names(gt1_facts)[gt1_facts]
235 if (length(exclude_facts) > 0) {
236 message("Factors with only one level will be excluded:")
237 message(exclude_facts)
238 form_factors <- phenotype_factors[
239 !(phenotype_factors %in% exclude_facts)]
240 }
241 lm_beta_meth <- lm(as.formula(
242 paste("ct.prop", paste(form_factors, collapse = " + "),
243 sep = " ~ ")), data = sub_data)
244 message(paste0("Summary: ", meth))
245 capture.output(summary(lm_beta_meth),
246 file = paste0("report_data/summ_Log of ",
247 meth,
248 " fitting.txt"))
249 }
250 }