Mercurial > repos > bgruening > music_construct_eset
comparison scripts/estimateprops.R.orig @ 5:ada0d6224015 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d5c7ca22af1d4f0eaa7a607886554bebb95e8c50
author | bgruening |
---|---|
date | Mon, 28 Oct 2024 17:32:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:282819d09a4f | 5:ada0d6224015 |
---|---|
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc))) | |
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) | |
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2))) | |
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot))) | |
5 ## We use this script to estimate the effectiveness of proportion methods | |
6 | |
7 ## Load Conf | |
8 args <- commandArgs(trailingOnly = TRUE) | |
9 source(args[1]) | |
10 | |
11 ## Estimate cell type proportions | |
12 est_prop <- music_prop( | |
13 bulk.eset = bulk_eset, sc.eset = scrna_eset, | |
14 clusters = celltypes_label, | |
15 samples = samples_label, select.ct = celltypes, verbose = T) | |
16 | |
17 | |
18 estimated_music_props <- est_prop$Est.prop.weighted | |
19 estimated_nnls_props <- est_prop$Est.prop.allgene | |
20 ## | |
21 estimated_music_props_flat <- melt(estimated_music_props) | |
22 estimated_nnls_props_flat <- melt(estimated_nnls_props) | |
23 | |
24 scale_yaxes <- function(gplot, value) { | |
25 if (is.na(value)) { | |
26 gplot | |
27 } else { | |
28 gplot + scale_y_continuous(lim = c(0, value)) | |
29 } | |
30 } | |
31 | |
32 sieve_data <- function(func, music_data, nnls_data) { | |
33 if (func == "list") { | |
34 res <- list(if ("MuSiC" %in% methods) music_data else NULL, | |
35 if ("NNLS" %in% methods) nnls_data else NULL) | |
36 res[lengths(res) > 0] ## filter out NULL elements | |
37 } else if (func == "rbind") { | |
38 rbind(if ("MuSiC" %in% methods) music_data else NULL, | |
39 if ("NNLS" %in% methods) nnls_data else NULL) | |
40 } else if (func == "c") { | |
41 c(if ("MuSiC" %in% methods) music_data else NULL, | |
42 if ("NNLS" %in% methods) nnls_data else NULL) | |
43 } | |
44 } | |
45 | |
46 | |
47 ## Show different in estimation methods | |
48 ## Jitter plot of estimated cell type proportions | |
49 jitter_fig <- scale_yaxes(Jitter_Est( | |
50 sieve_data("list", | |
51 data.matrix(estimated_music_props), | |
52 data.matrix(estimated_nnls_props)), | |
53 method.name = methods, title = "Jitter plot of Est Proportions", | |
54 size = 2, alpha = 0.7) + theme_minimal(), maxyscale) | |
55 | |
56 ## Make a Plot | |
57 ## A more sophisticated jitter plot is provided as below. We separated | |
58 ## the T2D subjects and normal subjects by their disease factor levels. | |
59 m_prop <- sieve_data("rbind", | |
60 estimated_music_props_flat, | |
61 estimated_nnls_props_flat) | |
62 colnames(m_prop) <- c("Sub", "CellType", "Prop") | |
63 | |
64 if (is.null(celltypes)) { | |
65 celltypes <- levels(m_prop$CellType) | |
66 message("No celltypes declared, using:") | |
67 message(celltypes) | |
68 } | |
69 | |
70 if (is.null(phenotype_factors)) { | |
71 phenotype_factors <- colnames(pData(bulk_eset)) | |
72 } | |
73 ## filter out unwanted factors like "sampleID" and "subjectName" | |
74 phenotype_factors <- phenotype_factors[ | |
75 !(phenotype_factors %in% phenotype_factors_always_exclude)] | |
76 message("Phenotype Factors to use:") | |
77 message(paste0(phenotype_factors, collapse = ", ")) | |
78 | |
79 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint | |
80 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint | |
81 levels = methods) | |
82 | |
83 if (use_disease_factor) { | |
84 | |
85 if (phenotype_target_threshold == -99) { | |
86 phenotype_target_threshold <- -Inf | |
87 message("phenotype target threshold set to -Inf") | |
88 } | |
89 ## the "2" here is to do with the sample groups, not number of methods | |
90 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint | |
91 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] | |
92 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 | |
93 sample_groups <- c("Normal", sample_disease_group) | |
94 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint | |
95 levels = sample_groups) | |
96 | |
97 ## Binary to scale: e.g. TRUE / 5 = 0.2 | |
98 m_prop$D <- (m_prop$Disease == # nolint | |
99 sample_disease_group) / sample_disease_group_scale | |
100 ## NA's are not included in the comparison below | |
101 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), | |
102 subset(m_prop, Disease == sample_disease_group)) | |
103 | |
104 jitter_new <- scale_yaxes( | |
105 ggplot(m_prop, aes(Method, Prop)) + | |
106 geom_point(aes(fill = Method, color = Disease, | |
107 stroke = D, shape = Disease), | |
108 size = 2, alpha = 0.7, | |
109 position = position_jitter(width = 0.25, height = 0)) + | |
110 facet_wrap(~ CellType, scales = "free") + | |
111 scale_colour_manual(values = c("white", "gray20")) + | |
112 scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale) | |
113 | |
114 } | |
115 | |
116 if (use_disease_factor) { | |
117 | |
118 ## Plot to compare method effectiveness | |
119 ## Create dataframe for beta cell proportions and Disease_factor levels | |
120 ## - Ugly code. Essentially, doubles the cell type proportions for each | |
121 ## set of MuSiC and NNLS methods | |
122 m_prop_ana <- data.frame( | |
123 pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint | |
124 phenotype_factors], | |
125 ## get proportions of target cell type | |
126 ct.prop = sieve_data("c", | |
127 estimated_music_props[, phenotype_scrna_target], | |
128 estimated_nnls_props[, phenotype_scrna_target]), | |
129 ## | |
130 Method = factor(rep(methods, | |
131 each = nrow(estimated_music_props)), | |
132 levels = methods)) | |
133 ## - fix headers | |
134 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint | |
135 ## - drop NA for target phenotype (e.g. hba1c) | |
136 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) | |
137 m_prop_ana$Disease <- factor( # nolint | |
138 ## - Here we set Normal/Disease assignments across the methods | |
139 sample_groups[( | |
140 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 | |
141 ], | |
142 sample_groups) | |
143 ## - Then we scale this binary assignment to a plotable factor | |
144 m_prop_ana$D <- (m_prop_ana$Disease == # nolint | |
145 sample_disease_group) / sample_disease_group_scale | |
146 | |
147 jitt_compare <- scale_yaxes( | |
148 ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + | |
149 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + | |
150 geom_point(aes(fill = Method, color = Disease, | |
151 stroke = D, shape = Disease), | |
152 size = 2, alpha = 0.7) + facet_wrap(~ Method) + | |
153 ggtitle(paste0(toupper(phenotype_target), " vs. ", | |
154 toupper(phenotype_scrna_target), | |
155 " Cell Type Proportion")) + | |
156 theme_minimal() + | |
157 ylab(paste0("Proportion of ", | |
158 phenotype_scrna_target, " cells")) + | |
159 xlab(paste0("Level of bulk factor (", phenotype_target, ")")) + | |
160 scale_colour_manual(values = c("white", "gray20")) + | |
161 scale_shape_manual(values = c(21, 24)), maxyscale) | |
162 } | |
163 | |
164 ## BoxPlot | |
165 plot_box <- scale_yaxes(Boxplot_Est( | |
166 sieve_data("list", | |
167 data.matrix(estimated_music_props), | |
168 data.matrix(estimated_nnls_props)), | |
169 method.name = methods) + | |
170 theme(axis.text.x = element_text(angle = -90), | |
171 axis.text.y = element_text(size = 8)) + | |
172 ggtitle(element_blank()) + theme_minimal(), maxyscale) | |
173 | |
174 ## Heatmap | |
175 plot_hmap <- Prop_heat_Est( | |
176 sieve_data( | |
177 "list", | |
178 data.matrix(estimated_music_props), | |
179 data.matrix(estimated_nnls_props)), | |
180 method.name = methods) + | |
181 theme(axis.text.x = element_text(angle = -90), | |
182 axis.text.y = element_text(size = 6)) | |
183 | |
184 pdf(file = outfile_pdf, width = 8, height = 8) | |
185 if (length(celltypes) <= 8) { | |
186 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) | |
187 } else { | |
188 print(jitter_fig) | |
189 plot_box | |
190 } | |
191 if (use_disease_factor) { | |
192 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) | |
193 } | |
194 plot_hmap | |
195 message(dev.off()) | |
196 | |
197 writable <- function(obj, prefix, title) { | |
198 write.table(obj, | |
199 file = paste0("report_data/", prefix, "_", | |
200 title, ".tabular"), | |
201 quote = F, sep = "\t", col.names = NA) | |
202 } | |
203 | |
204 ## Output Proportions | |
205 if ("NNLS" %in% methods) { | |
206 writable(est_prop$Est.prop.allgene, "prop", | |
207 "NNLS Estimated Proportions of Cell Types") | |
208 } | |
209 | |
210 if ("MuSiC" %in% methods) { | |
211 writable(est_prop$Est.prop.weighted, "prop", | |
212 "Music Estimated Proportions of Cell Types") | |
213 writable(est_prop$Weight.gene, "weightgene", | |
214 "Music Estimated Proportions of Cell Types (by Gene)") | |
215 writable(est_prop$r.squared.full, "rsquared", | |
216 "Music R-sqr Estimated Proportions of Each Subject") | |
217 writable(est_prop$Var.prop, "varprop", | |
218 "Matrix of Variance of MuSiC Estimates") | |
219 } | |
220 | |
221 | |
222 <<<<<<< HEAD | |
223 ======= | |
224 write.table(est_prop$Est.prop.weighted, | |
225 file = paste0("report_data/prop_", | |
226 "Music Estimated Proportions of Cell Types", | |
227 ".tabular"), | |
228 quote = F, sep = "\t", col.names = NA) | |
229 write.table(est_prop$Est.prop.allgene, | |
230 file = paste0("report_data/prop_", | |
231 "NNLS Estimated Proportions of Cell Types", | |
232 ".tabular"), | |
233 quote = F, sep = "\t", col.names = NA) | |
234 write.table(est_prop$Weight.gene, | |
235 file = paste0("report_data/weightgene_", | |
236 "Music Estimated Proportions of Cell Types (by Gene)", | |
237 ".tabular"), | |
238 quote = F, sep = "\t", col.names = NA) | |
239 write.table(est_prop$r.squared.full, | |
240 file = paste0("report_data/rsquared_", | |
241 "Music R-sqr Estimated Proportions of Each Subject", | |
242 ".tabular"), | |
243 quote = F, sep = "\t", col.names = NA) | |
244 write.table(est_prop$Var.prop, | |
245 file = paste0("report_data/varprop_", | |
246 "Matrix of Variance of MuSiC Estimates", | |
247 ".tabular"), | |
248 quote = F, sep = "\t", col.names = NA) | |
249 | |
250 | |
251 >>>>>>> 7a416140 (fitting summaries only apply when disease factor is used) | |
252 if (use_disease_factor) { | |
253 ## Summary table of linear regressions of disease factors | |
254 for (meth in methods) { | |
255 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = | |
256 sub_data <- subset(m_prop_ana, Method == meth) | |
257 | |
258 ## We can only do regression where there are more than 1 factors | |
259 ## so we must find and exclude the ones which are not | |
260 gt1_facts <- sapply(phenotype_factors, function(facname) { | |
261 return(length(unique(sort(sub_data[[facname]]))) == 1) | |
262 }) | |
263 form_factors <- phenotype_factors | |
264 exclude_facts <- names(gt1_facts)[gt1_facts] | |
265 if (length(exclude_facts) > 0) { | |
266 message("Factors with only one level will be excluded:") | |
267 message(exclude_facts) | |
268 form_factors <- phenotype_factors[ | |
269 !(phenotype_factors %in% exclude_facts)] | |
270 } | |
271 lm_beta_meth <- lm(as.formula( | |
272 paste("ct.prop", paste(form_factors, collapse = " + "), | |
273 sep = " ~ ")), data = sub_data) | |
274 message(paste0("Summary: ", meth)) | |
275 capture.output(summary(lm_beta_meth), | |
276 file = paste0("report_data/summ_Log of ", | |
277 meth, | |
278 " fitting.txt")) | |
279 } | |
280 } | |
281 |