Mercurial > repos > bgruening > music_manipulate_eset
comparison scripts/compare.R @ 0:22232092be53 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"
author | bgruening |
---|---|
date | Mon, 02 May 2022 09:59:18 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:22232092be53 |
---|---|
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc))) | |
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) | |
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2))) | |
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot))) | |
5 ## We use this script to estimate the effectiveness of proportion methods | |
6 | |
7 ## Load Conf | |
8 args <- commandArgs(trailingOnly = TRUE) | |
9 source(args[1]) | |
10 | |
11 method_key <- list("MuSiC" = "est_music", | |
12 "NNLS" = "est_nnls")[[est_method]] | |
13 | |
14 delim <- "::" ## separator bulk datasets and their samples | |
15 | |
16 scale_yaxes <- function(gplot, value) { | |
17 if (is.na(value)) { | |
18 gplot | |
19 } else { | |
20 gplot + scale_y_continuous(lim = c(0, value)) | |
21 } | |
22 } | |
23 | |
24 | |
25 set_factor_data <- function(bulk_data, factor_name = NULL) { | |
26 if (is.null(factor_name)) { | |
27 factor_name <- "None" ## change to something plottable | |
28 } | |
29 pdat <- pData(bulk_data) | |
30 sam_fact <- NULL | |
31 if (factor_name %in% colnames(pdat)) { | |
32 sam_fact <- cbind(rownames(pdat), | |
33 as.character(pdat[[factor_name]])) | |
34 cat(paste0(" - factor: ", factor_name, | |
35 " found in phenotypes\n")) | |
36 } else { | |
37 ## We assign this as the factor for the entire dataset | |
38 sam_fact <- cbind(rownames(pdat), | |
39 factor_name) | |
40 cat(paste0(" - factor: assigning \"", factor_name, | |
41 "\" to whole dataset\n")) | |
42 } | |
43 colnames(sam_fact) <- c("Samples", "Factors") | |
44 return(as.data.frame(sam_fact)) | |
45 } | |
46 | |
47 ## Due to limiting sizes, we need to load and unload | |
48 ## possibly very large datasets. | |
49 process_pair <- function(sc_data, bulk_data, | |
50 ctypes_label, samples_label, ctypes, | |
51 factor_group) { | |
52 ## - Generate | |
53 est_prop <- music_prop( | |
54 bulk.eset = bulk_data, sc.eset = sc_data, | |
55 clusters = ctypes_label, | |
56 samples = samples_label, select.ct = ctypes, verbose = T) | |
57 ## - | |
58 estimated_music_props <- est_prop$Est.prop.weighted | |
59 estimated_nnls_props <- est_prop$Est.prop.allgene | |
60 ## - | |
61 fact_data <- set_factor_data(bulk_data, factor_group) | |
62 ## - | |
63 return(list(est_music = estimated_music_props, | |
64 est_nnls = estimated_nnls_props, | |
65 bulk_sample_totals = colSums(exprs(bulk_data)), | |
66 plot_groups = fact_data)) | |
67 } | |
68 | |
69 music_on_all <- function(files) { | |
70 results <- list() | |
71 for (sc_name in names(files)) { | |
72 cat(paste0("sc-group:", sc_name, "\n")) | |
73 scgroup <- files[[sc_name]] | |
74 ## - sc Data | |
75 sc_est <- readRDS(scgroup$dataset) | |
76 ## - params | |
77 celltypes_label <- scgroup$label_cell | |
78 samples_label <- scgroup$label_sample | |
79 celltypes <- scgroup$celltype | |
80 | |
81 results[[sc_name]] <- list() | |
82 for (bulk_name in names(scgroup$bulk)) { | |
83 cat(paste0(" - bulk-group:", bulk_name, "\n")) | |
84 bulkgroup <- scgroup$bulk[[bulk_name]] | |
85 ## - bulk Data | |
86 bulk_est <- readRDS(bulkgroup$dataset) | |
87 ## - bulk params | |
88 pheno_facts <- bulkgroup$pheno_facts | |
89 pheno_excl <- bulkgroup$pheno_excl | |
90 ## | |
91 results[[sc_name]][[bulk_name]] <- process_pair( | |
92 sc_est, bulk_est, | |
93 celltypes_label, samples_label, | |
94 celltypes, bulkgroup$factor_group) | |
95 ## | |
96 rm(bulk_est) ## unload | |
97 } | |
98 rm(sc_est) ## unload | |
99 } | |
100 return(results) | |
101 } | |
102 | |
103 plot_all_individual_heatmaps <- function(results) { | |
104 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
105 for (sc_name in names(results)) { | |
106 for (bk_name in names(results[[sc_name]])) { | |
107 res <- results[[sc_name]][[bk_name]] | |
108 plot_hmap <- Prop_heat_Est( | |
109 data.matrix(res[[method_key]]), method.name = est_method) + | |
110 ggtitle(paste0("[", est_method, "Cell type ", | |
111 "proportions in ", | |
112 bk_name, " (Bulk) based on ", | |
113 sc_name, " (scRNA)")) + | |
114 xlab("Cell Types (scRNA)") + | |
115 ylab("Samples (Bulk)") + | |
116 theme(axis.text.x = element_text(angle = -90), | |
117 axis.text.y = element_text(size = 6)) | |
118 print(plot_hmap) | |
119 } | |
120 } | |
121 dev.off() | |
122 } | |
123 | |
124 merge_factors_spread <- function(grudat_spread, factor_groups) { | |
125 ## Generated | |
126 merge_it <- function(matr, plot_groups, valname) { | |
127 ren <- melt(lapply(matr, function(mat) { | |
128 mat["ct"] <- rownames(mat); return(mat)})) | |
129 ## - Grab factors and merge into list | |
130 ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") | |
131 colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") | |
132 return(ren_new) | |
133 } | |
134 tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), | |
135 merge_it(grudat$spread$scale, factor_groups, "value.scale"), | |
136 by = c("Sample", "Cell", "Bulk", "Factors")) | |
137 return(tab) | |
138 } | |
139 | |
140 unlist_names <- function(results, method, prepend_bkname=FALSE) { | |
141 unique(sort( | |
142 unlist(lapply(names(results), function(scname) { | |
143 lapply(names(results[[scname]]), function(bkname) { | |
144 res <- get(method)(results[[scname]][[bkname]][[method_key]]) | |
145 if (prepend_bkname) { | |
146 ## We *do not* assume unique bulk sample names | |
147 ## across different bulk datasets. | |
148 res <- paste0(bkname, delim, res) | |
149 } | |
150 return(res) | |
151 }) | |
152 })) | |
153 )) | |
154 } | |
155 | |
156 summarized_matrix <- function(results) { # nolint | |
157 ## We assume that cell types MUST be unique, but that sample | |
158 ## names do not need to be. For this reason, we must prepend | |
159 ## the bulk dataset name to the individual sample names. | |
160 all_celltypes <- unlist_names(results, "colnames") | |
161 all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) | |
162 | |
163 ## Iterate through all possible samples and populate a table. | |
164 ddff <- data.frame() | |
165 ddff_scale <- data.frame() | |
166 for (cell in all_celltypes) { | |
167 for (sample in all_samples) { | |
168 group_sname <- unlist(strsplit(sample, split = delim)) | |
169 bulk <- group_sname[1] | |
170 id_sample <- group_sname[2] | |
171 for (scgroup in names(results)) { | |
172 if (bulk %in% names(results[[scgroup]])) { | |
173 mat_prop <- results[[scgroup]][[bulk]][[method_key]] | |
174 vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals | |
175 ## - We use sample instead of id_sample because we need to | |
176 ## extract bulk sets from the complete matrix later. It's | |
177 ## messy, yes. | |
178 if (cell %in% colnames(mat_prop)) { | |
179 ddff[cell, sample] <- mat_prop[id_sample, cell] | |
180 ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint | |
181 } else { | |
182 ddff[cell, sample] <- 0 | |
183 ddff_scale[cell, sample] <- 0 | |
184 } | |
185 } | |
186 } | |
187 } | |
188 } | |
189 return(list(prop = ddff, scaled = ddff_scale)) | |
190 } | |
191 | |
192 flatten_factor_list <- function(results) { | |
193 ## Get a 2d DF of all factors across all bulk samples. | |
194 res <- c() | |
195 for (scgroup in names(results)) { | |
196 for (bulkgroup in names(results[[scgroup]])) { | |
197 dat <- results[[scgroup]][[bulkgroup]]$plot_groups | |
198 dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint | |
199 res <- rbind(res, dat) | |
200 } | |
201 } | |
202 return(res) | |
203 } | |
204 | |
205 group_by_dataset <- function(summat) { | |
206 bulk_names <- unlist( | |
207 lapply(names(files), function(x) names(files[[x]]$bulk))) | |
208 mat_names <- colnames(summat$prop) | |
209 bd <- list() | |
210 bd_scale <- list() | |
211 bd_spread_scale <- list() | |
212 bd_spread_prop <- list() | |
213 for (bname in bulk_names) { | |
214 subs <- mat_names[startsWith(mat_names, paste0(bname, delim))] | |
215 ## - | |
216 bd[[bname]] <- rowSums(summat$prop[, subs]) | |
217 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) | |
218 bd_spread_scale[[bname]] <- summat$scaled[, subs] | |
219 bd_spread_prop[[bname]] <- summat$prop[, subs] | |
220 } | |
221 return(list(prop = as.data.frame(bd), | |
222 scaled = as.data.frame(bd_scale), | |
223 spread = list(scale = bd_spread_scale, | |
224 prop = bd_spread_prop))) | |
225 } | |
226 | |
227 do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name, | |
228 xlabs="", ylabs="", titled="", | |
229 order_col=T, order_row=T, size=11) { | |
230 | |
231 data_m <- grudat_spread_melt | |
232 data_matrix <- { | |
233 tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name) | |
234 rownames(tmp) <- tmp[[yaxis]] | |
235 tmp[[yaxis]] <- NULL | |
236 tmp | |
237 } | |
238 dist_method <- "euclidean" | |
239 clust_method <- "complete" | |
240 | |
241 if (order_row) { | |
242 dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method)) | |
243 row_ord <- order.dendrogram(dd_row) | |
244 ordered_row_names <- row.names(data_matrix[row_ord, ]) | |
245 data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names) | |
246 } | |
247 | |
248 if (order_col) { | |
249 dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method), | |
250 method = clust_method)) | |
251 col_ord <- order.dendrogram(dd_col) | |
252 ordered_col_names <- colnames(data_matrix[, col_ord]) | |
253 data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names) | |
254 } | |
255 | |
256 heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) + | |
257 geom_tile(colour = "white") + | |
258 scale_fill_gradient2(low = "steelblue", high = "red", mid = "white", | |
259 name = element_blank()) + | |
260 scale_y_discrete(position = "right") + | |
261 theme(axis.text.x = element_text(angle = -90, hjust = 0, | |
262 size = size)) + | |
263 ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs) | |
264 | |
265 ## Graphics | |
266 dendro_linesize <- 0.5 | |
267 dendro_colunit <- 0.2 | |
268 dendro_rowunit <- 0.1 | |
269 final_plot <- heat_plot | |
270 | |
271 if (order_row) { | |
272 dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle") | |
273 dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) + | |
274 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row), | |
275 ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend), | |
276 size = dendro_linesize) + ggplot2::coord_flip() | |
277 final_plot <- cowplot::insert_yaxis_grob( | |
278 final_plot, dendro_row, grid::unit(dendro_colunit, "null"), | |
279 position = "left") | |
280 } | |
281 if (order_col) { | |
282 dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle") | |
283 dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") + | |
284 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col), | |
285 ggplot2::aes(x = x, y = y, xend = xend, yend = yend), | |
286 size = dendro_linesize) | |
287 final_plot <- cowplot::insert_xaxis_grob( | |
288 final_plot, dendro_col, grid::unit(dendro_rowunit, "null"), | |
289 position = "top") | |
290 } | |
291 return(cowplot::ggdraw(final_plot)) | |
292 } | |
293 | |
294 summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") { | |
295 ## - Cluster is either "Rows", "Cols", "Both", or "None" | |
296 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, | |
297 ylabs = element_blank(), xlabs = element_blank(), | |
298 use_log = TRUE, size = 11) { | |
299 ## Convert from matrix to long format | |
300 melted <- grudat_melted ## copy? | |
301 if (use_log) { | |
302 melted[[fillval]] <- log10(melted[[fillval]] + 1) | |
303 } | |
304 if (cluster == "None") { | |
305 return(ggplot(melted) + | |
306 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), | |
307 colour = "white") + | |
308 scale_fill_gradient2( | |
309 low = "steelblue", high = "red", mid = "white", | |
310 name = element_blank()) + | |
311 theme(axis.text.x = element_text( | |
312 angle = -90, hjust = 0, size = size)) + | |
313 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) | |
314 } else { | |
315 return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval, | |
316 xlabs, ylabs, title, | |
317 (cluster %in% c("Cols", "Both")), | |
318 (cluster %in% c("Rows", "Both")))) | |
319 } | |
320 } | |
321 | |
322 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { | |
323 do_logged <- (plot %in% c("log", "both")) | |
324 do_normal <- (plot %in% c("normal", "both")) | |
325 plist <- list() | |
326 if (do_logged) { | |
327 plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
328 "value.scale", "Reads (log10+1)", | |
329 size = size) | |
330 plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
331 "value.prop", "Sample (log10+1)", | |
332 size = size) | |
333 } | |
334 if (do_normal) { | |
335 plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
336 "value.scale", "Reads", use_log = F, | |
337 size = size) | |
338 plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
339 "value.prop", "Sample", use_log = F, | |
340 size = size) | |
341 } | |
342 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), | |
343 plot_grid(plotlist = plist, ncol = ncol), | |
344 ncol = 1, rel_heights = c(0.05, 0.95))) | |
345 } | |
346 | |
347 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both") | |
348 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", | |
349 ncol = 1, size = 8) | |
350 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", | |
351 ncol = 1, size = 8) | |
352 p3 <- ggplot + theme_void() | |
353 if (do_factors) { | |
354 p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both") | |
355 } | |
356 return(list(bulk = p1, | |
357 samples = list(log = p2b, normal = p2a), | |
358 factors = p3)) | |
359 } | |
360 | |
361 summarize_boxplots <- function(grudat_spread, do_factors) { | |
362 common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + | |
363 xlab(element_blank()) + ylab(element_blank()) | |
364 common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + | |
365 xlab(element_blank()) + ylab(element_blank()) | |
366 | |
367 A <- B <- list() #nolint | |
368 ## Cell type by sample | |
369 A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
370 A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
371 ## Sample by Cell type | |
372 B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
373 ylab("Bulk Dataset") | |
374 B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
375 ylab("Bulk Dataset") | |
376 ## -- Factor plots are optional | |
377 A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() | |
378 | |
379 if (do_factors) { | |
380 A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) | |
381 A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) | |
382 B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
383 ylab("Bulk Dataset") | |
384 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
385 ylab("Bulk Dataset") | |
386 } | |
387 | |
388 title_a <- "Cell Types vs Bulk Datasets" | |
389 title_b <- "Bulk Datasets vs Cell Types" | |
390 if (do_factors) { | |
391 title_a <- paste0(title_a, " and Factors") | |
392 title_b <- paste0(title_b, " and Factors") | |
393 } | |
394 | |
395 a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), | |
396 plot_grid(plotlist = A, ncol = 2), | |
397 ncol = 1, rel_heights = c(0.05, 0.95)) | |
398 b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), | |
399 plot_grid(plotlist = B, ncol = 2), | |
400 ncol = 1, rel_heights = c(0.05, 0.95)) | |
401 return(list(cell = a_all, bulk = b_all)) | |
402 } | |
403 | |
404 filter_output <- function(grudat_spread_melt, out_filt) { | |
405 print_red <- function(comment, red_list) { | |
406 cat(paste(comment, paste(red_list, collapse = ", "), "\n")) | |
407 } | |
408 grudat_filt <- grudat_spread_melt | |
409 print_red("Total Cell types:", unique(grudat_filt$Cell)) | |
410 if (!is.null(out_filt$cells)) { | |
411 grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] | |
412 print_red(" - selecting:", out_filt$cells) | |
413 } | |
414 print_red("Total Factors:", unique(grudat_spread_melt$Factors)) | |
415 if (!is.null(out_filt$facts)) { | |
416 grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] | |
417 print_red(" - selecting:", out_filt$facts) | |
418 } | |
419 return(grudat_filt) | |
420 } | |
421 | |
422 writable2 <- function(obj, prefix, title) { | |
423 write.table(obj, | |
424 file = paste0("report_data/", prefix, "_", | |
425 title, ".tabular"), | |
426 quote = F, sep = "\t", col.names = NA) | |
427 } | |
428 | |
429 | |
430 results <- music_on_all(files) | |
431 summat <- summarized_matrix(results) | |
432 grudat <- group_by_dataset(summat) | |
433 grudat_spread_melt <- merge_factors_spread(grudat$spread, | |
434 flatten_factor_list(results)) | |
435 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) | |
436 | |
437 plot_all_individual_heatmaps(results) | |
438 | |
439 ## The output filters ONLY apply to boxplots, since these take | |
440 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) | |
441 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) | |
442 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors, | |
443 dendro_setting) | |
444 | |
445 pdf(out_heatsumm_pdf, width = 14, height = 14) | |
446 print(heat_maps) | |
447 print(box_plots) | |
448 dev.off() | |
449 | |
450 ## Generate output tables | |
451 stats_prop <- lapply(grudat$spread$prop, function(x) { | |
452 t(apply(x, 1, summary))}) | |
453 stats_scale <- lapply(grudat$spread$scale, function(x) { | |
454 t(apply(x, 1, summary))}) | |
455 | |
456 ## Make the value table printable | |
457 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint | |
458 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", | |
459 "CT Prop in Sample", "Number of Reads") | |
460 | |
461 writable2(grudat_spread_melt, "values", "Data Table") | |
462 writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") | |
463 writable2({ | |
464 aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa | |
465 }, "values", "Matrix of Cell Type Read Counts") | |
466 | |
467 for (bname in names(stats_prop)) { | |
468 writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) | |
469 writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) | |
470 } |