comparison scripts/compare.R @ 0:22232092be53 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"
author bgruening
date Mon, 02 May 2022 09:59:18 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:22232092be53
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
5 ## We use this script to estimate the effectiveness of proportion methods
6
7 ## Load Conf
8 args <- commandArgs(trailingOnly = TRUE)
9 source(args[1])
10
11 method_key <- list("MuSiC" = "est_music",
12 "NNLS" = "est_nnls")[[est_method]]
13
14 delim <- "::" ## separator bulk datasets and their samples
15
16 scale_yaxes <- function(gplot, value) {
17 if (is.na(value)) {
18 gplot
19 } else {
20 gplot + scale_y_continuous(lim = c(0, value))
21 }
22 }
23
24
25 set_factor_data <- function(bulk_data, factor_name = NULL) {
26 if (is.null(factor_name)) {
27 factor_name <- "None" ## change to something plottable
28 }
29 pdat <- pData(bulk_data)
30 sam_fact <- NULL
31 if (factor_name %in% colnames(pdat)) {
32 sam_fact <- cbind(rownames(pdat),
33 as.character(pdat[[factor_name]]))
34 cat(paste0(" - factor: ", factor_name,
35 " found in phenotypes\n"))
36 } else {
37 ## We assign this as the factor for the entire dataset
38 sam_fact <- cbind(rownames(pdat),
39 factor_name)
40 cat(paste0(" - factor: assigning \"", factor_name,
41 "\" to whole dataset\n"))
42 }
43 colnames(sam_fact) <- c("Samples", "Factors")
44 return(as.data.frame(sam_fact))
45 }
46
47 ## Due to limiting sizes, we need to load and unload
48 ## possibly very large datasets.
49 process_pair <- function(sc_data, bulk_data,
50 ctypes_label, samples_label, ctypes,
51 factor_group) {
52 ## - Generate
53 est_prop <- music_prop(
54 bulk.eset = bulk_data, sc.eset = sc_data,
55 clusters = ctypes_label,
56 samples = samples_label, select.ct = ctypes, verbose = T)
57 ## -
58 estimated_music_props <- est_prop$Est.prop.weighted
59 estimated_nnls_props <- est_prop$Est.prop.allgene
60 ## -
61 fact_data <- set_factor_data(bulk_data, factor_group)
62 ## -
63 return(list(est_music = estimated_music_props,
64 est_nnls = estimated_nnls_props,
65 bulk_sample_totals = colSums(exprs(bulk_data)),
66 plot_groups = fact_data))
67 }
68
69 music_on_all <- function(files) {
70 results <- list()
71 for (sc_name in names(files)) {
72 cat(paste0("sc-group:", sc_name, "\n"))
73 scgroup <- files[[sc_name]]
74 ## - sc Data
75 sc_est <- readRDS(scgroup$dataset)
76 ## - params
77 celltypes_label <- scgroup$label_cell
78 samples_label <- scgroup$label_sample
79 celltypes <- scgroup$celltype
80
81 results[[sc_name]] <- list()
82 for (bulk_name in names(scgroup$bulk)) {
83 cat(paste0(" - bulk-group:", bulk_name, "\n"))
84 bulkgroup <- scgroup$bulk[[bulk_name]]
85 ## - bulk Data
86 bulk_est <- readRDS(bulkgroup$dataset)
87 ## - bulk params
88 pheno_facts <- bulkgroup$pheno_facts
89 pheno_excl <- bulkgroup$pheno_excl
90 ##
91 results[[sc_name]][[bulk_name]] <- process_pair(
92 sc_est, bulk_est,
93 celltypes_label, samples_label,
94 celltypes, bulkgroup$factor_group)
95 ##
96 rm(bulk_est) ## unload
97 }
98 rm(sc_est) ## unload
99 }
100 return(results)
101 }
102
103 plot_all_individual_heatmaps <- function(results) {
104 pdf(out_heatmulti_pdf, width = 8, height = 8)
105 for (sc_name in names(results)) {
106 for (bk_name in names(results[[sc_name]])) {
107 res <- results[[sc_name]][[bk_name]]
108 plot_hmap <- Prop_heat_Est(
109 data.matrix(res[[method_key]]), method.name = est_method) +
110 ggtitle(paste0("[", est_method, "Cell type ",
111 "proportions in ",
112 bk_name, " (Bulk) based on ",
113 sc_name, " (scRNA)")) +
114 xlab("Cell Types (scRNA)") +
115 ylab("Samples (Bulk)") +
116 theme(axis.text.x = element_text(angle = -90),
117 axis.text.y = element_text(size = 6))
118 print(plot_hmap)
119 }
120 }
121 dev.off()
122 }
123
124 merge_factors_spread <- function(grudat_spread, factor_groups) {
125 ## Generated
126 merge_it <- function(matr, plot_groups, valname) {
127 ren <- melt(lapply(matr, function(mat) {
128 mat["ct"] <- rownames(mat); return(mat)}))
129 ## - Grab factors and merge into list
130 ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
131 colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
132 return(ren_new)
133 }
134 tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
135 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
136 by = c("Sample", "Cell", "Bulk", "Factors"))
137 return(tab)
138 }
139
140 unlist_names <- function(results, method, prepend_bkname=FALSE) {
141 unique(sort(
142 unlist(lapply(names(results), function(scname) {
143 lapply(names(results[[scname]]), function(bkname) {
144 res <- get(method)(results[[scname]][[bkname]][[method_key]])
145 if (prepend_bkname) {
146 ## We *do not* assume unique bulk sample names
147 ## across different bulk datasets.
148 res <- paste0(bkname, delim, res)
149 }
150 return(res)
151 })
152 }))
153 ))
154 }
155
156 summarized_matrix <- function(results) { # nolint
157 ## We assume that cell types MUST be unique, but that sample
158 ## names do not need to be. For this reason, we must prepend
159 ## the bulk dataset name to the individual sample names.
160 all_celltypes <- unlist_names(results, "colnames")
161 all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)
162
163 ## Iterate through all possible samples and populate a table.
164 ddff <- data.frame()
165 ddff_scale <- data.frame()
166 for (cell in all_celltypes) {
167 for (sample in all_samples) {
168 group_sname <- unlist(strsplit(sample, split = delim))
169 bulk <- group_sname[1]
170 id_sample <- group_sname[2]
171 for (scgroup in names(results)) {
172 if (bulk %in% names(results[[scgroup]])) {
173 mat_prop <- results[[scgroup]][[bulk]][[method_key]]
174 vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
175 ## - We use sample instead of id_sample because we need to
176 ## extract bulk sets from the complete matrix later. It's
177 ## messy, yes.
178 if (cell %in% colnames(mat_prop)) {
179 ddff[cell, sample] <- mat_prop[id_sample, cell]
180 ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
181 } else {
182 ddff[cell, sample] <- 0
183 ddff_scale[cell, sample] <- 0
184 }
185 }
186 }
187 }
188 }
189 return(list(prop = ddff, scaled = ddff_scale))
190 }
191
192 flatten_factor_list <- function(results) {
193 ## Get a 2d DF of all factors across all bulk samples.
194 res <- c()
195 for (scgroup in names(results)) {
196 for (bulkgroup in names(results[[scgroup]])) {
197 dat <- results[[scgroup]][[bulkgroup]]$plot_groups
198 dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint
199 res <- rbind(res, dat)
200 }
201 }
202 return(res)
203 }
204
205 group_by_dataset <- function(summat) {
206 bulk_names <- unlist(
207 lapply(names(files), function(x) names(files[[x]]$bulk)))
208 mat_names <- colnames(summat$prop)
209 bd <- list()
210 bd_scale <- list()
211 bd_spread_scale <- list()
212 bd_spread_prop <- list()
213 for (bname in bulk_names) {
214 subs <- mat_names[startsWith(mat_names, paste0(bname, delim))]
215 ## -
216 bd[[bname]] <- rowSums(summat$prop[, subs])
217 bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
218 bd_spread_scale[[bname]] <- summat$scaled[, subs]
219 bd_spread_prop[[bname]] <- summat$prop[, subs]
220 }
221 return(list(prop = as.data.frame(bd),
222 scaled = as.data.frame(bd_scale),
223 spread = list(scale = bd_spread_scale,
224 prop = bd_spread_prop)))
225 }
226
227 do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name,
228 xlabs="", ylabs="", titled="",
229 order_col=T, order_row=T, size=11) {
230
231 data_m <- grudat_spread_melt
232 data_matrix <- {
233 tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name)
234 rownames(tmp) <- tmp[[yaxis]]
235 tmp[[yaxis]] <- NULL
236 tmp
237 }
238 dist_method <- "euclidean"
239 clust_method <- "complete"
240
241 if (order_row) {
242 dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method))
243 row_ord <- order.dendrogram(dd_row)
244 ordered_row_names <- row.names(data_matrix[row_ord, ])
245 data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names)
246 }
247
248 if (order_col) {
249 dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method),
250 method = clust_method))
251 col_ord <- order.dendrogram(dd_col)
252 ordered_col_names <- colnames(data_matrix[, col_ord])
253 data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names)
254 }
255
256 heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) +
257 geom_tile(colour = "white") +
258 scale_fill_gradient2(low = "steelblue", high = "red", mid = "white",
259 name = element_blank()) +
260 scale_y_discrete(position = "right") +
261 theme(axis.text.x = element_text(angle = -90, hjust = 0,
262 size = size)) +
263 ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs)
264
265 ## Graphics
266 dendro_linesize <- 0.5
267 dendro_colunit <- 0.2
268 dendro_rowunit <- 0.1
269 final_plot <- heat_plot
270
271 if (order_row) {
272 dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle")
273 dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) +
274 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row),
275 ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend),
276 size = dendro_linesize) + ggplot2::coord_flip()
277 final_plot <- cowplot::insert_yaxis_grob(
278 final_plot, dendro_row, grid::unit(dendro_colunit, "null"),
279 position = "left")
280 }
281 if (order_col) {
282 dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle")
283 dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") +
284 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col),
285 ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
286 size = dendro_linesize)
287 final_plot <- cowplot::insert_xaxis_grob(
288 final_plot, dendro_col, grid::unit(dendro_rowunit, "null"),
289 position = "top")
290 }
291 return(cowplot::ggdraw(final_plot))
292 }
293
294 summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") {
295 ## - Cluster is either "Rows", "Cols", "Both", or "None"
296 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
297 ylabs = element_blank(), xlabs = element_blank(),
298 use_log = TRUE, size = 11) {
299 ## Convert from matrix to long format
300 melted <- grudat_melted ## copy?
301 if (use_log) {
302 melted[[fillval]] <- log10(melted[[fillval]] + 1)
303 }
304 if (cluster == "None") {
305 return(ggplot(melted) +
306 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
307 colour = "white") +
308 scale_fill_gradient2(
309 low = "steelblue", high = "red", mid = "white",
310 name = element_blank()) +
311 theme(axis.text.x = element_text(
312 angle = -90, hjust = 0, size = size)) +
313 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
314 } else {
315 return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval,
316 xlabs, ylabs, title,
317 (cluster %in% c("Cols", "Both")),
318 (cluster %in% c("Rows", "Both"))))
319 }
320 }
321
322 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
323 do_logged <- (plot %in% c("log", "both"))
324 do_normal <- (plot %in% c("normal", "both"))
325 plist <- list()
326 if (do_logged) {
327 plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
328 "value.scale", "Reads (log10+1)",
329 size = size)
330 plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
331 "value.prop", "Sample (log10+1)",
332 size = size)
333 }
334 if (do_normal) {
335 plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
336 "value.scale", "Reads", use_log = F,
337 size = size)
338 plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
339 "value.prop", "Sample", use_log = F,
340 size = size)
341 }
342 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
343 plot_grid(plotlist = plist, ncol = ncol),
344 ncol = 1, rel_heights = c(0.05, 0.95)))
345 }
346
347 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both")
348 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal",
349 ncol = 1, size = 8)
350 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log",
351 ncol = 1, size = 8)
352 p3 <- ggplot + theme_void()
353 if (do_factors) {
354 p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both")
355 }
356 return(list(bulk = p1,
357 samples = list(log = p2b, normal = p2a),
358 factors = p3))
359 }
360
361 summarize_boxplots <- function(grudat_spread, do_factors) {
362 common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
363 xlab(element_blank()) + ylab(element_blank())
364 common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
365 xlab(element_blank()) + ylab(element_blank())
366
367 A <- B <- list() #nolint
368 ## Cell type by sample
369 A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
370 A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
371 ## Sample by Cell type
372 B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
373 ylab("Bulk Dataset")
374 B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
375 ylab("Bulk Dataset")
376 ## -- Factor plots are optional
377 A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()
378
379 if (do_factors) {
380 A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
381 A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
382 B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
383 ylab("Bulk Dataset")
384 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
385 ylab("Bulk Dataset")
386 }
387
388 title_a <- "Cell Types vs Bulk Datasets"
389 title_b <- "Bulk Datasets vs Cell Types"
390 if (do_factors) {
391 title_a <- paste0(title_a, " and Factors")
392 title_b <- paste0(title_b, " and Factors")
393 }
394
395 a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
396 plot_grid(plotlist = A, ncol = 2),
397 ncol = 1, rel_heights = c(0.05, 0.95))
398 b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
399 plot_grid(plotlist = B, ncol = 2),
400 ncol = 1, rel_heights = c(0.05, 0.95))
401 return(list(cell = a_all, bulk = b_all))
402 }
403
404 filter_output <- function(grudat_spread_melt, out_filt) {
405 print_red <- function(comment, red_list) {
406 cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
407 }
408 grudat_filt <- grudat_spread_melt
409 print_red("Total Cell types:", unique(grudat_filt$Cell))
410 if (!is.null(out_filt$cells)) {
411 grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
412 print_red(" - selecting:", out_filt$cells)
413 }
414 print_red("Total Factors:", unique(grudat_spread_melt$Factors))
415 if (!is.null(out_filt$facts)) {
416 grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
417 print_red(" - selecting:", out_filt$facts)
418 }
419 return(grudat_filt)
420 }
421
422 writable2 <- function(obj, prefix, title) {
423 write.table(obj,
424 file = paste0("report_data/", prefix, "_",
425 title, ".tabular"),
426 quote = F, sep = "\t", col.names = NA)
427 }
428
429
430 results <- music_on_all(files)
431 summat <- summarized_matrix(results)
432 grudat <- group_by_dataset(summat)
433 grudat_spread_melt <- merge_factors_spread(grudat$spread,
434 flatten_factor_list(results))
435 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
436
437 plot_all_individual_heatmaps(results)
438
439 ## The output filters ONLY apply to boxplots, since these take
440 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
441 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
442 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors,
443 dendro_setting)
444
445 pdf(out_heatsumm_pdf, width = 14, height = 14)
446 print(heat_maps)
447 print(box_plots)
448 dev.off()
449
450 ## Generate output tables
451 stats_prop <- lapply(grudat$spread$prop, function(x) {
452 t(apply(x, 1, summary))})
453 stats_scale <- lapply(grudat$spread$scale, function(x) {
454 t(apply(x, 1, summary))})
455
456 ## Make the value table printable
457 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
458 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
459 "CT Prop in Sample", "Number of Reads")
460
461 writable2(grudat_spread_melt, "values", "Data Table")
462 writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
463 writable2({
464 aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
465 }, "values", "Matrix of Cell Type Read Counts")
466
467 for (bname in names(stats_prop)) {
468 writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
469 writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
470 }