comparison scripts/compare.R @ 4:56371b5a2da9 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author bgruening
date Thu, 10 Feb 2022 12:52:31 +0000
parents
children 2ba99a52bd44
comparison
equal deleted inserted replaced
3:fd7a16d073c5 4:56371b5a2da9
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
5 ## We use this script to estimate the effectiveness of proportion methods
6
7 ## Load Conf
8 args <- commandArgs(trailingOnly = TRUE)
9 source(args[1])
10
11 method_key <- list("MuSiC" = "est_music",
12 "NNLS" = "est_nnls")[[est_method]]
13
14
15 scale_yaxes <- function(gplot, value) {
16 if (is.na(value)) {
17 gplot
18 } else {
19 gplot + scale_y_continuous(lim = c(0, value))
20 }
21 }
22
23
24 set_factor_data <- function(bulk_data, factor_name = NULL) {
25 if (is.null(factor_name)) {
26 factor_name <- "None" ## change to something plottable
27 }
28 pdat <- pData(bulk_data)
29 sam_fact <- NULL
30 if (factor_name %in% colnames(pdat)) {
31 sam_fact <- cbind(rownames(pdat),
32 as.character(pdat[[factor_name]]))
33 cat(paste0(" - factor: ", factor_name,
34 " found in phenotypes\n"))
35 } else {
36 ## We assign this as the factor for the entire dataset
37 sam_fact <- cbind(rownames(pdat),
38 factor_name)
39 cat(paste0(" - factor: assigning \"", factor_name,
40 "\" to whole dataset\n"))
41 }
42 colnames(sam_fact) <- c("Samples", "Factors")
43 return(as.data.frame(sam_fact))
44 }
45
46 ## Due to limiting sizes, we need to load and unload
47 ## possibly very large datasets.
48 process_pair <- function(sc_data, bulk_data,
49 ctypes_label, samples_label, ctypes,
50 factor_group) {
51 ## - Generate
52 est_prop <- music_prop(
53 bulk.eset = bulk_data, sc.eset = sc_data,
54 clusters = ctypes_label,
55 samples = samples_label, select.ct = ctypes, verbose = T)
56 ## -
57 estimated_music_props <- est_prop$Est.prop.weighted
58 estimated_nnls_props <- est_prop$Est.prop.allgene
59 ## -
60 fact_data <- set_factor_data(bulk_data, factor_group)
61 ## -
62 return(list(est_music = estimated_music_props,
63 est_nnls = estimated_nnls_props,
64 bulk_sample_totals = colSums(exprs(bulk_data)),
65 plot_groups = fact_data))
66 }
67
68 music_on_all <- function(files) {
69 results <- list()
70 for (sc_name in names(files)) {
71 cat(paste0("sc-group:", sc_name, "\n"))
72 scgroup <- files[[sc_name]]
73 ## - sc Data
74 sc_est <- readRDS(scgroup$dataset)
75 ## - params
76 celltypes_label <- scgroup$label_cell
77 samples_label <- scgroup$label_sample
78 celltypes <- scgroup$celltype
79
80 results[[sc_name]] <- list()
81 for (bulk_name in names(scgroup$bulk)) {
82 cat(paste0(" - bulk-group:", bulk_name, "\n"))
83 bulkgroup <- scgroup$bulk[[bulk_name]]
84 ## - bulk Data
85 bulk_est <- readRDS(bulkgroup$dataset)
86 ## - bulk params
87 pheno_facts <- bulkgroup$pheno_facts
88 pheno_excl <- bulkgroup$pheno_excl
89 ##
90 results[[sc_name]][[bulk_name]] <- process_pair(
91 sc_est, bulk_est,
92 celltypes_label, samples_label,
93 celltypes, bulkgroup$factor_group)
94 ##
95 rm(bulk_est) ## unload
96 }
97 rm(sc_est) ## unload
98 }
99 return(results)
100 }
101
102 plot_all_individual_heatmaps <- function(results) {
103 pdf(out_heatmulti_pdf, width = 8, height = 8)
104 for (sc_name in names(results)) {
105 for (bk_name in names(results[[sc_name]])) {
106 res <- results[[sc_name]][[bk_name]]
107 plot_hmap <- Prop_heat_Est(
108 data.matrix(res[[method_key]]), method.name = est_method) +
109 ggtitle(paste0("[", est_method, "Cell type ",
110 "proportions in ",
111 bk_name, " (Bulk) based on ",
112 sc_name, " (scRNA)")) +
113 xlab("Cell Types (scRNA)") +
114 ylab("Samples (Bulk)") +
115 theme(axis.text.x = element_text(angle = -90),
116 axis.text.y = element_text(size = 6))
117 print(plot_hmap)
118 }
119 }
120 dev.off()
121 }
122
123 merge_factors_spread <- function(grudat_spread, factor_groups) {
124 ## Generated
125 merge_it <- function(matr, plot_groups, valname) {
126 ren <- melt(lapply(matr, function(mat) {
127 mat["ct"] <- rownames(mat); return(mat)}))
128 ## - Grab factors and merge into list
129 ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
130 colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
131 return(ren_new)
132 }
133 tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
134 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
135 by = c("Sample", "Cell", "Bulk", "Factors"))
136 return(tab)
137 }
138
139
140 plot_grouped_heatmaps <- function(results) {
141 pdf(out_heatmulti_pdf, width = 8, height = 8)
142 for (sc_name in names(results)) {
143 named_list <- sapply(
144 names(results[[sc_name]]),
145 function(n) {
146 ## We transpose the data here, because
147 ## the plotting function omits by default
148 ## the Y-axis which are the samples.
149 ## Since the celltypes are the common factor
150 ## these should be the Y-axis instead.
151 t(data.matrix(results[[sc_name]][[n]][[method_key]]))
152 }, simplify = F, USE.NAMES = T)
153 named_methods <- names(results[[sc_name]])
154 ##
155 plot_hmap <- Prop_heat_Est(
156 named_list,
157 method.name = named_methods) +
158 ggtitle(paste0("[", est_method, "] Cell type ",
159 "proportions of ",
160 "Bulk Datasets based on ",
161 sc_name, " (scRNA)")) +
162 xlab("Samples (Bulk)") +
163 ylab("Cell Types (scRNA)") +
164 theme(axis.text.x = element_text(angle = -90),
165 axis.text.y = element_text(size = 6))
166 print(plot_hmap)
167 }
168 dev.off()
169 }
170
171 ## Desired plots
172 ## 1. Pie chart:
173 ## - Per Bulk dataset (using just normalised proportions)
174 ## - Per Bulk dataset (multiplying proportions by nreads)
175
176 unlist_names <- function(results, method, prepend_bkname=FALSE) {
177 unique(sort(
178 unlist(lapply(names(results), function(scname) {
179 lapply(names(results[[scname]]), function(bkname) {
180 res <- get(method)(results[[scname]][[bkname]][[method_key]])
181 if (prepend_bkname) {
182 ## We *do not* assume unique bulk sample names
183 ## across different bulk datasets.
184 res <- paste0(bkname, "::", res)
185 }
186 return(res)
187 })
188 }))
189 ))
190 }
191
192 summarized_matrix <- function(results) { # nolint
193 ## We assume that cell types MUST be unique, but that sample
194 ## names do not need to be. For this reason, we must prepend
195 ## the bulk dataset name to the individual sample names.
196 all_celltypes <- unlist_names(results, "colnames")
197 all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)
198
199 ## Iterate through all possible samples and populate a table.
200 ddff <- data.frame()
201 ddff_scale <- data.frame()
202 for (cell in all_celltypes) {
203 for (sample in all_samples) {
204 group_sname <- unlist(strsplit(sample, split = "::"))
205 bulk <- group_sname[1]
206 id_sample <- group_sname[2]
207 for (scgroup in names(results)) {
208 if (bulk %in% names(results[[scgroup]])) {
209 mat_prop <- results[[scgroup]][[bulk]][[method_key]]
210 vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
211 ## - We use sample instead of id_sample because we need to
212 ## extract bulk sets from the complete matrix later. It's
213 ## messy, yes.
214 if (cell %in% colnames(mat_prop)) {
215 ddff[cell, sample] <- mat_prop[id_sample, cell]
216 ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
217 } else {
218 ddff[cell, sample] <- 0
219 ddff_scale[cell, sample] <- 0
220 }
221 }
222 }
223 }
224 }
225 return(list(prop = ddff, scaled = ddff_scale))
226 }
227
228 flatten_factor_list <- function(results) {
229 ## Get a 2d DF of all factors across all bulk samples.
230 res <- c()
231 for (scgroup in names(results)) {
232 for (bulkgroup in names(results[[scgroup]])) {
233 dat <- results[[scgroup]][[bulkgroup]]$plot_groups
234 dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint
235 res <- rbind(res, dat)
236 }
237 }
238 return(res)
239 }
240
241 group_by_dataset <- function(summat) {
242 bulk_names <- unlist(
243 lapply(names(files), function(x) names(files[[x]]$bulk)))
244 mat_names <- colnames(summat$prop)
245 bd <- list()
246 bd_scale <- list()
247 bd_spread_scale <- list()
248 bd_spread_prop <- list()
249 for (bname in bulk_names) {
250 subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))]
251 ## -
252 bd[[bname]] <- rowSums(summat$prop[, subs])
253 bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
254 bd_spread_scale[[bname]] <- summat$scaled[, subs]
255 bd_spread_prop[[bname]] <- summat$prop[, subs]
256 }
257 return(list(prop = as.data.frame(bd),
258 scaled = as.data.frame(bd_scale),
259 spread = list(scale = bd_spread_scale,
260 prop = bd_spread_prop)))
261 }
262
263 summarize_heatmaps <- function(grudat_spread_melt, do_factors) {
264 ## -
265 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
266 ylabs = element_blank(), xlabs = element_blank(),
267 use_log = TRUE, size = 11) {
268 ## Convert from matrix to long format
269 melted <- grudat_melted ## copy?
270 if (use_log) {
271 melted[[fillval]] <- log10(melted[[fillval]] + 1)
272 }
273 return(ggplot(melted) +
274 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
275 colour = "white") +
276 scale_fill_gradient2(low = "steelblue", high = "red",
277 mid = "white", name = element_blank()) +
278 theme(axis.text.x = element_text(angle = -90, hjust = 0,
279 size = size)) +
280 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
281 }
282
283 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
284 do_logged <- (plot %in% c("log", "both"))
285 do_normal <- (plot %in% c("normal", "both"))
286 plist <- list()
287 if (do_logged) {
288 plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
289 "value.scale", "Reads (log10+1)",
290 size = size)
291 plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
292 "value.prop", "Sample (log10+1)",
293 size = size)
294 }
295 if (do_normal) {
296 plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
297 "value.scale", "Reads", use_log = F,
298 size = size)
299 plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
300 "value.prop", "Sample", use_log = F,
301 size = size)
302 }
303 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
304 plot_grid(plotlist = plist, ncol = ncol),
305 ncol = 1, rel_heights = c(0.05, 0.95)))
306
307 }
308 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", )
309 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1,
310 size = 8)
311 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1,
312 size = 8)
313 p3 <- ggplot + theme_void()
314 if (do_factors) {
315 p3 <- do_gridplot("Cell Types against Factors", "Factors", "both")
316 }
317 return(list(bulk = p1,
318 samples = list(log = p2b, normal = p2a),
319 factors = p3))
320 }
321
322 summarize_boxplots <- function(grudat_spread, do_factors) {
323 common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
324 xlab(element_blank()) + ylab(element_blank())
325 common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
326 xlab(element_blank()) + ylab(element_blank())
327
328 A <- B <- list() #nolint
329 ## Cell type by sample
330 A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
331 A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
332 ## Sample by Cell type
333 B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
334 ylab("Bulk Dataset")
335 B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
336 ylab("Bulk Dataset")
337 ## -- Factor plots are optional
338 A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()
339
340 if (do_factors) {
341 A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
342 A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
343 B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
344 ylab("Bulk Dataset")
345 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
346 ylab("Bulk Dataset")
347 }
348
349 title_a <- "Cell Types against Bulk"
350 title_b <- "Bulk Datasets against Cells"
351 if (do_factors) {
352 title_a <- paste0(title_a, " and Factors")
353 title_b <- paste0(title_b, " and Factors")
354 }
355
356 a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
357 plot_grid(plotlist = A, ncol = 2),
358 ncol = 1, rel_heights = c(0.05, 0.95))
359 b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
360 plot_grid(plotlist = B, ncol = 2),
361 ncol = 1, rel_heights = c(0.05, 0.95))
362 return(list(cell = a_all, bulk = b_all))
363 }
364
365 filter_output <- function(grudat_spread_melt, out_filt) {
366 print_red <- function(comment, red_list) {
367 cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
368 }
369 grudat_filt <- grudat_spread_melt
370 print_red("Total Cell types:", unique(grudat_filt$Cell))
371 if (!is.null(out_filt$cells)) {
372 grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
373 print_red(" - selecting:", out_filt$cells)
374 }
375 print_red("Total Factors:", unique(grudat_spread_melt$Factors))
376 if (!is.null(out_filt$facts)) {
377 grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
378 print_red(" - selecting:", out_filt$facts)
379 }
380 return(grudat_filt)
381 }
382
383
384 results <- music_on_all(files)
385
386 if (heat_grouped_p) {
387 plot_grouped_heatmaps(results)
388 } else {
389 plot_all_individual_heatmaps(results)
390 }
391
392 save.image("/tmp/sesh.rds")
393
394 summat <- summarized_matrix(results)
395 grudat <- group_by_dataset(summat)
396 grudat_spread_melt <- merge_factors_spread(grudat$spread,
397 flatten_factor_list(results))
398
399
400
401 ## The output filters ONLY apply to boxplots, since these take
402 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
403
404 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
405
406 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors)
407 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
408
409 pdf(out_heatsumm_pdf, width = 14, height = 14)
410 print(heat_maps)
411 print(box_plots)
412 dev.off()
413
414 ## Generate output tables
415 stats_prop <- lapply(grudat$spread$prop, function(x) {
416 t(apply(x, 1, summary))})
417 stats_scale <- lapply(grudat$spread$scale, function(x) {
418 t(apply(x, 1, summary))})
419
420 writable2 <- function(obj, prefix, title) {
421 write.table(obj,
422 file = paste0("report_data/", prefix, "_",
423 title, ".tabular"),
424 quote = F, sep = "\t", col.names = NA)
425 }
426 ## Make the value table printable
427 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
428 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
429 "CT Prop in Sample", "Number of Reads")
430
431 writable2(grudat_spread_melt, "values", "Data Table")
432 writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
433 writable2({
434 aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
435 }, "values", "Matrix of Cell Type Read Counts")
436
437 for (bname in names(stats_prop)) {
438 writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
439 writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
440 }