comparison scripts/compare.R @ 4:282819d09a4f draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"
author bgruening
date Mon, 02 May 2022 09:58:18 +0000
parents 7ffaa0968da3
children
comparison
equal deleted inserted replaced
3:7ffaa0968da3 4:282819d09a4f
9 source(args[1]) 9 source(args[1])
10 10
11 method_key <- list("MuSiC" = "est_music", 11 method_key <- list("MuSiC" = "est_music",
12 "NNLS" = "est_nnls")[[est_method]] 12 "NNLS" = "est_nnls")[[est_method]]
13 13
14 delim <- "::" ## separator bulk datasets and their samples
14 15
15 scale_yaxes <- function(gplot, value) { 16 scale_yaxes <- function(gplot, value) {
16 if (is.na(value)) { 17 if (is.na(value)) {
17 gplot 18 gplot
18 } else { 19 } else {
134 merge_it(grudat$spread$scale, factor_groups, "value.scale"), 135 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
135 by = c("Sample", "Cell", "Bulk", "Factors")) 136 by = c("Sample", "Cell", "Bulk", "Factors"))
136 return(tab) 137 return(tab)
137 } 138 }
138 139
139
140 plot_grouped_heatmaps <- function(results) {
141 pdf(out_heatmulti_pdf, width = 8, height = 8)
142 for (sc_name in names(results)) {
143 named_list <- sapply(
144 names(results[[sc_name]]),
145 function(n) {
146 ## We transpose the data here, because
147 ## the plotting function omits by default
148 ## the Y-axis which are the samples.
149 ## Since the celltypes are the common factor
150 ## these should be the Y-axis instead.
151 t(data.matrix(results[[sc_name]][[n]][[method_key]]))
152 }, simplify = F, USE.NAMES = T)
153 named_methods <- names(results[[sc_name]])
154 ##
155 plot_hmap <- Prop_heat_Est(
156 named_list,
157 method.name = named_methods) +
158 ggtitle(paste0("[", est_method, "] Cell type ",
159 "proportions of ",
160 "Bulk Datasets based on ",
161 sc_name, " (scRNA)")) +
162 xlab("Samples (Bulk)") +
163 ylab("Cell Types (scRNA)") +
164 theme(axis.text.x = element_text(angle = -90),
165 axis.text.y = element_text(size = 6))
166 print(plot_hmap)
167 }
168 dev.off()
169 }
170
171 ## Desired plots
172 ## 1. Pie chart:
173 ## - Per Bulk dataset (using just normalised proportions)
174 ## - Per Bulk dataset (multiplying proportions by nreads)
175
176 unlist_names <- function(results, method, prepend_bkname=FALSE) { 140 unlist_names <- function(results, method, prepend_bkname=FALSE) {
177 unique(sort( 141 unique(sort(
178 unlist(lapply(names(results), function(scname) { 142 unlist(lapply(names(results), function(scname) {
179 lapply(names(results[[scname]]), function(bkname) { 143 lapply(names(results[[scname]]), function(bkname) {
180 res <- get(method)(results[[scname]][[bkname]][[method_key]]) 144 res <- get(method)(results[[scname]][[bkname]][[method_key]])
181 if (prepend_bkname) { 145 if (prepend_bkname) {
182 ## We *do not* assume unique bulk sample names 146 ## We *do not* assume unique bulk sample names
183 ## across different bulk datasets. 147 ## across different bulk datasets.
184 res <- paste0(bkname, "::", res) 148 res <- paste0(bkname, delim, res)
185 } 149 }
186 return(res) 150 return(res)
187 }) 151 })
188 })) 152 }))
189 )) 153 ))
199 ## Iterate through all possible samples and populate a table. 163 ## Iterate through all possible samples and populate a table.
200 ddff <- data.frame() 164 ddff <- data.frame()
201 ddff_scale <- data.frame() 165 ddff_scale <- data.frame()
202 for (cell in all_celltypes) { 166 for (cell in all_celltypes) {
203 for (sample in all_samples) { 167 for (sample in all_samples) {
204 group_sname <- unlist(strsplit(sample, split = "::")) 168 group_sname <- unlist(strsplit(sample, split = delim))
205 bulk <- group_sname[1] 169 bulk <- group_sname[1]
206 id_sample <- group_sname[2] 170 id_sample <- group_sname[2]
207 for (scgroup in names(results)) { 171 for (scgroup in names(results)) {
208 if (bulk %in% names(results[[scgroup]])) { 172 if (bulk %in% names(results[[scgroup]])) {
209 mat_prop <- results[[scgroup]][[bulk]][[method_key]] 173 mat_prop <- results[[scgroup]][[bulk]][[method_key]]
229 ## Get a 2d DF of all factors across all bulk samples. 193 ## Get a 2d DF of all factors across all bulk samples.
230 res <- c() 194 res <- c()
231 for (scgroup in names(results)) { 195 for (scgroup in names(results)) {
232 for (bulkgroup in names(results[[scgroup]])) { 196 for (bulkgroup in names(results[[scgroup]])) {
233 dat <- results[[scgroup]][[bulkgroup]]$plot_groups 197 dat <- results[[scgroup]][[bulkgroup]]$plot_groups
234 dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint 198 dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint
235 res <- rbind(res, dat) 199 res <- rbind(res, dat)
236 } 200 }
237 } 201 }
238 return(res) 202 return(res)
239 } 203 }
245 bd <- list() 209 bd <- list()
246 bd_scale <- list() 210 bd_scale <- list()
247 bd_spread_scale <- list() 211 bd_spread_scale <- list()
248 bd_spread_prop <- list() 212 bd_spread_prop <- list()
249 for (bname in bulk_names) { 213 for (bname in bulk_names) {
250 subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] 214 subs <- mat_names[startsWith(mat_names, paste0(bname, delim))]
251 ## - 215 ## -
252 bd[[bname]] <- rowSums(summat$prop[, subs]) 216 bd[[bname]] <- rowSums(summat$prop[, subs])
253 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) 217 bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
254 bd_spread_scale[[bname]] <- summat$scaled[, subs] 218 bd_spread_scale[[bname]] <- summat$scaled[, subs]
255 bd_spread_prop[[bname]] <- summat$prop[, subs] 219 bd_spread_prop[[bname]] <- summat$prop[, subs]
258 scaled = as.data.frame(bd_scale), 222 scaled = as.data.frame(bd_scale),
259 spread = list(scale = bd_spread_scale, 223 spread = list(scale = bd_spread_scale,
260 prop = bd_spread_prop))) 224 prop = bd_spread_prop)))
261 } 225 }
262 226
263 summarize_heatmaps <- function(grudat_spread_melt, do_factors) { 227 do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name,
264 ## - 228 xlabs="", ylabs="", titled="",
229 order_col=T, order_row=T, size=11) {
230
231 data_m <- grudat_spread_melt
232 data_matrix <- {
233 tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name)
234 rownames(tmp) <- tmp[[yaxis]]
235 tmp[[yaxis]] <- NULL
236 tmp
237 }
238 dist_method <- "euclidean"
239 clust_method <- "complete"
240
241 if (order_row) {
242 dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method))
243 row_ord <- order.dendrogram(dd_row)
244 ordered_row_names <- row.names(data_matrix[row_ord, ])
245 data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names)
246 }
247
248 if (order_col) {
249 dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method),
250 method = clust_method))
251 col_ord <- order.dendrogram(dd_col)
252 ordered_col_names <- colnames(data_matrix[, col_ord])
253 data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names)
254 }
255
256 heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) +
257 geom_tile(colour = "white") +
258 scale_fill_gradient2(low = "steelblue", high = "red", mid = "white",
259 name = element_blank()) +
260 scale_y_discrete(position = "right") +
261 theme(axis.text.x = element_text(angle = -90, hjust = 0,
262 size = size)) +
263 ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs)
264
265 ## Graphics
266 dendro_linesize <- 0.5
267 dendro_colunit <- 0.2
268 dendro_rowunit <- 0.1
269 final_plot <- heat_plot
270
271 if (order_row) {
272 dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle")
273 dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) +
274 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row),
275 ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend),
276 size = dendro_linesize) + ggplot2::coord_flip()
277 final_plot <- cowplot::insert_yaxis_grob(
278 final_plot, dendro_row, grid::unit(dendro_colunit, "null"),
279 position = "left")
280 }
281 if (order_col) {
282 dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle")
283 dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") +
284 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col),
285 ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
286 size = dendro_linesize)
287 final_plot <- cowplot::insert_xaxis_grob(
288 final_plot, dendro_col, grid::unit(dendro_rowunit, "null"),
289 position = "top")
290 }
291 return(cowplot::ggdraw(final_plot))
292 }
293
294 summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") {
295 ## - Cluster is either "Rows", "Cols", "Both", or "None"
265 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, 296 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
266 ylabs = element_blank(), xlabs = element_blank(), 297 ylabs = element_blank(), xlabs = element_blank(),
267 use_log = TRUE, size = 11) { 298 use_log = TRUE, size = 11) {
268 ## Convert from matrix to long format 299 ## Convert from matrix to long format
269 melted <- grudat_melted ## copy? 300 melted <- grudat_melted ## copy?
270 if (use_log) { 301 if (use_log) {
271 melted[[fillval]] <- log10(melted[[fillval]] + 1) 302 melted[[fillval]] <- log10(melted[[fillval]] + 1)
272 } 303 }
273 return(ggplot(melted) + 304 if (cluster == "None") {
274 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), 305 return(ggplot(melted) +
275 colour = "white") + 306 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
276 scale_fill_gradient2(low = "steelblue", high = "red", 307 colour = "white") +
277 mid = "white", name = element_blank()) + 308 scale_fill_gradient2(
278 theme(axis.text.x = element_text(angle = -90, hjust = 0, 309 low = "steelblue", high = "red", mid = "white",
279 size = size)) + 310 name = element_blank()) +
280 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) 311 theme(axis.text.x = element_text(
312 angle = -90, hjust = 0, size = size)) +
313 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
314 } else {
315 return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval,
316 xlabs, ylabs, title,
317 (cluster %in% c("Cols", "Both")),
318 (cluster %in% c("Rows", "Both"))))
319 }
281 } 320 }
282 321
283 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { 322 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
284 do_logged <- (plot %in% c("log", "both")) 323 do_logged <- (plot %in% c("log", "both"))
285 do_normal <- (plot %in% c("normal", "both")) 324 do_normal <- (plot %in% c("normal", "both"))
301 size = size) 340 size = size)
302 } 341 }
303 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), 342 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
304 plot_grid(plotlist = plist, ncol = ncol), 343 plot_grid(plotlist = plist, ncol = ncol),
305 ncol = 1, rel_heights = c(0.05, 0.95))) 344 ncol = 1, rel_heights = c(0.05, 0.95)))
306 345 }
307 } 346
308 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) 347 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both")
309 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, 348 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal",
310 size = 8) 349 ncol = 1, size = 8)
311 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, 350 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log",
312 size = 8) 351 ncol = 1, size = 8)
313 p3 <- ggplot + theme_void() 352 p3 <- ggplot + theme_void()
314 if (do_factors) { 353 if (do_factors) {
315 p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") 354 p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both")
316 } 355 }
317 return(list(bulk = p1, 356 return(list(bulk = p1,
318 samples = list(log = p2b, normal = p2a), 357 samples = list(log = p2b, normal = p2a),
319 factors = p3)) 358 factors = p3))
320 } 359 }
344 ylab("Bulk Dataset") 383 ylab("Bulk Dataset")
345 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + 384 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
346 ylab("Bulk Dataset") 385 ylab("Bulk Dataset")
347 } 386 }
348 387
349 title_a <- "Cell Types against Bulk" 388 title_a <- "Cell Types vs Bulk Datasets"
350 title_b <- "Bulk Datasets against Cells" 389 title_b <- "Bulk Datasets vs Cell Types"
351 if (do_factors) { 390 if (do_factors) {
352 title_a <- paste0(title_a, " and Factors") 391 title_a <- paste0(title_a, " and Factors")
353 title_b <- paste0(title_b, " and Factors") 392 title_b <- paste0(title_b, " and Factors")
354 } 393 }
355 394
378 print_red(" - selecting:", out_filt$facts) 417 print_red(" - selecting:", out_filt$facts)
379 } 418 }
380 return(grudat_filt) 419 return(grudat_filt)
381 } 420 }
382 421
422 writable2 <- function(obj, prefix, title) {
423 write.table(obj,
424 file = paste0("report_data/", prefix, "_",
425 title, ".tabular"),
426 quote = F, sep = "\t", col.names = NA)
427 }
428
383 429
384 results <- music_on_all(files) 430 results <- music_on_all(files)
385
386 if (heat_grouped_p) {
387 plot_grouped_heatmaps(results)
388 } else {
389 plot_all_individual_heatmaps(results)
390 }
391
392 save.image("/tmp/sesh.rds")
393
394 summat <- summarized_matrix(results) 431 summat <- summarized_matrix(results)
395 grudat <- group_by_dataset(summat) 432 grudat <- group_by_dataset(summat)
396 grudat_spread_melt <- merge_factors_spread(grudat$spread, 433 grudat_spread_melt <- merge_factors_spread(grudat$spread,
397 flatten_factor_list(results)) 434 flatten_factor_list(results))
398 435 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
399 436
437 plot_all_individual_heatmaps(results)
400 438
401 ## The output filters ONLY apply to boxplots, since these take 439 ## The output filters ONLY apply to boxplots, since these take
402 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) 440 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
403
404 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)
405
406 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors)
407 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) 441 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
442 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors,
443 dendro_setting)
408 444
409 pdf(out_heatsumm_pdf, width = 14, height = 14) 445 pdf(out_heatsumm_pdf, width = 14, height = 14)
410 print(heat_maps) 446 print(heat_maps)
411 print(box_plots) 447 print(box_plots)
412 dev.off() 448 dev.off()
415 stats_prop <- lapply(grudat$spread$prop, function(x) { 451 stats_prop <- lapply(grudat$spread$prop, function(x) {
416 t(apply(x, 1, summary))}) 452 t(apply(x, 1, summary))})
417 stats_scale <- lapply(grudat$spread$scale, function(x) { 453 stats_scale <- lapply(grudat$spread$scale, function(x) {
418 t(apply(x, 1, summary))}) 454 t(apply(x, 1, summary))})
419 455
420 writable2 <- function(obj, prefix, title) {
421 write.table(obj,
422 file = paste0("report_data/", prefix, "_",
423 title, ".tabular"),
424 quote = F, sep = "\t", col.names = NA)
425 }
426 ## Make the value table printable 456 ## Make the value table printable
427 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint 457 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
428 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", 458 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
429 "CT Prop in Sample", "Number of Reads") 459 "CT Prop in Sample", "Number of Reads")
430 460