Mercurial > repos > bgruening > music_construct_eset
comparison scripts/compare.R @ 4:282819d09a4f draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d007ae51743e621dc47524f681501e72ef3a2910"
author | bgruening |
---|---|
date | Mon, 02 May 2022 09:58:18 +0000 |
parents | 7ffaa0968da3 |
children |
comparison
equal
deleted
inserted
replaced
3:7ffaa0968da3 | 4:282819d09a4f |
---|---|
9 source(args[1]) | 9 source(args[1]) |
10 | 10 |
11 method_key <- list("MuSiC" = "est_music", | 11 method_key <- list("MuSiC" = "est_music", |
12 "NNLS" = "est_nnls")[[est_method]] | 12 "NNLS" = "est_nnls")[[est_method]] |
13 | 13 |
14 delim <- "::" ## separator bulk datasets and their samples | |
14 | 15 |
15 scale_yaxes <- function(gplot, value) { | 16 scale_yaxes <- function(gplot, value) { |
16 if (is.na(value)) { | 17 if (is.na(value)) { |
17 gplot | 18 gplot |
18 } else { | 19 } else { |
134 merge_it(grudat$spread$scale, factor_groups, "value.scale"), | 135 merge_it(grudat$spread$scale, factor_groups, "value.scale"), |
135 by = c("Sample", "Cell", "Bulk", "Factors")) | 136 by = c("Sample", "Cell", "Bulk", "Factors")) |
136 return(tab) | 137 return(tab) |
137 } | 138 } |
138 | 139 |
139 | |
140 plot_grouped_heatmaps <- function(results) { | |
141 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
142 for (sc_name in names(results)) { | |
143 named_list <- sapply( | |
144 names(results[[sc_name]]), | |
145 function(n) { | |
146 ## We transpose the data here, because | |
147 ## the plotting function omits by default | |
148 ## the Y-axis which are the samples. | |
149 ## Since the celltypes are the common factor | |
150 ## these should be the Y-axis instead. | |
151 t(data.matrix(results[[sc_name]][[n]][[method_key]])) | |
152 }, simplify = F, USE.NAMES = T) | |
153 named_methods <- names(results[[sc_name]]) | |
154 ## | |
155 plot_hmap <- Prop_heat_Est( | |
156 named_list, | |
157 method.name = named_methods) + | |
158 ggtitle(paste0("[", est_method, "] Cell type ", | |
159 "proportions of ", | |
160 "Bulk Datasets based on ", | |
161 sc_name, " (scRNA)")) + | |
162 xlab("Samples (Bulk)") + | |
163 ylab("Cell Types (scRNA)") + | |
164 theme(axis.text.x = element_text(angle = -90), | |
165 axis.text.y = element_text(size = 6)) | |
166 print(plot_hmap) | |
167 } | |
168 dev.off() | |
169 } | |
170 | |
171 ## Desired plots | |
172 ## 1. Pie chart: | |
173 ## - Per Bulk dataset (using just normalised proportions) | |
174 ## - Per Bulk dataset (multiplying proportions by nreads) | |
175 | |
176 unlist_names <- function(results, method, prepend_bkname=FALSE) { | 140 unlist_names <- function(results, method, prepend_bkname=FALSE) { |
177 unique(sort( | 141 unique(sort( |
178 unlist(lapply(names(results), function(scname) { | 142 unlist(lapply(names(results), function(scname) { |
179 lapply(names(results[[scname]]), function(bkname) { | 143 lapply(names(results[[scname]]), function(bkname) { |
180 res <- get(method)(results[[scname]][[bkname]][[method_key]]) | 144 res <- get(method)(results[[scname]][[bkname]][[method_key]]) |
181 if (prepend_bkname) { | 145 if (prepend_bkname) { |
182 ## We *do not* assume unique bulk sample names | 146 ## We *do not* assume unique bulk sample names |
183 ## across different bulk datasets. | 147 ## across different bulk datasets. |
184 res <- paste0(bkname, "::", res) | 148 res <- paste0(bkname, delim, res) |
185 } | 149 } |
186 return(res) | 150 return(res) |
187 }) | 151 }) |
188 })) | 152 })) |
189 )) | 153 )) |
199 ## Iterate through all possible samples and populate a table. | 163 ## Iterate through all possible samples and populate a table. |
200 ddff <- data.frame() | 164 ddff <- data.frame() |
201 ddff_scale <- data.frame() | 165 ddff_scale <- data.frame() |
202 for (cell in all_celltypes) { | 166 for (cell in all_celltypes) { |
203 for (sample in all_samples) { | 167 for (sample in all_samples) { |
204 group_sname <- unlist(strsplit(sample, split = "::")) | 168 group_sname <- unlist(strsplit(sample, split = delim)) |
205 bulk <- group_sname[1] | 169 bulk <- group_sname[1] |
206 id_sample <- group_sname[2] | 170 id_sample <- group_sname[2] |
207 for (scgroup in names(results)) { | 171 for (scgroup in names(results)) { |
208 if (bulk %in% names(results[[scgroup]])) { | 172 if (bulk %in% names(results[[scgroup]])) { |
209 mat_prop <- results[[scgroup]][[bulk]][[method_key]] | 173 mat_prop <- results[[scgroup]][[bulk]][[method_key]] |
229 ## Get a 2d DF of all factors across all bulk samples. | 193 ## Get a 2d DF of all factors across all bulk samples. |
230 res <- c() | 194 res <- c() |
231 for (scgroup in names(results)) { | 195 for (scgroup in names(results)) { |
232 for (bulkgroup in names(results[[scgroup]])) { | 196 for (bulkgroup in names(results[[scgroup]])) { |
233 dat <- results[[scgroup]][[bulkgroup]]$plot_groups | 197 dat <- results[[scgroup]][[bulkgroup]]$plot_groups |
234 dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint | 198 dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint |
235 res <- rbind(res, dat) | 199 res <- rbind(res, dat) |
236 } | 200 } |
237 } | 201 } |
238 return(res) | 202 return(res) |
239 } | 203 } |
245 bd <- list() | 209 bd <- list() |
246 bd_scale <- list() | 210 bd_scale <- list() |
247 bd_spread_scale <- list() | 211 bd_spread_scale <- list() |
248 bd_spread_prop <- list() | 212 bd_spread_prop <- list() |
249 for (bname in bulk_names) { | 213 for (bname in bulk_names) { |
250 subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] | 214 subs <- mat_names[startsWith(mat_names, paste0(bname, delim))] |
251 ## - | 215 ## - |
252 bd[[bname]] <- rowSums(summat$prop[, subs]) | 216 bd[[bname]] <- rowSums(summat$prop[, subs]) |
253 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) | 217 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) |
254 bd_spread_scale[[bname]] <- summat$scaled[, subs] | 218 bd_spread_scale[[bname]] <- summat$scaled[, subs] |
255 bd_spread_prop[[bname]] <- summat$prop[, subs] | 219 bd_spread_prop[[bname]] <- summat$prop[, subs] |
258 scaled = as.data.frame(bd_scale), | 222 scaled = as.data.frame(bd_scale), |
259 spread = list(scale = bd_spread_scale, | 223 spread = list(scale = bd_spread_scale, |
260 prop = bd_spread_prop))) | 224 prop = bd_spread_prop))) |
261 } | 225 } |
262 | 226 |
263 summarize_heatmaps <- function(grudat_spread_melt, do_factors) { | 227 do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name, |
264 ## - | 228 xlabs="", ylabs="", titled="", |
229 order_col=T, order_row=T, size=11) { | |
230 | |
231 data_m <- grudat_spread_melt | |
232 data_matrix <- { | |
233 tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name) | |
234 rownames(tmp) <- tmp[[yaxis]] | |
235 tmp[[yaxis]] <- NULL | |
236 tmp | |
237 } | |
238 dist_method <- "euclidean" | |
239 clust_method <- "complete" | |
240 | |
241 if (order_row) { | |
242 dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method)) | |
243 row_ord <- order.dendrogram(dd_row) | |
244 ordered_row_names <- row.names(data_matrix[row_ord, ]) | |
245 data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names) | |
246 } | |
247 | |
248 if (order_col) { | |
249 dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method), | |
250 method = clust_method)) | |
251 col_ord <- order.dendrogram(dd_col) | |
252 ordered_col_names <- colnames(data_matrix[, col_ord]) | |
253 data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names) | |
254 } | |
255 | |
256 heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) + | |
257 geom_tile(colour = "white") + | |
258 scale_fill_gradient2(low = "steelblue", high = "red", mid = "white", | |
259 name = element_blank()) + | |
260 scale_y_discrete(position = "right") + | |
261 theme(axis.text.x = element_text(angle = -90, hjust = 0, | |
262 size = size)) + | |
263 ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs) | |
264 | |
265 ## Graphics | |
266 dendro_linesize <- 0.5 | |
267 dendro_colunit <- 0.2 | |
268 dendro_rowunit <- 0.1 | |
269 final_plot <- heat_plot | |
270 | |
271 if (order_row) { | |
272 dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle") | |
273 dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) + | |
274 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row), | |
275 ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend), | |
276 size = dendro_linesize) + ggplot2::coord_flip() | |
277 final_plot <- cowplot::insert_yaxis_grob( | |
278 final_plot, dendro_row, grid::unit(dendro_colunit, "null"), | |
279 position = "left") | |
280 } | |
281 if (order_col) { | |
282 dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle") | |
283 dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") + | |
284 ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col), | |
285 ggplot2::aes(x = x, y = y, xend = xend, yend = yend), | |
286 size = dendro_linesize) | |
287 final_plot <- cowplot::insert_xaxis_grob( | |
288 final_plot, dendro_col, grid::unit(dendro_rowunit, "null"), | |
289 position = "top") | |
290 } | |
291 return(cowplot::ggdraw(final_plot)) | |
292 } | |
293 | |
294 summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") { | |
295 ## - Cluster is either "Rows", "Cols", "Both", or "None" | |
265 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, | 296 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, |
266 ylabs = element_blank(), xlabs = element_blank(), | 297 ylabs = element_blank(), xlabs = element_blank(), |
267 use_log = TRUE, size = 11) { | 298 use_log = TRUE, size = 11) { |
268 ## Convert from matrix to long format | 299 ## Convert from matrix to long format |
269 melted <- grudat_melted ## copy? | 300 melted <- grudat_melted ## copy? |
270 if (use_log) { | 301 if (use_log) { |
271 melted[[fillval]] <- log10(melted[[fillval]] + 1) | 302 melted[[fillval]] <- log10(melted[[fillval]] + 1) |
272 } | 303 } |
273 return(ggplot(melted) + | 304 if (cluster == "None") { |
274 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), | 305 return(ggplot(melted) + |
275 colour = "white") + | 306 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), |
276 scale_fill_gradient2(low = "steelblue", high = "red", | 307 colour = "white") + |
277 mid = "white", name = element_blank()) + | 308 scale_fill_gradient2( |
278 theme(axis.text.x = element_text(angle = -90, hjust = 0, | 309 low = "steelblue", high = "red", mid = "white", |
279 size = size)) + | 310 name = element_blank()) + |
280 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) | 311 theme(axis.text.x = element_text( |
312 angle = -90, hjust = 0, size = size)) + | |
313 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) | |
314 } else { | |
315 return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval, | |
316 xlabs, ylabs, title, | |
317 (cluster %in% c("Cols", "Both")), | |
318 (cluster %in% c("Rows", "Both")))) | |
319 } | |
281 } | 320 } |
282 | 321 |
283 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { | 322 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { |
284 do_logged <- (plot %in% c("log", "both")) | 323 do_logged <- (plot %in% c("log", "both")) |
285 do_normal <- (plot %in% c("normal", "both")) | 324 do_normal <- (plot %in% c("normal", "both")) |
301 size = size) | 340 size = size) |
302 } | 341 } |
303 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), | 342 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), |
304 plot_grid(plotlist = plist, ncol = ncol), | 343 plot_grid(plotlist = plist, ncol = ncol), |
305 ncol = 1, rel_heights = c(0.05, 0.95))) | 344 ncol = 1, rel_heights = c(0.05, 0.95))) |
306 | 345 } |
307 } | 346 |
308 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) | 347 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both") |
309 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, | 348 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", |
310 size = 8) | 349 ncol = 1, size = 8) |
311 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, | 350 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", |
312 size = 8) | 351 ncol = 1, size = 8) |
313 p3 <- ggplot + theme_void() | 352 p3 <- ggplot + theme_void() |
314 if (do_factors) { | 353 if (do_factors) { |
315 p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") | 354 p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both") |
316 } | 355 } |
317 return(list(bulk = p1, | 356 return(list(bulk = p1, |
318 samples = list(log = p2b, normal = p2a), | 357 samples = list(log = p2b, normal = p2a), |
319 factors = p3)) | 358 factors = p3)) |
320 } | 359 } |
344 ylab("Bulk Dataset") | 383 ylab("Bulk Dataset") |
345 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + | 384 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + |
346 ylab("Bulk Dataset") | 385 ylab("Bulk Dataset") |
347 } | 386 } |
348 | 387 |
349 title_a <- "Cell Types against Bulk" | 388 title_a <- "Cell Types vs Bulk Datasets" |
350 title_b <- "Bulk Datasets against Cells" | 389 title_b <- "Bulk Datasets vs Cell Types" |
351 if (do_factors) { | 390 if (do_factors) { |
352 title_a <- paste0(title_a, " and Factors") | 391 title_a <- paste0(title_a, " and Factors") |
353 title_b <- paste0(title_b, " and Factors") | 392 title_b <- paste0(title_b, " and Factors") |
354 } | 393 } |
355 | 394 |
378 print_red(" - selecting:", out_filt$facts) | 417 print_red(" - selecting:", out_filt$facts) |
379 } | 418 } |
380 return(grudat_filt) | 419 return(grudat_filt) |
381 } | 420 } |
382 | 421 |
422 writable2 <- function(obj, prefix, title) { | |
423 write.table(obj, | |
424 file = paste0("report_data/", prefix, "_", | |
425 title, ".tabular"), | |
426 quote = F, sep = "\t", col.names = NA) | |
427 } | |
428 | |
383 | 429 |
384 results <- music_on_all(files) | 430 results <- music_on_all(files) |
385 | |
386 if (heat_grouped_p) { | |
387 plot_grouped_heatmaps(results) | |
388 } else { | |
389 plot_all_individual_heatmaps(results) | |
390 } | |
391 | |
392 save.image("/tmp/sesh.rds") | |
393 | |
394 summat <- summarized_matrix(results) | 431 summat <- summarized_matrix(results) |
395 grudat <- group_by_dataset(summat) | 432 grudat <- group_by_dataset(summat) |
396 grudat_spread_melt <- merge_factors_spread(grudat$spread, | 433 grudat_spread_melt <- merge_factors_spread(grudat$spread, |
397 flatten_factor_list(results)) | 434 flatten_factor_list(results)) |
398 | 435 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) |
399 | 436 |
437 plot_all_individual_heatmaps(results) | |
400 | 438 |
401 ## The output filters ONLY apply to boxplots, since these take | 439 ## The output filters ONLY apply to boxplots, since these take |
402 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) | 440 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) |
403 | |
404 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) | |
405 | |
406 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) | |
407 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) | 441 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) |
442 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors, | |
443 dendro_setting) | |
408 | 444 |
409 pdf(out_heatsumm_pdf, width = 14, height = 14) | 445 pdf(out_heatsumm_pdf, width = 14, height = 14) |
410 print(heat_maps) | 446 print(heat_maps) |
411 print(box_plots) | 447 print(box_plots) |
412 dev.off() | 448 dev.off() |
415 stats_prop <- lapply(grudat$spread$prop, function(x) { | 451 stats_prop <- lapply(grudat$spread$prop, function(x) { |
416 t(apply(x, 1, summary))}) | 452 t(apply(x, 1, summary))}) |
417 stats_scale <- lapply(grudat$spread$scale, function(x) { | 453 stats_scale <- lapply(grudat$spread$scale, function(x) { |
418 t(apply(x, 1, summary))}) | 454 t(apply(x, 1, summary))}) |
419 | 455 |
420 writable2 <- function(obj, prefix, title) { | |
421 write.table(obj, | |
422 file = paste0("report_data/", prefix, "_", | |
423 title, ".tabular"), | |
424 quote = F, sep = "\t", col.names = NA) | |
425 } | |
426 ## Make the value table printable | 456 ## Make the value table printable |
427 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint | 457 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint |
428 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", | 458 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", |
429 "CT Prop in Sample", "Number of Reads") | 459 "CT Prop in Sample", "Number of Reads") |
430 | 460 |