Mercurial > repos > artbio > gsc_high_dimensions_visualisation
diff high_dim_visu.R @ 8:fe6f76030168 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_high_dimension_visualization commit a3dc683410fc240f428c8fbee3c63aa9965fbf38
author | artbio |
---|---|
date | Wed, 29 Nov 2023 17:28:18 +0000 |
parents | 18a1dc4aec4a |
children | 58aa18e1fe14 |
line wrap: on
line diff
--- a/high_dim_visu.R Sun May 21 16:26:23 2023 +0000 +++ b/high_dim_visu.R Wed Nov 29 17:28:18 2023 +0000 @@ -1,11 +1,12 @@ options(show.error.messages = FALSE, - error = function() { - cat(geterrmessage(), file = stderr()) - q("no", 1, FALSE) - } + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() + library(optparse) library(FactoMineR) library(factoextra) @@ -17,7 +18,6 @@ library(data.table) library(Polychrome) -# Arguments option_list <- list( make_option( "--data", @@ -26,24 +26,6 @@ help = "Input file that contains expression value to visualise" ), make_option( - "--sep", - default = "\t", - type = "character", - help = "File separator [default : '%default' ]" - ), - make_option( - "--colnames", - default = TRUE, - type = "logical", - help = "Consider first line as header ? [default : '%default' ]" - ), - make_option( - "--out", - default = "res.tab", - type = "character", - help = "Output name [default : '%default' ]" - ), - make_option( "--labels", default = FALSE, type = "logical", @@ -62,12 +44,6 @@ help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" ), make_option( - "--table_coordinates", - default = "", - type = "character", - help = "Table with plot coordinates [default : '%default' ]" - ), - make_option( "--Rtsne_seed", default = 42, type = "integer", @@ -115,7 +91,7 @@ type = "logical", help = "Should data be centered before pca is applied? [default : '%default' ]" ), - make_option( + make_option( "--Rtsne_pca_scale", default = FALSE, type = "logical", @@ -133,20 +109,26 @@ type = "numeric", help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" ), - make_option( + make_option( "--PCA_npc", default = 5, type = "integer", help = "number of dimensions kept in the results [default : '%default' ]" ), - make_option( - "--PCA_x_axis", + make_option( + "--item_size", + default = 1, + type = "numeric", + help = "Size of points/labels in PCA [default : '%default' ]" + ), + make_option( + "--x_axis", default = 1, type = "integer", help = "PC to plot in the x axis [default : '%default' ]" ), - make_option( - "--PCA_y_axis", + make_option( + "--y_axis", default = 2, type = "integer", help = "PC to plot in the y axis [default : '%default' ]" @@ -157,7 +139,7 @@ type = "numeric", help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" ), - make_option( + make_option( "--HCPC_npc", default = 5, type = "integer", @@ -199,13 +181,13 @@ type = "integer", help = "The least possible number of clusters suggested [default :'%default']" ), - make_option( + make_option( "--HCPC_max", default = -1, type = "integer", help = "The higher possible number of clusters suggested [default :'%default']" ), - make_option( + make_option( "--HCPC_clusterCA", default = "rows", type = "character", @@ -218,124 +200,164 @@ help = "The maximum number of iterations for the consolidation [default :'%default']" ), make_option( - "--HCPC_clust", - default = "", - type = "character", - help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']" - ), - make_option( "--HCPC_mutual_info", default = "", type = "character", help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']" ), make_option( - "--HCPC_cluster_description", + "--HCPC_cell_clust", default = "", type = "character", - help = "Output file with variables most contributing to clustering [default :'%default']" + help = "Lists cells in the clusters generated by HCPC clustering. 2-column table (cell identifiers/clusters) [default :'%default']" + ), + make_option( + "--HCPC_contributions", + default = "", + type = "character", + help = "Table of variables (genes) most contributing to HCPC clustering [default :'%default']" ) ) opt <- parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) -if (opt$sep == "tab") { - opt$sep <- "\t" -} -if (opt$sep == "comma") { - opt$sep <- "," -} if (opt$HCPC_max == -1) { - opt$HCPC_max <- NULL + opt$HCPC_max <- NULL } if (opt$HCPC_kk == -1) { - opt$HCPC_kk <- Inf + opt$HCPC_kk <- Inf } -##Add legend to plot() -legend_col <- function(col, lev) { - -opar <- par - -n <- length(col) - -bx <- par("usr") - -box_cx <- c(bx[2] + (bx[2] - bx[1]) / 1000, -bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50) -box_cy <- c(bx[3], bx[3]) -box_sy <- (bx[4] - bx[3]) / n - -xx <- rep(box_cx, each = 2) - -par(xpd = TRUE) -for (i in 1:n) { - yy <- c(box_cy[1] + (box_sy * (i - 1)), - box_cy[1] + (box_sy * (i)), - box_cy[1] + (box_sy * (i)), - box_cy[1] + (box_sy * (i - 1))) - polygon(xx, yy, col = col[i], border = col[i]) -} -par(new = TRUE) -plot(0, 0, type = "n", -ylim = c(min(lev), max(lev)), -yaxt = "n", ylab = "", -xaxt = "n", xlab = "", -frame.plot = FALSE) -axis(side = 4, las = 2, tick = FALSE, line = .25) -par <- opar -} - +#### We treat data once, at the beginning of the script #### data <- read.delim( opt$data, check.names = FALSE, - header = opt$colnames, + header = TRUE, row.names = 1, - sep = opt$sep + sep = "\t" ) +# we transpose immediately, because this is the common data structure +data <- as.data.frame(t(data)) -# Contrasting factor and its colors +# we treat the factor for usage in 3 methods if (opt$factor != "") { - contrasting_factor <- read.delim( - opt$factor, - header = TRUE - ) + contrasting_factor <- read.delim(opt$factor, header = TRUE) rownames(contrasting_factor) <- contrasting_factor[, 1] - contrasting_factor <- contrasting_factor[colnames(data), ] - colnames(contrasting_factor) <- c("id", "factor") - if (is.numeric(contrasting_factor$factor)) { - factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor] + # we pick only the relevant values of the contrasting factor + contrasting_factor <- contrasting_factor[rownames(data), ] + sup <- colnames(contrasting_factor)[2] + if (!is.numeric(contrasting_factor[, 2])) { + contrasting_factor[, 2] <- as.factor(contrasting_factor[, 2]) + } +} + +######### make PCA with FactoMineR ################# +if (opt$visu_choice == "PCA") { + if (opt$labels) { + labels <- "ind" } else { - contrasting_factor$factor <- as.factor(contrasting_factor$factor) - if (nlevels(contrasting_factor$factor) == 2) { - colors_labels <- c("#E41A1C", "#377EB8") + labels <- "none" + } + pdf(opt$pdf_out) + if (opt$factor != "") { + data <- cbind(data, contrasting_factor[, 2]) + colnames(data)[length(data)] <- sup + if (is.numeric(contrasting_factor[, 2])) { + res_pca <- PCA(X = data, quanti.sup = sup, graph = FALSE) + pca_plot <- plot(res_pca, habillage = sup, label = labels, + title = "PCA graph of cells", cex = opt$item_size, + axes = c(opt$x_axis, opt$y_axis)) } else { - set.seed(567629) - colors_labels <- createPalette(nlevels(contrasting_factor$factor), c("#5A5156", "#E4E1E3", "#F6222E")) - names(colors_labels) <- NULL + res_pca <- PCA(X = data, quali.sup = sup, graph = FALSE) + pca_plot <- plot(res_pca, habillage = sup, label = labels, + title = "PCA graph of cells", cex = opt$item_size, + axes = c(opt$x_axis, opt$y_axis)) } - factor_colors <- - with(contrasting_factor, - data.frame(factor = levels(contrasting_factor$factor), - color = I(colors_labels) - ) - ) - factor_cols <- factor_colors$color[match(contrasting_factor$factor, - factor_colors$factor)] + } else { + res_pca <- PCA(X = data, graph = FALSE) + pca_plot <- plot(res_pca, label = labels, + title = "PCA graph of cells", cex = opt$item_size, + axes = c(opt$x_axis, opt$y_axis), col.ind = "deepskyblue4") } -} else { - factor_cols <- rep("deepskyblue4", length(rownames(data))) + print(pca_plot) + dev.off() } +########### make HCPC with FactoMineR ########## +if (opt$visu_choice == "HCPC") { + pdf(opt$pdf_out) + # HCPC starts with a PCA + pca <- PCA(X = data, ncp = opt$HCPC_npc, graph = FALSE) + pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA + # Hierarchical Clustering On Principal Components Followed By Kmean Clustering + res_hcpc <- HCPC(pca, + nb.clust = opt$HCPC_ncluster, + metric = opt$HCPC_metric, + method = opt$HCPC_method, + graph = FALSE, + consol = opt$HCPC_consol, + iter.max = opt$HCPC_itermax, + min = opt$HCPC_min, + max = opt$HCPC_max, + cluster.CA = opt$HCPC_clusterCA, + kk = opt$HCPC_kk) + # HCPC plots + dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2 + plot(res_hcpc, choice = "tree") + plot(res_hcpc, choice = "bar") + if (opt$labels == FALSE) { + plot(res_hcpc, choice = "3D.map", ind.names = FALSE) + plot(res_hcpc, choice = "map", label = "none") + } else { + plot(res_hcpc, choice = "3D.map") + plot(res_hcpc, choice = "map") + } + ## Normalized Mutual Information + if (opt$factor != "") { + sink(opt$HCPC_mutual_info) + cat("Relationship between input factor and its levels and the HCPC clusters") + res <- external_validation(true_labels = as.numeric(contrasting_factor[, 2]), + clusters = as.numeric(res_hcpc$data.clust$clust), + summary_stats = TRUE) + sink() + } + dev.off() + + res_clustering <- data.frame(Cell = rownames(res_hcpc$data.clust), + Cluster = res_hcpc$data.clust$clust) + # Description of cluster by most contributing variables / gene expressions + # first transform list of vectors in a list of dataframes + extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame) + # second, transfer rownames (genes) to column in the dataframe, before rbinding + extract_description_w_genes <- Map(cbind, + extract_description, + genes = lapply(extract_description, rownames)) + # Then collapse all dataframes with cluster_id in 1st column using {data.table} rbindlist() + cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id") + cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # swap columns + cluster_description <- cluster_description[order(cluster_description[[2]], + cluster_description[[8]]), ] # sort by cluster then by pval + # Finally, output cluster description data frame + write.table(cluster_description, + file = opt$HCPC_contributions, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE) + + ## Return cluster table to user + write.table(res_clustering, + file = opt$HCPC_cell_clust, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE) +} ################ t-SNE #################### if (opt$visu_choice == "tSNE") { - # filter and transpose df for tsne and pca - tdf <- t(data) - # make tsne and plot results set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility - - tsne_out <- Rtsne(tdf, + tsne_out <- Rtsne(data, dims = opt$Rtsne_dims, initial_dims = opt$Rtsne_initial_dims, perplexity = opt$Rtsne_perplexity, @@ -346,190 +368,48 @@ pca_scale = opt$Rtsne_pca_scale, normalize = opt$Rtsne_normalize, exaggeration_factor = opt$Rtsne_exaggeration_factor) - embedding <- as.data.frame(tsne_out$Y[, 1:2]) - embedding$Class <- as.factor(rownames(tdf)) + embedding$Class <- as.factor(rownames(data)) gg_legend <- theme(legend.position = "right") + pointcolor <- "#E70000" + pointsize <- opt$item_size * 1.5 + the_theme <- theme( + panel.background = element_rect(fill = "gray100", colour = "#6D9EC1", + size = 2, linetype = "solid"), + panel.grid.major = element_line(size = 0.5, linetype = "solid", + colour = "#6D9EC1"), + panel.grid.minor = element_line(size = 0.25, linetype = "solid", + colour = "darkslategray3") + ) if (opt$factor == "") { - ggplot(embedding, aes(x = V1, y = V2)) + - geom_point(size = 1, color = "deepskyblue4") + - gg_legend + - xlab("t-SNE 1") + - ylab("t-SNE 2") + - ggtitle("t-SNE") + - if (opt$labels) { - geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 1.5, color = "deepskyblue4") - } - } else { - if (is.numeric(contrasting_factor$factor)) { - embedding$factor <- contrasting_factor$factor - } else { - embedding$factor <- as.factor(contrasting_factor$factor) - } - - ggplot(embedding, aes(x = V1, y = V2, color = factor)) + - geom_point(size = 1) + + p <- ggplot(embedding, aes(x = V1, y = V2)) + + geom_point(size = pointsize * 0.25, color = pointcolor) + gg_legend + xlab("t-SNE 1") + ylab("t-SNE 2") + ggtitle("t-SNE") + + the_theme + if (opt$labels) { - geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = 1.5) + geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = pointsize, color = pointcolor) } + } else { + if (is.numeric(contrasting_factor[, 2])) { + embedding$factor <- contrasting_factor[, 2] + } else { + embedding$factor <- as.factor(contrasting_factor[, 2]) } - ggsave(file = opt$pdf_out, device = "pdf") - - #save coordinates table - if (opt$table_coordinates != "") { - coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6)) - colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$Rtsne_dims))) - } -} - - -######### make PCA with FactoMineR ################# -if (opt$visu_choice == "PCA") { - pca <- PCA(t(data), ncp = opt$PCA_npc, graph = FALSE) - pdf(opt$pdf_out) - if (opt$labels == FALSE) { - plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), label = "none", col.ind = factor_cols) - } else { - plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), cex = 0.2, col.ind = factor_cols) - } -if (opt$factor != "") { - if (is.factor(contrasting_factor$factor)) { - legend(x = "topright", - legend = as.character(factor_colors$factor), - col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7) - } else { - legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) + p <- ggplot(embedding, aes(x = V1, y = V2, color = factor)) + + geom_point(size = pointsize * 0.25) + + gg_legend + + xlab("t-SNE 1") + + ylab("t-SNE 2") + + ggtitle("t-SNE") + + the_theme + + if (opt$labels) { + geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = pointsize) + } } -} -dev.off() - - #save coordinates table - if (opt$table_coordinates != "") { - coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6)) - colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$PCA_npc))) - } - -} - -########### make HCPC with FactoMineR ########## -if (opt$visu_choice == "HCPC") { - -# HCPC starts with a PCA -pca <- PCA( - t(data), - ncp = opt$HCPC_npc, - graph = FALSE, -) - -pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA - -# Hierarchical Clustering On Principal Components Followed By Kmean Clustering -res_hcpc <- HCPC(pca, - nb.clust = opt$HCPC_ncluster, metric = opt$HCPC_metric, method = opt$HCPC_method, - graph = FALSE, consol = opt$HCPC_consol, iter.max = opt$HCPC_itermax, min = opt$HCPC_min, max = opt$HCPC_max, - cluster.CA = opt$HCPC_clusterCA, kk = opt$HCPC_kk) -# HCPC plots -dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2 -pdf(opt$pdf_out) -plot(res_hcpc, choice = "tree") -plot(res_hcpc, choice = "bar") -plot(res_hcpc, choice = "3D.map") -if (opt$labels == FALSE) { -plot(res_hcpc, choice = "map", label = "none") -} else { -plot(res_hcpc, choice = "map") + pdf(opt$pdf_out) + print(p) + dev.off() } - -# user contrasts on the pca -if (opt$factor != "") { - plot(pca, label = "none", col.ind = factor_cols) - if (is.factor(contrasting_factor$factor)) { - legend(x = "topright", - legend = as.character(factor_colors$factor), - col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7) - - ## Normalized Mutual Information - sink(opt$HCPC_mutual_info) - res <- external_validation( - true_labels = as.numeric(contrasting_factor$factor), - clusters = as.numeric(res_hcpc$data.clust$clust), - summary_stats = TRUE - ) - sink() - - } else { - legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) - } -} - -dev.off() - -if (opt$table_coordinates != "") { - coord_table <- cbind(Cell = rownames(res_hcpc$call$X), - round(as.data.frame(res_hcpc$call$X[, -length(res_hcpc$call$X)]), 6), - as.data.frame(res_hcpc$call$X[, length(res_hcpc$call$X)]) - ) - colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$HCPC_npc)), "Cluster") - } - -if (opt$HCPC_clust != "") { -res_clustering <- data.frame(Cell = rownames(res_hcpc$data.clust), - Cluster = res_hcpc$data.clust$clust) - - } - -# Description of cluster by most contributing variables / gene expressions - -# first transform list of vectors in a list of dataframes -extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame) -# second, transfer rownames (genes) to column in the dataframe, before rbinding -extract_description_w_genes <- Map(cbind, - extract_description, - genes = lapply(extract_description, rownames) - ) -# Then use data.table to collapse all generated dataframe, with the cluster id in first column -# using the {data.table} rbindlist function -cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id") -cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # reorganize columns - - -# Finally, output cluster description data frame -write.table( - cluster_description, - file = opt$HCPC_cluster_description, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE - ) - -} - -## Return coordinates file to user - -if (opt$table_coordinates != "") { - write.table( - coord_table, - file = opt$table_coordinates, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE - ) -} - - -if (opt$HCPC_clust != "") { - write.table( - res_clustering, - file = opt$HCPC_clust, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE - ) -}