Mercurial > repos > artbio > gsc_high_dimensions_visualisation
view high_dim_visu.R @ 9:58aa18e1fe14 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_high_dimension_visualization commit 7343be2d3b1b8cb1ba0c4c55767b60dbce8f8b22
author | artbio |
---|---|
date | Thu, 07 Nov 2024 22:43:01 +0000 |
parents | fe6f76030168 |
children |
line wrap: on
line source
options( show.error.messages = FALSE, error = function() { cat(geterrmessage(), file = stderr()) q("no", 1, FALSE) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(FactoMineR) library(factoextra) library(Rtsne) library(ggplot2) library(ggfortify) library(RColorBrewer) library(ClusterR) library(data.table) library(Polychrome) option_list <- list( make_option( "--data", default = NA, type = "character", help = "Input file that contains expression value to visualise" ), make_option( "--labels", default = FALSE, type = "logical", help = "add labels in scatter plots [default : '%default' ]" ), make_option( "--factor", default = "", type = "character", help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]" ), make_option( "--visu_choice", default = "PCA", type = "character", help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" ), make_option( "--Rtsne_seed", default = 42, type = "integer", help = "Seed value for reproducibility [default : '%default' ]" ), make_option( "--Rtsne_dims", default = 2, type = "integer", help = "Output dimensionality [default : '%default' ]" ), make_option( "--Rtsne_initial_dims", default = 50, type = "integer", help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]" ), make_option( "--Rtsne_perplexity", default = 5.0, type = "numeric", help = "perplexity [default : '%default' ]" ), make_option( "--Rtsne_theta", default = 1.0, type = "numeric", help = "theta [default : '%default' ]" ), make_option( "--Rtsne_max_iter", default = 1000, type = "integer", help = "max_iter [default : '%default' ]" ), make_option( "--Rtsne_pca", default = TRUE, type = "logical", help = "Whether an initial PCA step should be performed [default : '%default' ]" ), make_option( "--Rtsne_pca_center", default = TRUE, type = "logical", help = "Should data be centered before pca is applied? [default : '%default' ]" ), make_option( "--Rtsne_pca_scale", default = FALSE, type = "logical", help = "Should data be scaled before pca is applied? [default : '%default' ]" ), make_option( "--Rtsne_normalize", default = TRUE, type = "logical", help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]" ), make_option( "--Rtsne_exaggeration_factor", default = 12.0, type = "numeric", help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" ), make_option( "--PCA_npc", default = 5, type = "integer", help = "number of dimensions kept in the results [default : '%default' ]" ), make_option( "--item_size", default = 1, type = "numeric", help = "Size of points/labels in PCA [default : '%default' ]" ), make_option( "--x_axis", default = 1, type = "integer", help = "PC to plot in the x axis [default : '%default' ]" ), make_option( "--y_axis", default = 2, type = "integer", help = "PC to plot in the y axis [default : '%default' ]" ), make_option( "--HCPC_ncluster", default = -1, type = "numeric", help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" ), make_option( "--HCPC_npc", default = 5, type = "integer", help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]" ), make_option( "--HCPC_metric", default = "euclidean", type = "character", help = "Metric to be used for calculating dissimilarities between observations, available 'euclidean' or 'manhattan' [default : '%default' ]" ), make_option( "--HCPC_method", default = "ward", type = "character", help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']" ), make_option( "--pdf_out", default = "out.pdf", type = "character", help = "pdf of plots [default : '%default' ]" ), make_option( "--HCPC_consol", default = "TRUE", type = "logical", help = "If TRUE, a k-means consolidation is performed [default :'%default']" ), make_option( "--HCPC_itermax", default = "10", type = "integer", help = "The maximum number of iterations for the consolidation [default :'%default']" ), make_option( "--HCPC_min", default = "3", type = "integer", help = "The least possible number of clusters suggested [default :'%default']" ), make_option( "--HCPC_max", default = -1, type = "integer", help = "The higher possible number of clusters suggested [default :'%default']" ), make_option( "--HCPC_clusterCA", default = "rows", type = "character", help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']" ), make_option( "--HCPC_kk", default = Inf, type = "numeric", help = "The maximum number of iterations for the consolidation [default :'%default']" ), make_option( "--HCPC_mutual_info", default = "", type = "character", help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']" ), make_option( "--HCPC_cell_clust", default = "", type = "character", help = "Lists cells in the clusters generated by HCPC clustering. 2-column table (cell identifiers/clusters) [default :'%default']" ), make_option( "--HCPC_contributions", default = "", type = "character", help = "Table of variables (genes) most contributing to HCPC clustering [default :'%default']" ) ) opt <- parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE) ) if (opt$HCPC_max == -1) { opt$HCPC_max <- NULL } if (opt$HCPC_kk == -1) { opt$HCPC_kk <- Inf } #### We treat data once, at the beginning of the script #### data <- read.delim( opt$data, check.names = FALSE, header = TRUE, row.names = 1, sep = "\t" ) # we transpose immediately, because this is the common data structure data <- as.data.frame(t(data)) # we treat the factor for usage in 3 methods if (opt$factor != "") { contrasting_factor <- read.delim(opt$factor, header = TRUE) rownames(contrasting_factor) <- contrasting_factor[, 1] # we pick only the relevant values of the contrasting factor contrasting_factor <- contrasting_factor[rownames(data), ] sup <- colnames(contrasting_factor)[2] if (!is.numeric(contrasting_factor[, 2])) { contrasting_factor[, 2] <- as.factor(contrasting_factor[, 2]) } } ######### make PCA with FactoMineR ################# if (opt$visu_choice == "PCA") { if (opt$labels) { labels <- "ind" } else { labels <- "none" } pdf(opt$pdf_out) if (opt$factor != "") { data <- cbind(data, contrasting_factor[, 2]) colnames(data)[length(data)] <- sup if (is.numeric(contrasting_factor[, 2])) { res_pca <- PCA(X = data, quanti.sup = sup, graph = FALSE) pca_plot <- plot(res_pca, habillage = sup, label = labels, title = "PCA graph of cells", cex = opt$item_size, axes = c(opt$x_axis, opt$y_axis) ) } else { res_pca <- PCA(X = data, quali.sup = sup, graph = FALSE) pca_plot <- plot(res_pca, habillage = sup, label = labels, title = "PCA graph of cells", cex = opt$item_size, axes = c(opt$x_axis, opt$y_axis) ) } } else { res_pca <- PCA(X = data, graph = FALSE) pca_plot <- plot(res_pca, label = labels, title = "PCA graph of cells", cex = opt$item_size, axes = c(opt$x_axis, opt$y_axis), col.ind = "deepskyblue4" ) } print(pca_plot) dev.off() } ########### make HCPC with FactoMineR ########## if (opt$visu_choice == "HCPC") { pdf(opt$pdf_out) # HCPC starts with a PCA pca <- PCA(X = data, ncp = opt$HCPC_npc, graph = FALSE) pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA # Hierarchical Clustering On Principal Components Followed By Kmean Clustering res_hcpc <- HCPC(pca, nb.clust = opt$HCPC_ncluster, metric = opt$HCPC_metric, method = opt$HCPC_method, graph = FALSE, consol = opt$HCPC_consol, iter.max = opt$HCPC_itermax, min = opt$HCPC_min, max = opt$HCPC_max, cluster.CA = opt$HCPC_clusterCA, kk = opt$HCPC_kk ) # HCPC plots dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2 plot(res_hcpc, choice = "tree") plot(res_hcpc, choice = "bar") if (opt$labels == FALSE) { plot(res_hcpc, choice = "3D.map", ind.names = FALSE) plot(res_hcpc, choice = "map", label = "none") } else { plot(res_hcpc, choice = "3D.map") plot(res_hcpc, choice = "map") } ## Normalized Mutual Information if (opt$factor != "") { sink(opt$HCPC_mutual_info) cat("Relationship between input factor and its levels and the HCPC clusters") res <- external_validation( true_labels = as.numeric(contrasting_factor[, 2]), clusters = as.numeric(res_hcpc$data.clust$clust), summary_stats = TRUE ) sink() } dev.off() res_clustering <- data.frame( Cell = rownames(res_hcpc$data.clust), Cluster = res_hcpc$data.clust$clust ) # Description of cluster by most contributing variables / gene expressions # first transform list of vectors in a list of dataframes extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame) # second, transfer rownames (genes) to column in the dataframe, before rbinding extract_description_w_genes <- Map(cbind, extract_description, genes = lapply(extract_description, rownames) ) # Then collapse all dataframes with cluster_id in 1st column using {data.table} rbindlist() cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id") cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # swap columns cluster_description <- cluster_description[order( cluster_description[[2]], cluster_description[[8]] ), ] # sort by cluster then by pval # Finally, output cluster description data frame write.table(cluster_description, file = opt$HCPC_contributions, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE ) ## Return cluster table to user write.table(res_clustering, file = opt$HCPC_cell_clust, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE ) } ################ t-SNE #################### if (opt$visu_choice == "tSNE") { set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility tsne_out <- Rtsne(data, dims = opt$Rtsne_dims, initial_dims = opt$Rtsne_initial_dims, perplexity = opt$Rtsne_perplexity, theta = opt$Rtsne_theta, max_iter = opt$Rtsne_max_iter, pca = opt$Rtsne_pca, pca_center = opt$Rtsne_pca_center, pca_scale = opt$Rtsne_pca_scale, normalize = opt$Rtsne_normalize, exaggeration_factor = opt$Rtsne_exaggeration_factor ) embedding <- as.data.frame(tsne_out$Y[, 1:2]) embedding$Class <- as.factor(rownames(data)) gg_legend <- theme(legend.position = "right") pointcolor <- "#E70000" pointsize <- opt$item_size * 1.5 the_theme <- theme( panel.background = element_rect( fill = "gray100", colour = "#6D9EC1", size = 2, linetype = "solid" ), panel.grid.major = element_line( size = 0.5, linetype = "solid", colour = "#6D9EC1" ), panel.grid.minor = element_line( size = 0.25, linetype = "solid", colour = "darkslategray3" ) ) if (opt$factor == "") { p <- ggplot(embedding, aes(x = V1, y = V2)) + geom_point(size = pointsize * 0.25, color = pointcolor) + gg_legend + xlab("t-SNE 1") + ylab("t-SNE 2") + ggtitle("t-SNE") + the_theme + if (opt$labels) { geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = pointsize, color = pointcolor) } } else { if (is.numeric(contrasting_factor[, 2])) { embedding$factor <- contrasting_factor[, 2] } else { embedding$factor <- as.factor(contrasting_factor[, 2]) } p <- ggplot(embedding, aes(x = V1, y = V2, color = factor)) + geom_point(size = pointsize * 0.25) + gg_legend + xlab("t-SNE 1") + ylab("t-SNE 2") + ggtitle("t-SNE") + the_theme + if (opt$labels) { geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = pointsize) } } pdf(opt$pdf_out) print(p) dev.off() }