diff high_dim_visu.R @ 8:fe6f76030168 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_high_dimension_visualization commit a3dc683410fc240f428c8fbee3c63aa9965fbf38
author artbio
date Wed, 29 Nov 2023 17:28:18 +0000
parents 18a1dc4aec4a
children 58aa18e1fe14
line wrap: on
line diff
--- a/high_dim_visu.R	Sun May 21 16:26:23 2023 +0000
+++ b/high_dim_visu.R	Wed Nov 29 17:28:18 2023 +0000
@@ -1,11 +1,12 @@
 options(show.error.messages = FALSE,
-        error = function() {
-            cat(geterrmessage(), file = stderr())
-            q("no", 1, FALSE)
-        }
+  error = function() {
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
+  }
 )
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 warnings()
+
 library(optparse)
 library(FactoMineR)
 library(factoextra)
@@ -17,7 +18,6 @@
 library(data.table)
 library(Polychrome)
 
-# Arguments
 option_list <- list(
   make_option(
     "--data",
@@ -26,24 +26,6 @@
     help = "Input file that contains expression value to visualise"
   ),
   make_option(
-    "--sep",
-    default = "\t",
-    type = "character",
-    help = "File separator [default : '%default' ]"
-  ),
-  make_option(
-    "--colnames",
-    default = TRUE,
-    type = "logical",
-    help = "Consider first line as header ? [default : '%default' ]"
-  ),
-  make_option(
-    "--out",
-    default = "res.tab",
-    type = "character",
-    help = "Output name [default : '%default' ]"
-  ),
-  make_option(
     "--labels",
     default = FALSE,
     type = "logical",
@@ -62,12 +44,6 @@
     help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]"
   ),
   make_option(
-    "--table_coordinates",
-    default = "",
-    type = "character",
-    help = "Table with plot coordinates [default : '%default' ]"
-  ),
-  make_option(
     "--Rtsne_seed",
     default = 42,
     type = "integer",
@@ -115,7 +91,7 @@
     type = "logical",
     help = "Should data be centered before pca is applied? [default : '%default' ]"
   ),
-   make_option(
+  make_option(
     "--Rtsne_pca_scale",
     default = FALSE,
     type = "logical",
@@ -133,20 +109,26 @@
     type = "numeric",
     help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]"
   ),
-   make_option(
+  make_option(
     "--PCA_npc",
     default = 5,
     type = "integer",
     help = "number of dimensions kept in the results [default : '%default' ]"
   ),
-   make_option(
-    "--PCA_x_axis",
+  make_option(
+    "--item_size",
+    default = 1,
+    type = "numeric",
+    help = "Size of points/labels in PCA [default : '%default' ]"
+  ),
+  make_option(
+    "--x_axis",
     default = 1,
     type = "integer",
     help = "PC to plot in the x axis [default : '%default' ]"
   ),
-   make_option(
-    "--PCA_y_axis",
+  make_option(
+    "--y_axis",
     default = 2,
     type = "integer",
     help = "PC to plot in the y axis [default : '%default' ]"
@@ -157,7 +139,7 @@
     type = "numeric",
     help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]"
   ),
-   make_option(
+  make_option(
     "--HCPC_npc",
     default = 5,
     type = "integer",
@@ -199,13 +181,13 @@
     type = "integer",
     help = "The least possible number of clusters suggested [default :'%default']"
   ),
-   make_option(
+  make_option(
     "--HCPC_max",
     default = -1,
     type = "integer",
     help = "The higher possible number of clusters suggested [default :'%default']"
   ),
-   make_option(
+  make_option(
     "--HCPC_clusterCA",
     default = "rows",
     type = "character",
@@ -218,124 +200,164 @@
     help = "The maximum number of iterations for the consolidation [default :'%default']"
   ),
   make_option(
-    "--HCPC_clust",
-    default = "",
-    type = "character",
-    help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']"
-  ),
-  make_option(
     "--HCPC_mutual_info",
     default = "",
     type = "character",
     help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']"
   ),
   make_option(
-    "--HCPC_cluster_description",
+    "--HCPC_cell_clust",
     default = "",
     type = "character",
-    help = "Output file with variables most contributing to clustering [default :'%default']"
+    help = "Lists cells in the clusters generated by HCPC clustering. 2-column table (cell identifiers/clusters) [default :'%default']"
+  ),
+  make_option(
+    "--HCPC_contributions",
+    default = "",
+    type = "character",
+    help = "Table of variables (genes) most contributing to HCPC clustering [default :'%default']"
   )
 )
 
 opt <- parse_args(OptionParser(option_list = option_list),
                   args = commandArgs(trailingOnly = TRUE))
 
-if (opt$sep == "tab") {
-    opt$sep <- "\t"
-}
-if (opt$sep == "comma") {
-    opt$sep <- ","
-}
 if (opt$HCPC_max == -1) {
-    opt$HCPC_max <- NULL
+  opt$HCPC_max <- NULL
 }
 if (opt$HCPC_kk == -1) {
-    opt$HCPC_kk <- Inf
+  opt$HCPC_kk <- Inf
 }
 
-##Add legend to plot()
-legend_col <- function(col, lev) {
-
-opar <- par
-
-n <- length(col)
-
-bx <- par("usr")
-
-box_cx <- c(bx[2] + (bx[2] - bx[1]) / 1000,
-bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50)
-box_cy <- c(bx[3], bx[3])
-box_sy <- (bx[4] - bx[3]) / n
-
-xx <- rep(box_cx, each = 2)
-
-par(xpd = TRUE)
-for (i in 1:n) {
-  yy <- c(box_cy[1] + (box_sy * (i - 1)),
-  box_cy[1] + (box_sy * (i)),
-  box_cy[1] + (box_sy * (i)),
-  box_cy[1] + (box_sy * (i - 1)))
-  polygon(xx, yy, col = col[i], border = col[i])
-}
-par(new = TRUE)
-plot(0, 0, type = "n",
-ylim = c(min(lev), max(lev)),
-yaxt = "n", ylab = "",
-xaxt = "n", xlab = "",
-frame.plot = FALSE)
-axis(side = 4, las = 2, tick = FALSE, line = .25)
-par <- opar
-}
-
+#### We treat data once, at the beginning of the script ####
 data <- read.delim(
   opt$data,
   check.names = FALSE,
-  header = opt$colnames,
+  header = TRUE,
   row.names = 1,
-  sep = opt$sep
+  sep = "\t"
 )
+# we transpose immediately, because this is the common data structure
+data <- as.data.frame(t(data))
 
-# Contrasting factor and its colors
+# we treat the factor for usage in 3 methods
 if (opt$factor != "") {
-  contrasting_factor <- read.delim(
-    opt$factor,
-    header = TRUE
-  )
+  contrasting_factor <- read.delim(opt$factor, header = TRUE)
   rownames(contrasting_factor) <- contrasting_factor[, 1]
-  contrasting_factor <- contrasting_factor[colnames(data), ]
-  colnames(contrasting_factor) <- c("id", "factor")
-  if (is.numeric(contrasting_factor$factor)) {
-    factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor]
+  # we pick only the relevant values of the contrasting factor
+  contrasting_factor <- contrasting_factor[rownames(data), ]
+  sup <- colnames(contrasting_factor)[2]
+  if (!is.numeric(contrasting_factor[, 2])) {
+    contrasting_factor[, 2] <- as.factor(contrasting_factor[, 2])
+  }
+}
+
+######### make PCA with FactoMineR #################
+if (opt$visu_choice == "PCA") {
+  if (opt$labels) {
+    labels <- "ind"
   } else {
-    contrasting_factor$factor <- as.factor(contrasting_factor$factor)
-    if (nlevels(contrasting_factor$factor) == 2) {
-      colors_labels <- c("#E41A1C", "#377EB8")
+    labels <- "none"
+  }
+  pdf(opt$pdf_out)
+  if (opt$factor != "") {
+    data <- cbind(data, contrasting_factor[, 2])
+    colnames(data)[length(data)] <- sup
+    if (is.numeric(contrasting_factor[, 2])) {
+      res_pca <- PCA(X = data, quanti.sup = sup, graph = FALSE)
+      pca_plot <- plot(res_pca, habillage = sup, label = labels,
+                       title = "PCA graph of cells", cex = opt$item_size,
+                       axes = c(opt$x_axis, opt$y_axis))
     } else {
-      set.seed(567629)
-      colors_labels <- createPalette(nlevels(contrasting_factor$factor), c("#5A5156", "#E4E1E3", "#F6222E"))
-      names(colors_labels) <- NULL
+      res_pca <- PCA(X = data, quali.sup = sup, graph = FALSE)
+      pca_plot <- plot(res_pca, habillage = sup, label = labels,
+                       title = "PCA graph of cells", cex = opt$item_size,
+                       axes = c(opt$x_axis, opt$y_axis))
     }
-    factor_colors <-
-      with(contrasting_factor,
-           data.frame(factor = levels(contrasting_factor$factor),
-                      color = I(colors_labels)
-           )
-      )
-    factor_cols <- factor_colors$color[match(contrasting_factor$factor,
-                                          factor_colors$factor)]
+  } else {
+    res_pca <- PCA(X = data, graph = FALSE)
+    pca_plot <- plot(res_pca, label = labels,
+                     title = "PCA graph of cells", cex = opt$item_size,
+                     axes = c(opt$x_axis, opt$y_axis), col.ind = "deepskyblue4")
   }
-} else {
-  factor_cols <- rep("deepskyblue4", length(rownames(data)))
+  print(pca_plot)
+  dev.off()
 }
 
+########### make HCPC with FactoMineR ##########
+if (opt$visu_choice == "HCPC") {
+  pdf(opt$pdf_out)
+  # HCPC starts with a PCA
+  pca <- PCA(X = data, ncp = opt$HCPC_npc, graph = FALSE)
+  pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA
+  # Hierarchical Clustering On Principal Components Followed By Kmean Clustering
+  res_hcpc <- HCPC(pca,
+                   nb.clust = opt$HCPC_ncluster,
+                   metric = opt$HCPC_metric,
+                   method = opt$HCPC_method,
+                   graph = FALSE,
+                   consol = opt$HCPC_consol,
+                   iter.max = opt$HCPC_itermax,
+                   min = opt$HCPC_min,
+                   max = opt$HCPC_max,
+                   cluster.CA = opt$HCPC_clusterCA,
+                   kk = opt$HCPC_kk)
+  # HCPC plots
+  dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2
+  plot(res_hcpc, choice = "tree")
+  plot(res_hcpc, choice = "bar")
+  if (opt$labels == FALSE) {
+    plot(res_hcpc, choice = "3D.map", ind.names = FALSE)
+    plot(res_hcpc, choice = "map", label = "none")
+  } else {
+    plot(res_hcpc, choice = "3D.map")
+    plot(res_hcpc, choice = "map")
+  }
+  ## Normalized Mutual Information
+  if (opt$factor != "") {
+    sink(opt$HCPC_mutual_info)
+    cat("Relationship between input factor and its levels and the HCPC clusters")
+    res <- external_validation(true_labels = as.numeric(contrasting_factor[, 2]),
+                               clusters = as.numeric(res_hcpc$data.clust$clust),
+                               summary_stats = TRUE)
+    sink()
+  }
+  dev.off()
+
+  res_clustering <- data.frame(Cell = rownames(res_hcpc$data.clust),
+                               Cluster = res_hcpc$data.clust$clust)
+  # Description of cluster by most contributing variables / gene expressions
+  # first transform list of vectors in a list of dataframes
+  extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame)
+  # second, transfer rownames (genes) to column in the dataframe, before rbinding
+  extract_description_w_genes <- Map(cbind,
+                                     extract_description,
+                                     genes = lapply(extract_description, rownames))
+  # Then collapse all dataframes with cluster_id in 1st column using {data.table} rbindlist()
+  cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id")
+  cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # swap columns
+  cluster_description <- cluster_description[order(cluster_description[[2]],
+                                                   cluster_description[[8]]), ] # sort by cluster then by pval
+  # Finally, output cluster description data frame
+  write.table(cluster_description,
+              file = opt$HCPC_contributions,
+              sep = "\t",
+              quote = FALSE,
+              col.names = TRUE,
+              row.names = FALSE)
+
+  ## Return cluster table to user
+  write.table(res_clustering,
+              file = opt$HCPC_cell_clust,
+              sep = "\t",
+              quote = FALSE,
+              col.names = TRUE,
+              row.names = FALSE)
+}
 ################  t-SNE ####################
 if (opt$visu_choice == "tSNE") {
-  # filter and transpose df for tsne and pca
-  tdf <- t(data)
-  # make tsne and plot results
   set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility
-
-  tsne_out <- Rtsne(tdf,
+  tsne_out <- Rtsne(data,
                     dims = opt$Rtsne_dims,
                     initial_dims = opt$Rtsne_initial_dims,
                     perplexity = opt$Rtsne_perplexity,
@@ -346,190 +368,48 @@
                     pca_scale = opt$Rtsne_pca_scale,
                     normalize = opt$Rtsne_normalize,
                     exaggeration_factor = opt$Rtsne_exaggeration_factor)
-
   embedding <- as.data.frame(tsne_out$Y[, 1:2])
-  embedding$Class <- as.factor(rownames(tdf))
+  embedding$Class <- as.factor(rownames(data))
   gg_legend <- theme(legend.position = "right")
+  pointcolor <- "#E70000"
+  pointsize <- opt$item_size * 1.5
+  the_theme <- theme(
+    panel.background = element_rect(fill = "gray100", colour = "#6D9EC1",
+                                    size = 2, linetype = "solid"),
+    panel.grid.major = element_line(size = 0.5, linetype = "solid",
+                                    colour = "#6D9EC1"),
+    panel.grid.minor = element_line(size = 0.25, linetype = "solid",
+                                    colour = "darkslategray3")
+  )
   if (opt$factor == "") {
-    ggplot(embedding, aes(x = V1, y = V2)) +
-      geom_point(size = 1, color = "deepskyblue4") +
-      gg_legend +
-      xlab("t-SNE 1") +
-      ylab("t-SNE 2") +
-      ggtitle("t-SNE") +
-      if (opt$labels) {
-        geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 1.5, color = "deepskyblue4")
-      }
-    } else {
-    if (is.numeric(contrasting_factor$factor)) {
-      embedding$factor <- contrasting_factor$factor
-    } else {
-      embedding$factor <- as.factor(contrasting_factor$factor)
-    }
-
-    ggplot(embedding, aes(x = V1, y = V2, color = factor)) +
-      geom_point(size = 1) +
+    p <- ggplot(embedding, aes(x = V1, y = V2)) +
+      geom_point(size = pointsize * 0.25, color = pointcolor) +
       gg_legend +
       xlab("t-SNE 1") +
       ylab("t-SNE 2") +
       ggtitle("t-SNE") +
+      the_theme +
       if (opt$labels) {
-        geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = 1.5)
+        geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = pointsize, color = pointcolor)
       }
+  } else {
+    if (is.numeric(contrasting_factor[, 2])) {
+      embedding$factor <- contrasting_factor[, 2]
+    } else {
+      embedding$factor <- as.factor(contrasting_factor[, 2])
     }
-  ggsave(file = opt$pdf_out, device = "pdf")
-
-  #save coordinates table
-  if (opt$table_coordinates != "") {
-  coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6))
-  colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$Rtsne_dims)))
-  }
-}
-
-
-######### make PCA with FactoMineR #################
-if (opt$visu_choice == "PCA") {
-  pca <- PCA(t(data), ncp = opt$PCA_npc, graph = FALSE)
-  pdf(opt$pdf_out)
-  if (opt$labels == FALSE) {
-    plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), label = "none", col.ind = factor_cols)
-    } else {
-    plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), cex = 0.2, col.ind = factor_cols)
-  }
-if (opt$factor != "") {
-  if (is.factor(contrasting_factor$factor)) {
-    legend(x = "topright",
-       legend = as.character(factor_colors$factor),
-       col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7)
-  } else {
-    legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
+    p <- ggplot(embedding, aes(x = V1, y = V2, color = factor)) +
+      geom_point(size = pointsize * 0.25) +
+      gg_legend +
+      xlab("t-SNE 1") +
+      ylab("t-SNE 2") +
+      ggtitle("t-SNE") +
+      the_theme +
+      if (opt$labels) {
+        geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = pointsize)
+      }
   }
-}
-dev.off()
-
-  #save coordinates table
-  if (opt$table_coordinates != "") {
-  coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6))
-  colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$PCA_npc)))
-  }
-
-}
-
-########### make HCPC with FactoMineR ##########
-if (opt$visu_choice == "HCPC") {
-
-# HCPC starts with a PCA
-pca <- PCA(
-    t(data),
-    ncp = opt$HCPC_npc,
-    graph = FALSE,
-)
-
-pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA
-
-# Hierarchical Clustering On Principal Components Followed By Kmean Clustering
-res_hcpc <- HCPC(pca,
-                 nb.clust = opt$HCPC_ncluster, metric = opt$HCPC_metric, method = opt$HCPC_method,
-                 graph = FALSE, consol = opt$HCPC_consol, iter.max = opt$HCPC_itermax, min = opt$HCPC_min, max = opt$HCPC_max,
-                 cluster.CA = opt$HCPC_clusterCA, kk = opt$HCPC_kk)
-# HCPC plots
-dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2
-pdf(opt$pdf_out)
-plot(res_hcpc, choice = "tree")
-plot(res_hcpc, choice = "bar")
-plot(res_hcpc, choice = "3D.map")
-if (opt$labels == FALSE) {
-plot(res_hcpc, choice = "map", label = "none")
-} else {
-plot(res_hcpc, choice = "map")
+  pdf(opt$pdf_out)
+  print(p)
+  dev.off()
 }
-
-# user contrasts on the pca
-if (opt$factor != "") {
-  plot(pca, label = "none", col.ind = factor_cols)
-  if (is.factor(contrasting_factor$factor)) {
-    legend(x = "topright",
-       legend = as.character(factor_colors$factor),
-       col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7)
-
-    ## Normalized Mutual Information
-    sink(opt$HCPC_mutual_info)
-    res <- external_validation(
-       true_labels = as.numeric(contrasting_factor$factor),
-       clusters = as.numeric(res_hcpc$data.clust$clust),
-       summary_stats = TRUE
-    )
-    sink()
-
-  } else {
-    legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
-  }
-}
-
-dev.off()
-
-if (opt$table_coordinates != "") {
-  coord_table <- cbind(Cell = rownames(res_hcpc$call$X),
-                       round(as.data.frame(res_hcpc$call$X[, -length(res_hcpc$call$X)]), 6),
-                       as.data.frame(res_hcpc$call$X[, length(res_hcpc$call$X)])
-                       )
-  colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$HCPC_npc)), "Cluster")
-  }
-
-if (opt$HCPC_clust != "") {
-res_clustering <- data.frame(Cell = rownames(res_hcpc$data.clust),
-                             Cluster = res_hcpc$data.clust$clust)
-
- }
-
-# Description of cluster by most contributing variables / gene expressions
-
-# first transform list of vectors in a list of dataframes
-extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame)
-# second, transfer rownames (genes) to column in the dataframe, before rbinding
-extract_description_w_genes <- Map(cbind,
-                                   extract_description,
-                                   genes = lapply(extract_description, rownames)
-                                   )
-# Then use data.table to collapse all generated dataframe, with the cluster id in first column
-# using the {data.table} rbindlist function
-cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id")
-cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # reorganize columns
-
-
-# Finally, output cluster description data frame
-write.table(
-    cluster_description,
-    file = opt$HCPC_cluster_description,
-    sep = "\t",
-    quote = FALSE,
-    col.names = TRUE,
-    row.names = FALSE
-    )
-
-}
-
-## Return coordinates file to user
-
-if (opt$table_coordinates != "") {
-  write.table(
-    coord_table,
-    file = opt$table_coordinates,
-    sep = "\t",
-    quote = FALSE,
-    col.names = TRUE,
-    row.names = FALSE
-    )
-}
-
-
-if (opt$HCPC_clust != "") {
-  write.table(
-    res_clustering,
-    file = opt$HCPC_clust,
-    sep = "\t",
-    quote = FALSE,
-    col.names = TRUE,
-    row.names = FALSE
-    )
-}