diff high_dim_visu.R @ 4:8e17c31c536a draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 1282ac9de7c926ab251f88afb2453f52c8b14200
author artbio
date Thu, 11 Jul 2019 12:31:28 -0400
parents 8e44c9e18a56
children 569334568afa
line wrap: on
line diff
--- a/high_dim_visu.R	Thu Jun 27 06:17:08 2019 -0400
+++ b/high_dim_visu.R	Thu Jul 11 12:31:28 2019 -0400
@@ -11,7 +11,7 @@
 library(ggplot2)
 library(ggfortify)
 library(RColorBrewer)
-
+library(ClusterR)
 
 # Arguments
 option_list = list(
@@ -92,13 +92,13 @@
     default = 1.0,
     type = 'numeric',
     help = "theta [default : '%default' ]"
-  ), 
+  ),
   make_option(
     "--Rtsne_max_iter",
     default = 1000,
     type = 'integer',
     help = "max_iter [default : '%default' ]"
-  ), 
+  ),
   make_option(
     "--Rtsne_pca",
     default = TRUE,
@@ -128,13 +128,25 @@
     default = 12.0,
     type = 'numeric',
     help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]"
-  ), 
+  ),
    make_option(
     "--PCA_npc",
     default = 5,
     type = 'integer',
     help = "number of dimensions kept in the results [default : '%default' ]"
   ),
+   make_option(
+    "--PCA_x_axis",
+    default = 1,
+    type = 'integer',
+    help = "PC to plot in the x axis [default : '%default' ]"
+  ),
+   make_option(
+    "--PCA_y_axis",
+    default = 2,
+    type = 'integer',
+    help = "PC to plot in the y axis [default : '%default' ]"
+  ),
   make_option(
     "--HCPC_ncluster",
     default = -1,
@@ -200,7 +212,19 @@
     default = -1,
     type = 'numeric',
     help = "The maximum number of iterations for the consolidation [default :'%default']"
-  ) 
+  ),
+  make_option(
+    "--HCPC_clust",
+    default = "",
+    type = 'character',
+    help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']"
+  ),
+  make_option(
+    "--mutual_info",
+    default = "",
+    type = "character",
+    help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']"
+  )
 )
 
 opt = parse_args(OptionParser(option_list = option_list),
@@ -211,6 +235,43 @@
 if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL}
 if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf}
 
+##Add legend to plot()
+legend.col <- function(col, lev){
+
+opar <- par
+
+n <- length(col)
+
+bx <- par("usr")
+
+box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000,
+bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50)
+box.cy <- c(bx[3], bx[3])
+box.sy <- (bx[4] - bx[3]) / n
+
+xx <- rep(box.cx, each = 2)
+
+par(xpd = TRUE)
+for(i in 1:n){
+
+yy <- c(box.cy[1] + (box.sy * (i - 1)),
+box.cy[1] + (box.sy * (i)),
+box.cy[1] + (box.sy * (i)),
+box.cy[1] + (box.sy * (i - 1)))
+polygon(xx, yy, col = col[i], border = col[i])
+
+}
+par(new = TRUE)
+plot(0, 0, type = "n",
+ylim = c(min(lev), max(lev)),
+yaxt = "n", ylab = "",
+xaxt = "n", xlab = "",
+frame.plot = FALSE)
+axis(side = 4, las = 2, tick = FALSE, line = .25)
+par <- opar
+}
+
+
 data = read.table(
   opt$data,
   check.names = FALSE,
@@ -228,20 +289,24 @@
   rownames(contrasting_factor) <- contrasting_factor[,1]
   contrasting_factor <- contrasting_factor[colnames(data),]
   colnames(contrasting_factor) <- c("id","factor")
-  contrasting_factor$factor <- as.factor(contrasting_factor$factor)
-  if(nlevels(contrasting_factor$factor) == 2){
-    colors_labels <- c("#E41A1C", "#377EB8")
+  if(is.numeric(contrasting_factor$factor)){
+    factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor]
   } else {
-    colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired')
+    contrasting_factor$factor <- as.factor(contrasting_factor$factor)
+    if(nlevels(contrasting_factor$factor) == 2){
+      colors_labels <- c("#E41A1C", "#377EB8")
+    } else {
+      colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired')
+    }
+    factorColors <-
+      with(contrasting_factor,
+           data.frame(factor = levels(contrasting_factor$factor),
+                      color = I(colors_labels)
+           )
+      )
+    factor_cols <- factorColors$color[match(contrasting_factor$factor,
+                                          factorColors$factor)]
   }
-  factorColors <-
-    with(contrasting_factor,
-         data.frame(factor = levels(contrasting_factor$factor),
-                    color = I(colors_labels)
-         )
-    )
-  factor_cols <- factorColors$color[match(contrasting_factor$factor,
-                                          factorColors$factor)]
 } else {
   factor_cols <- rep("deepskyblue4", length(rownames(data)))
 }
@@ -272,19 +337,24 @@
     ggplot(embedding, aes(x=V1, y=V2)) +
       geom_point(size=1, color='deepskyblue4') +
       gg_legend +
-      xlab("") +
-      ylab("") +
+      xlab("t-SNE 1") +
+      ylab("t-SNE 2") +
       ggtitle('t-SNE') +
       if (opt$labels) {
         geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4')
       }
     } else {
-    embedding$factor <- as.factor(contrasting_factor$factor)
+    if(is.numeric(contrasting_factor$factor)){
+      embedding$factor <- contrasting_factor$factor
+    } else {
+      embedding$factor <- as.factor(contrasting_factor$factor)
+    }
+
     ggplot(embedding, aes(x=V1, y=V2, color=factor)) +
       geom_point(size=1) + #, color=factor_cols
       gg_legend +
-      xlab("") +
-      ylab("") +
+      xlab("t-SNE 1") +
+      ylab("t-SNE 2") +
       ggtitle('t-SNE') +
       if (opt$labels) {
         geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5)
@@ -294,7 +364,7 @@
   
   #save coordinates table
   if(opt$table_coordinates != ''){
-  coord_table <- cbind(rownames(tdf),as.data.frame(tsne_out$Y))
+  coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6))
   colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims)))
   }
 }
@@ -305,20 +375,24 @@
   pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE)
   pdf(opt$pdf_out)
   if (opt$labels == FALSE) {
-    plot(pca, label="none" , col.ind = factor_cols)
+    plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), label="none" , col.ind = factor_cols)
     } else {
-    plot(pca, cex=0.2 , col.ind = factor_cols)
+    plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), cex=0.2 , col.ind = factor_cols)
   }
 if (opt$factor != '') {
+  if(is.factor(contrasting_factor$factor)) {
     legend(x = 'topright', 
        legend = as.character(factorColors$factor),
        col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
+  } else {
+    legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
+  }
 }
 dev.off()
 
   #save coordinates table
   if(opt$table_coordinates != ''){
-  coord_table <- cbind(rownames(pca$ind$coord),as.data.frame(pca$ind$coord))
+  coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6))
   colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc)))
   }
 
@@ -355,11 +429,25 @@
 
 # user contrasts on the pca
 if (opt$factor != '') {
-    plot(pca, label="none", habillage="ind", col.hab=factor_cols)
+  plot(pca, label="none", col.ind = factor_cols)
+  if(is.factor(contrasting_factor$factor)) {
     legend(x = 'topright', 
        legend = as.character(factorColors$factor),
        col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
-    }
+    
+    ## Normalized Mutual Information
+    sink(opt$mutual_info)
+    res <- external_validation(
+       true_labels = as.numeric(contrasting_factor$factor),
+       clusters = as.numeric(res.hcpc$data.clust$clust),
+       summary_stats = TRUE
+    )
+    sink()
+
+  } else {
+    legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
+  }
+}
 ## Clusters to which individual observations belong # used ?
 # Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)],
 #                     Observation = rownames(res.hcpc$data.clust))
@@ -380,21 +468,22 @@
 # Sil = fviz_silhouette(hc.cut)
 # sil1 = as.data.frame(Sil$data)
 
-## Normalized Mutual Information # to be implemented later
-# sink(opt$mutual_info)
-# res = external_validation(
-#   as.numeric(factor(metadata[, Patient])),
-#   as.numeric(metadata$Cluster),
-#   method = "adjusted_rand_index",
-#   summary_stats = TRUE
-# )
-# sink()
 dev.off()
 
  if(opt$table_coordinates != ''){
-  coord_table <- cbind(Cell=rownames(res.hcpc$call$X),as.data.frame(res.hcpc$call$X))
+  coord_table <- cbind(Cell=rownames(res.hcpc$call$X),
+                       round(as.data.frame(res.hcpc$call$X[, -length(res.hcpc$call$X)]), 6),
+                       as.data.frame(res.hcpc$call$X[, length(res.hcpc$call$X)])
+                       )
   colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster")
   }
+
+ if(opt$HCPC_clust != ""){
+ res_clustering <- data.frame(Cell = rownames(res.hcpc$data.clust),
+                              Cluster = res.hcpc$data.clust$clust)
+ 
+ }
+
 }
 
 ## Return coordinates file to user
@@ -411,7 +500,16 @@
 }
 
 
-  
+if(opt$HCPC_clust != ""){
+  write.table(
+    res_clustering,
+    file = opt$HCPC_clust,
+    sep = "\t",
+    quote = F,
+    col.names = T,
+    row.names = F
+    )
+}