view high_dim_visu.R @ 3:8e44c9e18a56 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit bd9a794c0eb55cebd2f22b528c922d306df0214e
author artbio
date Thu, 27 Jun 2019 06:17:08 -0400
parents 7e7a2a4cfce2
children 8e17c31c536a
line wrap: on
line source

# load packages that are provided in the conda env
options( show.error.messages=F,
       error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify')
warnings()
library(optparse)
library(FactoMineR)
library(factoextra)
library(Rtsne)
library(ggplot2)
library(ggfortify)
library(RColorBrewer)


# Arguments
option_list = list(
  make_option(
    "--data",
    default = NA,
    type = 'character',
    help = "Input file that contains expression value to visualise"
  ),
  make_option(
    "--sep",
    default = '\t',
    type = 'character',
    help = "File separator [default : '%default' ]"
  ),
  make_option(
    "--colnames",
    default = TRUE,
    type = 'logical',
    help = "Consider first line as header ? [default : '%default' ]"
  ),
  make_option(
    "--out",
    default = "res.tab",
    type = 'character',
    help = "Output name [default : '%default' ]"
  ),
  make_option(
    "--labels",
    default = FALSE,
    type = 'logical',
    help = "add labels in scatter plots [default : '%default' ]"
  ),
  make_option(
    "--factor",
    default = '',
    type = 'character',
    help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]"
  ),
  make_option(
    "--visu_choice",
    default = 'PCA',
    type = 'character',
    help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]"
  ),
  make_option(
    "--table_coordinates",
    default = '',
    type = 'character',
    help = "Table with plot coordinates [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_seed",
    default = 42,
    type = 'integer',
    help = "Seed value for reproducibility [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_dims",
    default = 2,
    type = 'integer',
    help = "Output dimensionality [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_initial_dims",
    default = 50,
    type = 'integer',
    help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_perplexity",
    default = 5.0,
    type = 'numeric',
    help = "perplexity [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_theta",
    default = 1.0,
    type = 'numeric',
    help = "theta [default : '%default' ]"
  ), 
  make_option(
    "--Rtsne_max_iter",
    default = 1000,
    type = 'integer',
    help = "max_iter [default : '%default' ]"
  ), 
  make_option(
    "--Rtsne_pca",
    default = TRUE,
    type = 'logical',
    help = "Whether an initial PCA step should be performed [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_pca_center",
    default = TRUE,
    type = 'logical',
    help = "Should data be centered before pca is applied? [default : '%default' ]"
  ),
   make_option(
    "--Rtsne_pca_scale",
    default = FALSE,
    type = 'logical',
    help = "Should data be scaled before pca is applied? [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_normalize",
    default = TRUE,
    type = 'logical',
    help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]"
  ),
  make_option(
    "--Rtsne_exaggeration_factor",
    default = 12.0,
    type = 'numeric',
    help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]"
  ), 
   make_option(
    "--PCA_npc",
    default = 5,
    type = 'integer',
    help = "number of dimensions kept in the results [default : '%default' ]"
  ),
  make_option(
    "--HCPC_ncluster",
    default = -1,
    type = 'numeric',
    help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]"
  ),
   make_option(
    "--HCPC_npc",
    default = 5,
    type = 'integer',
    help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]"
  ),
  make_option(
    "--HCPC_metric",
    default = 'euclidian',
    type = 'character',
    help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]"
  ),
  make_option(
    "--HCPC_method",
    default = 'ward',
    type = 'character',
    help = "Clustering method between 'ward','average','single', 'complete', 'weighted'  [default :'%default']"
  ),
  make_option(
    "--pdf_out",
    default = "out.pdf",
    type = 'character',
    help = "pdf of plots [default : '%default' ]"
  ),
  make_option(
    "--HCPC_consol",
    default = 'TRUE',
    type = 'logical',
    help = "If TRUE, a k-means consolidation is performed [default :'%default']"
  ),
  make_option(
    "--HCPC_itermax",
    default = '10',
    type = 'integer',
    help = "The maximum number of iterations for the consolidation [default :'%default']"
  ),
  make_option(
    "--HCPC_min",
    default = '3',
    type = 'integer',
    help = "The least possible number of clusters suggested [default :'%default']"
  ),
   make_option(
    "--HCPC_max",
    default = -1,
    type = 'integer',
    help = "The higher possible number of clusters suggested [default :'%default']"
  ),
   make_option(
    "--HCPC_clusterCA",
    default = 'rows',
    type = 'character',
    help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']"
  ),
  make_option(
    "--HCPC_kk",
    default = -1,
    type = 'numeric',
    help = "The maximum number of iterations for the consolidation [default :'%default']"
  ) 
)

opt = parse_args(OptionParser(option_list = option_list),
                 args = commandArgs(trailingOnly = TRUE))

if (opt$sep == "tab") {opt$sep <- "\t"}
if (opt$sep == "comma") {opt$sep <- ","}
if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL}
if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf}

data = read.table(
  opt$data,
  check.names = FALSE,
  header = opt$colnames,
  row.names = 1,
  sep = opt$sep
)

# Contrasting factor and its colors
if (opt$factor != '') {
  contrasting_factor <- read.delim(
    opt$factor,
    header = TRUE
  )
  rownames(contrasting_factor) <- contrasting_factor[,1]
  contrasting_factor <- contrasting_factor[colnames(data),]
  colnames(contrasting_factor) <- c("id","factor")
  contrasting_factor$factor <- as.factor(contrasting_factor$factor)
  if(nlevels(contrasting_factor$factor) == 2){
    colors_labels <- c("#E41A1C", "#377EB8")
  } else {
    colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired')
  }
  factorColors <-
    with(contrasting_factor,
         data.frame(factor = levels(contrasting_factor$factor),
                    color = I(colors_labels)
         )
    )
  factor_cols <- factorColors$color[match(contrasting_factor$factor,
                                          factorColors$factor)]
} else {
  factor_cols <- rep("deepskyblue4", length(rownames(data)))
}

################  t-SNE ####################
if (opt$visu_choice == 'tSNE') {
  # filter and transpose df for tsne and pca
  tdf = t(data)
  # make tsne and plot results
  set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility

  tsne_out <- Rtsne(tdf,
                    dims = opt$Rtsne_dims,
                    initial_dims = opt$Rtsne_initial_dims, 
                    perplexity = opt$Rtsne_perplexity ,
                    theta = opt$Rtsne_theta,
                    max_iter = opt$Rtsne_max_iter,
                    pca = opt$Rtsne_pca, 
                    pca_center = opt$Rtsne_pca_center,
                    pca_scale = opt$Rtsne_pca_scale,
                    normalize = opt$Rtsne_normalize,
                    exaggeration_factor=opt$Rtsne_exaggeration_factor)

  embedding <- as.data.frame(tsne_out$Y[,1:2])
  embedding$Class <- as.factor(rownames(tdf))
  gg_legend = theme(legend.position="right")
  if (opt$factor == '') {
    ggplot(embedding, aes(x=V1, y=V2)) +
      geom_point(size=1, color='deepskyblue4') +
      gg_legend +
      xlab("") +
      ylab("") +
      ggtitle('t-SNE') +
      if (opt$labels) {
        geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4')
      }
    } else {
    embedding$factor <- as.factor(contrasting_factor$factor)
    ggplot(embedding, aes(x=V1, y=V2, color=factor)) +
      geom_point(size=1) + #, color=factor_cols
      gg_legend +
      xlab("") +
      ylab("") +
      ggtitle('t-SNE') +
      if (opt$labels) {
        geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5)
      }
    }    
  ggsave(file=opt$pdf_out, device="pdf")
  
  #save coordinates table
  if(opt$table_coordinates != ''){
  coord_table <- cbind(rownames(tdf),as.data.frame(tsne_out$Y))
  colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims)))
  }
}


######### make PCA with FactoMineR #################
if (opt$visu_choice == 'PCA') {
  pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE)
  pdf(opt$pdf_out)
  if (opt$labels == FALSE) {
    plot(pca, label="none" , col.ind = factor_cols)
    } else {
    plot(pca, cex=0.2 , col.ind = factor_cols)
  }
if (opt$factor != '') {
    legend(x = 'topright', 
       legend = as.character(factorColors$factor),
       col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
}
dev.off()

  #save coordinates table
  if(opt$table_coordinates != ''){
  coord_table <- cbind(rownames(pca$ind$coord),as.data.frame(pca$ind$coord))
  colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc)))
  }

}

########### make HCPC with FactoMineR ##########
if (opt$visu_choice == 'HCPC') {

# HCPC starts with a PCA
pca <- PCA(
    t(data),
    ncp = opt$HCPC_npc,
    graph = FALSE,
)

PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA

# Hierarchical Clustering On Principal Components Followed By Kmean Clustering
res.hcpc <- HCPC(pca,
                 nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method,
                 graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max,
                 cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk)
# HCPC plots
dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2
pdf(opt$pdf_out)
plot(res.hcpc, choice="tree")
plot(res.hcpc, choice="bar")
plot(res.hcpc, choice="3D.map")
if (opt$labels == FALSE) {
plot(res.hcpc, choice="map", label="none")
} else {
plot(res.hcpc, choice="map")
}

# user contrasts on the pca
if (opt$factor != '') {
    plot(pca, label="none", habillage="ind", col.hab=factor_cols)
    legend(x = 'topright', 
       legend = as.character(factorColors$factor),
       col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
    }
## Clusters to which individual observations belong # used ?
# Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)],
#                     Observation = rownames(res.hcpc$data.clust))
# metadata <- data.frame(Observation=colnames(data), row.names=colnames(data))
# metadata = merge(y = metadata,
#                  x = Clust,
#                  by = "Observation")

# unclear utility
# ObsNumberPerCluster = as.data.frame(table(metadata$Cluster))
# colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber")
# 
# ## Silhouette Plot # not used
# hc.cut = hcut(PCA_IndCoord,
#               k = nlevels(metadata$Cluster),
#               hc_method = "ward.D2")
#               
# Sil = fviz_silhouette(hc.cut)
# sil1 = as.data.frame(Sil$data)

## Normalized Mutual Information # to be implemented later
# sink(opt$mutual_info)
# res = external_validation(
#   as.numeric(factor(metadata[, Patient])),
#   as.numeric(metadata$Cluster),
#   method = "adjusted_rand_index",
#   summary_stats = TRUE
# )
# sink()
dev.off()

 if(opt$table_coordinates != ''){
  coord_table <- cbind(Cell=rownames(res.hcpc$call$X),as.data.frame(res.hcpc$call$X))
  colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster")
  }
}

## Return coordinates file to user

if(opt$table_coordinates != ''){
  write.table(
    coord_table,
    file = opt$table_coordinates,
    sep = "\t",
    quote = F,
    col.names = T,
    row.names = F
    )
}