diff CorrTable/Corr_Script_samples_row.R @ 0:b22c453e4cf4 draft

Uploaded
author melpetera
date Thu, 11 Oct 2018 05:35:55 -0400
parents
children 29ec7e3afdd4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CorrTable/Corr_Script_samples_row.R	Thu Oct 11 05:35:55 2018 -0400
@@ -0,0 +1,410 @@
+  #################################################################################################
+  # CORRELATION TABLE                                                                             #
+  #                                                                                               #
+  #                                                                                               #
+  # Input : 2 tables with common samples                                                          #
+  # Output : Correlation table ; Heatmap (pdf)                                                    #
+  #                                                                                               #
+  # Dependencies : Libraries "ggplot2" and "reshape2"                                             #
+  #                                                                                               #
+  #################################################################################################
+  
+  
+  # Parameters (for dev)
+  if(FALSE){
+    
+    rm(list = ls())
+    setwd(dir = "Y:/Developpement")
+
+    tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular"
+    tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt"
+    param1.samples <- "column"
+    param2.samples <- "row"
+    corr.method <- "pearson"
+    test.corr <- "yes"
+    alpha <- 0.05
+    multi.name <- "none"
+    filter <- "yes"
+    filters.choice <- "filters_0_thr"
+    threshold <- 0.2
+    reorder.var <- "yes"
+    color.heatmap <- "yes"
+    type.classes <-"irregular"
+    reg.value <- 1/3
+    irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4)
+    output1 <- "Correlation_table.txt"
+    output2 <- "Heatmap.pdf"
+    
+  }
+  
+  
+  
+  correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, 
+                              multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes, 
+                              reg.value, irreg.vect, output1, output2){
+
+    # This function allows to visualize the correlation between two tables
+    # 
+    # Parameters:
+    # - tab1.name: table 1 file's access
+    # - tab2.name: table 2 file's access
+    # - param1.samples ("row" or "column"): where the samples are in tab1
+    # - param2.samples ("row" or "column"): where the samples are in tab2
+    # - corr.method ("pearson", "spearman", "kendall"): 
+    # - test.corr ("yes" or "no"): test the significance of a correlation coefficient
+    # - alpha (value between 0 and 1): risk for the correlation significance test
+    # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests
+    # - filter ("yes", "no"): use filter.0 or/and filter.threshold
+    # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0
+    #   and threshold filter remove variables with all their correlation coefficients in abs < threshold
+    # - threshold (value between 0 and 1): threshold for filter threshold
+    # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA
+    # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user
+    # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes
+    # - reg.value (value between 0 and 1): value for regular classes
+    # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes)
+    # - output1: correlation table file's access
+    # - output2: heatmap (colored correlation table) file's access
+  
+  
+  # Input ----------------------------------------------------------------------------------------------
+  
+  tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) 
+  tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1)  
+  
+  # Transpose tables according to the samples
+  if(param1.samples == "column"){
+    tab1 <- t(tab1)
+  } 
+  
+  if(param2.samples == "column"){
+    tab2 <- t(tab2)
+  }
+  
+  # Sorting tables in alphabetical order of the samples
+  tab1 <- tab1[order(rownames(tab1)),]
+  tab2 <- tab2[order(rownames(tab2)),]
+  
+  
+  # Check if the 2 datasets match regarding samples identifiers
+  # Adapt from functions "check.err" and "match2", RcheckLibrary.R
+    
+  err.stock <- NULL
+    
+  id1 <- rownames(tab1)
+  id2 <- rownames(tab2)
+    
+  if(sum(id1 != id2) > 0){
+    err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n")
+      
+    if(length(which(id1%in%id2)) != length(id1)){
+      identif <- id1[which(!(id1%in%id2))]
+      if (length(identif) < 4){
+        err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n")
+      }
+      else {
+        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n")
+      }
+      identif <- identif[1:min(3,length(which(!(id1%in%id2))))]
+      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+    }
+      
+    if(length(which(id2%in%id1)) != length(id2)){
+      identif <- id2[which(!(id2%in%id1))]
+      if (length(identif) < 4){
+        err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n")
+      }
+      else{
+        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n")
+      }
+      identif <- identif[1:min(3,length(which(!(id2%in%id1))))] 
+      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+    }
+    err.stock <- c(err.stock,"\nPlease check your data.\n")
+  }
+  
+  if(length(err.stock)!=0){ 
+    stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n")
+  }
+  
+    
+  # Check qualitative variables in each input tables
+  err.msg <- NULL
+  
+  var1.quali <- vector()
+  var2.quali <- vector()
+  
+  for (i in 1:dim(tab1)[2]){
+    if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){
+      var1.quali <- c(var1.quali,i)
+    }
+  }
+  
+  for (j in 1:dim(tab2)[2]){
+    if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){
+      var2.quali <- c(var2.quali, j)
+    }
+  }
+  
+  if (length(var1.quali) != 0 | length(var2.quali) != 0){
+    err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n")
+    
+    if(length(var1.quali) != 0 && length(var1.quali) < 4){
+      err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n",
+                   "  ",paste(colnames(tab1)[var1.quali],collapse="\n    "),"\n")
+    } else if(length(var1.quali) != 0 && length(var1.quali) > 3){
+      err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n",
+                   "  ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n    "),"\n")
+    }
+    
+    if(length(var2.quali) != 0 && length(var2.quali) < 4){
+      err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", 
+                   "  ",paste(colnames(tab2)[var2.quali],collapse="\n    "),"\n")
+    } else if(length(var2.quali) != 0 && length(var2.quali) > 3){
+      err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", 
+                   "  ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n    "),"\n")
+    }  
+  }
+  
+  if(length(var1.quali) != 0){
+    tab1 <- tab1[,-var1.quali]
+  }
+  if(length(var2.quali) != 0){
+    tab2 <- tab2[,-var2.quali]
+  }
+  
+  if(length(err.msg) != 0){ 
+    cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n")
+  }
+  
+  # Correlation table ---------------------------------------------------------------------------------
+  
+  tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2])
+  for (i in 1:dim(tab2)[2]){
+    for (j in 1:dim(tab1)[2]){
+      tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") 
+    }
+  }
+  
+  colnames(tab.corr) <- colnames(tab1)
+  rownames(tab.corr) <- colnames(tab2)
+  
+  
+  
+  # Significance of correlation test  ------------------------------------------------------------------
+  
+  if (test.corr == "yes"){
+    
+    pvalue <- vector()
+    for (i in 1:dim(tab.corr)[1]){
+      for (j in 1:dim(tab.corr)[2]){
+        suppressWarnings(corrtest <- cor.test(tab2[,i], tab1[,j], method = corr.method))
+        pvalue <- c(pvalue, corrtest$p.value)
+        if (multi.name == "none"){
+          if (corrtest$p.value > alpha){
+            tab.corr[i,j] <- 0
+          }
+        }
+      }
+    }
+  
+    if(multi.name != "none"){
+      adjust <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2], byrow = T)
+      tab.corr[adjust > alpha] <- 0
+    }
+  }  
+  
+  
+  # Filter settings ------------------------------------------------------------------------------------
+  
+  if (filter == "yes"){
+    
+    # Remove variables with all their correlation coefficients = 0 :
+    if (filters.choice == "filter_0"){
+      threshold <- 0
+    } 
+      
+    var2.thres <- vector()
+    for (i in 1:dim(tab.corr)[1]){
+      if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){
+        var2.thres <- c(var2.thres, i)
+      }
+    }
+    
+    if (length(var2.thres) != 0){
+      tab.corr <- tab.corr[-var2.thres,]
+      tab2 <- tab2[, -var2.thres]
+    }
+    
+    var1.thres <- vector()
+    for (i in 1:dim(tab.corr)[2]){
+      if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){
+        var1.thres <- c(var1.thres, i)
+      }
+    }
+      
+    if (length(var1.thres) != 0){
+      tab.corr <- tab.corr[,-var1.thres]
+      tab1 <- tab1[,-var1.thres]
+    }
+    
+  } 
+  
+  
+  # Reorder variables in the correlation table (with the HCA) ------------------------------------------
+  if (reorder.var == "yes"){
+    
+    cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs")
+    dist.tab2 <- as.dist(1 - cormat.tab2)
+    hc.tab2 <- hclust(dist.tab2, method = "ward.D2")
+    tab.corr <- tab.corr[hc.tab2$order,]
+    
+    cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs")
+    dist.tab1 <- as.dist(1 - cormat.tab1)
+    hc.tab1 <- hclust(dist.tab1, method = "ward.D2")
+    tab.corr <- tab.corr[,hc.tab1$order]
+    
+  }
+  
+  
+  
+  # Output 1 : Correlation table -----------------------------------------------------------------------
+  
+  # Export correlation table
+  write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE)
+  
+  
+  
+  # Create the heatmap ---------------------------------------------------------------------------------
+  
+  # A message if no variable kept
+  if(length(tab.corr)==0){
+	pdf(output2)
+	plot.new()
+	legend("center","Filtering leads to no remaining correlation coefficient.")
+	dev.off()
+  } else {
+  
+  
+  library(ggplot2)
+  library(reshape2)
+  
+  # Melt the correlation table :
+  melted.tab.corr <- melt(tab.corr)
+  
+  if (color.heatmap == "yes") {
+     
+    # Add a column for the classes of each correlation coefficient
+    classe <- rep(0, dim(melted.tab.corr)[1])
+    melted <- cbind(melted.tab.corr, classe)
+    
+    if (type.classes == "regular"){
+      
+      vect <- vector()
+      if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){
+        vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], 
+                  rev(seq(1,0,-reg.value)))
+      } else {
+        vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value)))
+      }
+      
+    } else if (type.classes == "irregular") {
+      
+      irreg.vect <- c(-1, irreg.vect, 1) 
+      vect <- irreg.vect
+      
+    }
+    
+    # Color palette :  
+    myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline")
+    
+    # Create vector intervals
+    cl <- vector()
+    cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "")
+    
+    for (x in 2:(length(vect)-1)) {
+      if (vect[x+1] == 0) {
+        cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = ""))
+      } else {
+        cl <- c(cl, paste("]", round(vect[x],3), ";", 
+                          round(vect[x+1],3), "]", sep = ""))
+      }
+    }
+    
+    # Assign an interval to each correlation coefficient
+    for (i in 1:dim(melted.tab.corr)[1]){
+      for (j in 1:(length(cl))){
+        if (vect[j] == -1){
+          melted$classe[i][melted$value[i] >= vect[j] 
+                           && melted$value[i] <= vect[j+1]] <- cl[j]
+        } else {
+          melted$classe[i][melted$value[i] > vect[j] 
+                           && melted$value[i] <= vect[j+1]] <- cl[j]
+        }
+      }
+    }
+    
+    # Find the 0 and assign it the white as name
+    if (length(which(vect == 0)) == 1) {
+      melted$classe[melted$value == 0] <- "0"
+      indic <- which(vect == 0)
+      cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)])
+      names(cl)[indic] <- "#FFFFFF"
+    } else if (length(which(vect == 0)) == 0) {
+      indic <- 0
+      for (x in 1:(length(vect)-1)) {
+        if (0 > vect[x] && 0 <= vect[x+1]) {
+          names(cl)[x] <- "#FFFFFF"
+          indic <- x
+        }
+      }
+    }
+    
+    indic <- length(cl) - indic + 1
+    cl <- rev(cl)
+    
+    # Assign the colors of each intervals as their name
+    names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1]
+    names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))]
+    
+    
+    melted$classe <- factor(melted$classe)
+    melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)])
+    
+    # Heatmap if color.heatmap = yes :
+    ggplot(melted, aes(Var2, Var1, fill = classe)) +
+      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
+      geom_tile(color ="ghostwhite") +
+      scale_fill_manual( breaks = levels(melted$classe),
+                         values = names(cl)[cl%in%levels(melted$classe)],
+                         name = paste(corr.method, "correlation", sep = "\n")) +
+      theme_classic() +
+      theme(axis.text.x = element_text(angle = 90, vjust = 0.5), 
+            plot.title = element_text(hjust = 0.5)) 
+    
+  } else {
+    
+    # Heatmap if color.heatmap = no :
+    ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) +
+      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
+      geom_tile(color ="ghostwhite") +
+      scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1),
+                           name = paste(corr.method, "correlation", sep = "\n")) +
+      theme_classic() +
+      theme(axis.text.x = element_text(angle = 90, vjust = 0.5), 
+            plot.title = element_text(hjust = 0.5))
+  }
+  
+  
+  ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE)
+  
+  
+  } # End if(length(tab.corr)==0)else
+  
+  } # End of correlation.tab
+  
+  
+  # Function call
+  # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter,
+  #                             filters.choice, threshold, reorder.var, color.heatmap, type.classes,
+  #                             reg.value, irreg.vect, output1, output2)