Mercurial > repos > melpetera > corr_table

  #################################################################################################
  # CORRELATION TABLE                                                                             #
  #                                                                                               #
  #                                                                                               #
  # Input : 2 tables with common samples                                                          #
  # Output : Correlation table ; Heatmap (pdf)                                                    #
  #                                                                                               #
  # Dependencies : Libraries "ggplot2" and "reshape2"                                             #
  #                                                                                               #
  #################################################################################################


  # Parameters (for dev)
  if(FALSE){

    rm(list = ls())
    setwd(dir = "Y:/Developpement")

    tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular"
    tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt"
    param1.samples <- "column"
    param2.samples <- "row"
    corr.method <- "pearson"
    test.corr <- "yes"
    alpha <- 0.05
    multi.name <- "none"
    filter <- "yes"
    filters.choice <- "filters_0_thr"
    threshold <- 0.2
    reorder.var <- "yes"
    color.heatmap <- "yes"
    type.classes <-"irregular"
    reg.value <- 1/3
    irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4)
    output1 <- "Correlation_table.txt"
    output2 <- "Heatmap.pdf"

  }


  correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha,
                              multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes,
                              reg.value, irreg.vect, output1, output2){

    # This function allows to visualize the correlation between two tables
    #
    # Parameters:
    # - tab1.name: table 1 file's access
    # - tab2.name: table 2 file's access
    # - param1.samples ("row" or "column"): where the samples are in tab1
    # - param2.samples ("row" or "column"): where the samples are in tab2
    # - corr.method ("pearson", "spearman", "kendall"):
    # - test.corr ("yes" or "no"): test the significance of a correlation coefficient
    # - alpha (value between 0 and 1): risk for the correlation significance test
    # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests
    # - filter ("yes", "no"): use filter.0 or/and filter.threshold
    # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0
    #   and threshold filter remove variables with all their correlation coefficients in abs < threshold
    # - threshold (value between 0 and 1): threshold for filter threshold
    # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA
    # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user
    # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes
    # - reg.value (value between 0 and 1): value for regular classes
    # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes)
    # - output1: correlation table file's access
    # - output2: heatmap (colored correlation table) file's access


  # Input ----------------------------------------------------------------------------------------------

  tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1)
  tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1)

  # Transpose tables according to the samples
  if(param1.samples == "column"){
    tab1 <- t(tab1)
  }

  if(param2.samples == "column"){
    tab2 <- t(tab2)
  }

  # Sorting tables in alphabetical order of the samples
  tab1 <- tab1[order(rownames(tab1)),]
  tab2 <- tab2[order(rownames(tab2)),]


  # Check if the 2 datasets match regarding samples identifiers
  # Adapt from functions "check.err" and "match2", RcheckLibrary.R

  err.stock <- NULL

  id1 <- rownames(tab1)
  id2 <- rownames(tab2)

  if(sum(id1 != id2) > 0){
    err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n")

    if(length(which(id1%in%id2)) != length(id1)){
      identif <- id1[which(!(id1%in%id2))]
      if (length(identif) < 4){
        err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n")
      }
      else {
        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n")
      }
      identif <- identif[1:min(3,length(which(!(id1%in%id2))))]
      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
    }

    if(length(which(id2%in%id1)) != length(id2)){
      identif <- id2[which(!(id2%in%id1))]
      if (length(identif) < 4){
        err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n")
      }
      else{
        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n")
      }
      identif <- identif[1:min(3,length(which(!(id2%in%id1))))]
      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
    }
    err.stock <- c(err.stock,"\nPlease check your data.\n")
  }

  if(length(err.stock)!=0){
    stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n")
  }


  # Check qualitative variables in each input tables
  err.msg <- NULL

  var1.quali <- vector()
  var2.quali <- vector()

  for (i in 1:dim(tab1)[2]){
    if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){
      var1.quali <- c(var1.quali,i)
    }
  }

  for (j in 1:dim(tab2)[2]){
    if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){
      var2.quali <- c(var2.quali, j)
    }
  }

  if (length(var1.quali) != 0 | length(var2.quali) != 0){
    err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n")

    if(length(var1.quali) != 0 && length(var1.quali) < 4){
      err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.quali],collapse="\n    "),"\n")
    } else if(length(var1.quali) != 0 && length(var1.quali) > 3){
      err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n    "),"\n")
    }

    if(length(var2.quali) != 0 && length(var2.quali) < 4){
      err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab2)[var2.quali],collapse="\n    "),"\n")
    } else if(length(var2.quali) != 0 && length(var2.quali) > 3){
      err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n    "),"\n")
    }
  }

  if(length(var1.quali) != 0){
    tab1 <- tab1[,-var1.quali]
  }
  if(length(var2.quali) != 0){
    tab2 <- tab2[,-var2.quali]
  }

  if(length(err.msg) != 0){
    cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n")
  }

  # Correlation table ---------------------------------------------------------------------------------

  tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2])
  for (i in 1:dim(tab2)[2]){
    for (j in 1:dim(tab1)[2]){
      tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs")
    }
  }

  colnames(tab.corr) <- colnames(tab1)
  rownames(tab.corr) <- colnames(tab2)


  # Significance of correlation test  ------------------------------------------------------------------

  if (test.corr == "yes"){

    pvalue <- vector()
    for (i in 1:dim(tab.corr)[1]){
      for (j in 1:dim(tab.corr)[2]){
        suppressWarnings(corrtest <- cor.test(tab2[,i], tab1[,j], method = corr.method))
        pvalue <- c(pvalue, corrtest$p.value)
        if (multi.name == "none"){
          if (corrtest$p.value > alpha){
            tab.corr[i,j] <- 0
          }
        }
      }
    }

    if(multi.name != "none"){
      adjust <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2], byrow = T)
      tab.corr[adjust > alpha] <- 0
    }
  }


  # Filter settings ------------------------------------------------------------------------------------

  if (filter == "yes"){

    # Remove variables with all their correlation coefficients = 0 :
    if (filters.choice == "filter_0"){
      threshold <- 0
    }

    var2.thres <- vector()
    for (i in 1:dim(tab.corr)[1]){
      if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){
        var2.thres <- c(var2.thres, i)
      }
    }

    if (length(var2.thres) != 0){
      tab.corr <- tab.corr[-var2.thres,]
      tab2 <- tab2[, -var2.thres]
    }

    var1.thres <- vector()
    for (i in 1:dim(tab.corr)[2]){
      if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){
        var1.thres <- c(var1.thres, i)
      }
    }

    if (length(var1.thres) != 0){
      tab.corr <- tab.corr[,-var1.thres]
      tab1 <- tab1[,-var1.thres]
    }

  }


  # Reorder variables in the correlation table (with the HCA) ------------------------------------------
  if (reorder.var == "yes"){

    cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs")
    dist.tab2 <- as.dist(1 - cormat.tab2)
    hc.tab2 <- hclust(dist.tab2, method = "ward.D2")
    tab.corr <- tab.corr[hc.tab2$order,]

    cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs")
    dist.tab1 <- as.dist(1 - cormat.tab1)
    hc.tab1 <- hclust(dist.tab1, method = "ward.D2")
    tab.corr <- tab.corr[,hc.tab1$order]

  }


  # Output 1 : Correlation table -----------------------------------------------------------------------

  # Export correlation table
  write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE)


  # Create the heatmap ---------------------------------------------------------------------------------

  # A message if no variable kept
  if(length(tab.corr)==0){
	pdf(output2)
	plot.new()
	legend("center","Filtering leads to no remaining correlation coefficient.")
	dev.off()
  } else {


  library(ggplot2)
  library(reshape2)

  # Melt the correlation table :
  melted.tab.corr <- melt(tab.corr)

  if (color.heatmap == "yes") {

    # Add a column for the classes of each correlation coefficient
    classe <- rep(0, dim(melted.tab.corr)[1])
    melted <- cbind(melted.tab.corr, classe)

    if (type.classes == "regular"){

      vect <- vector()
      if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){
        vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))],
                  rev(seq(1,0,-reg.value)))
      } else {
        vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value)))
      }

    } else if (type.classes == "irregular") {

      irreg.vect <- c(-1, irreg.vect, 1)
      vect <- irreg.vect

    }

    # Color palette :
    myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline")

    # Create vector intervals
    cl <- vector()
    cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "")

    for (x in 2:(length(vect)-1)) {
      if (vect[x+1] == 0) {
        cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = ""))
      } else {
        cl <- c(cl, paste("]", round(vect[x],3), ";",
                          round(vect[x+1],3), "]", sep = ""))
      }
    }

    # Assign an interval to each correlation coefficient
    for (i in 1:dim(melted.tab.corr)[1]){
      for (j in 1:(length(cl))){
        if (vect[j] == -1){
          melted$classe[i][melted$value[i] >= vect[j]
                           && melted$value[i] <= vect[j+1]] <- cl[j]
        } else {
          melted$classe[i][melted$value[i] > vect[j]
                           && melted$value[i] <= vect[j+1]] <- cl[j]
        }
      }
    }

    # Find the 0 and assign it the white as name
    if (length(which(vect == 0)) == 1) {
      melted$classe[melted$value == 0] <- "0"
      indic <- which(vect == 0)
      cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)])
      names(cl)[indic] <- "#FFFFFF"
    } else if (length(which(vect == 0)) == 0) {
      indic <- 0
      for (x in 1:(length(vect)-1)) {
        if (0 > vect[x] && 0 <= vect[x+1]) {
          names(cl)[x] <- "#FFFFFF"
          indic <- x
        }
      }
    }

    indic <- length(cl) - indic + 1
    cl <- rev(cl)

    # Assign the colors of each intervals as their name
    names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1]
    names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))]


    melted$classe <- factor(melted$classe)
    melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)])

    # Heatmap if color.heatmap = yes :
    ggplot(melted, aes(Var2, Var1, fill = classe)) +
      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
      geom_tile(color ="ghostwhite") +
      scale_fill_manual( breaks = levels(melted$classe),
                         values = names(cl)[cl%in%levels(melted$classe)],
                         name = paste(corr.method, "correlation", sep = "\n")) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
            plot.title = element_text(hjust = 0.5))

  } else {

    # Heatmap if color.heatmap = no :
    ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) +
      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
      geom_tile(color ="ghostwhite") +
      scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1),
                           name = paste(corr.method, "correlation", sep = "\n")) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
            plot.title = element_text(hjust = 0.5))
  }


  ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE)


  } # End if(length(tab.corr)==0)else

  } # End of correlation.tab


  # Function call
  # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter,
  #                             filters.choice, threshold, reorder.var, color.heatmap, type.classes,
  #                             reg.value, irreg.vect, output1, output2)
author	melpetera
date	Thu, 11 Oct 2018 05:35:55 -0400
parents
children	29ec7e3afdd4