view MClust_all.R @ 1:f3fa21cda5f5 draft default tip

planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty
author labis-app
date Fri, 10 Aug 2018 15:45:44 -0400
parents
children
line wrap: on
line source

###
# title: ""
# author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia"
# date: ""
# output: pdf
###

###
# Description
###

### Installers ###
# install.packages("qvalue")
# install.packages("ggplot2")
# install.packages("mclust")
# install.packages("reshape2")
# install.packages('getopt')

rm(list=ls())

#Load required packages
suppressMessages(library(qvalue))
suppressMessages(library(ggplot2))
suppressMessages(library(mclust))
suppressMessages(library(reshape2))
suppressMessages(library(getopt))

###

# Set Functions

# Set The directory to start the execution and read a csv file
# directoryName String Name of the directory.
# filename String Name of the csv file.
# Rodrigo Dorado
readCsvDirectory<-function(directoryName, filename, SpaceString) {
  setwd(directoryName)
  fileData <- read.csv(filename,
                       sep = "\t", 
                       header=TRUE, 
                       na.strings = 'NaN',
                       dec=SpaceString,
                       stringsAsFactors=FALSE)
  return(fileData)
}

# Print the dim, columns name and the first 5 rows of a variable
# dataToInform  Object Name of the varaible.
# Rodrigo Dorado
printInfo <- function(dataToInform) {
  print('Length')
  print(length(dataToInform))
  print('---------------')
  print('Dim')
  print(dim(dataToInform))
  print('***************')
  print('Column Names')
  print(colnames(dataToInform))
  print('_______________')
  print('Firsts rows')
  print(head(dataToInform))
  print('_______________')
}

## Put the id of protein as row name, get only the data of the experiments
## proteins matrix All the proteins data
## column_id int The number of the column where the is the id
## init_column int The number of the colmun of the initial experiment
## number_exp int The number of experiments
## Rodrigo Dorado 
getDataofInfo <- function(proteins, column_id, init_column, number_exp) {
  selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)]
  id_proteins_column <- proteins[1:nrow(proteins), column_id]
  ids <- matrix(, ncol = 1, nrow = nrow(proteins))
  j <- 1
  for(i in id_proteins_column){
    id_proteins_aux <- strsplit(i, ";")
    unlist(id_proteins_aux)
    ids[j] <- id_proteins_aux[[1]][1]
    j <- j + 1
  }
  row.names(selected_columns) <- ids
  return(selected_columns)
}

## Verify if the protein contain at least one group with all the values different to NaN
## proteins matrix All the proteins data
## orderCondition int The division per experiment
## Rodrigo Dorado
VarifyAComplteGroup <- function(proteins, orderCondition) {
  protein_erase <- c()
  col_names <- colnames(proteins)
  ## Get every protein
  count <- 0
  for (protein in 1:nrow(proteins)) {
    ok_protein <- FALSE
    num_div <- ncol(orderCondition)
    columnNumber <- 1
    ## Move by a experiment
    for (result_exp in seq(1, nrow(orderCondition))) {
      ok_group <- TRUE
      pos_NaN <- c()
      pos_valid <- c()
      ## Get the name of the experiment
      column_name <- row.names(orderCondition)[result_exp]
      ## Move into the divisions
      for (div_pos in 1:num_div) {
        if (is.na(orderCondition[result_exp, div_pos])){
          break
        }
        pos <- columnNumber
        columnNumber <- columnNumber + 1
        ## pos <- result_exp + (div_pos - 1)
        ## check if is NaN
        if(is.na(proteins[protein, pos])){
          ok_group <- FALSE
          pos_NaN <- c(pos_NaN, pos)
        } else {
          pos_valid <- c(pos_valid, pos)
        }
      }
      ##Check if in the group, if the three are valid
      if (ok_group) {
        ok_protein <- TRUE
      }
      switch_var <- length(pos_NaN)
      allPos <- c(pos_valid, pos_NaN)
      ## Execute conditions
      
      if(length(pos_valid) == 0){
        proteins[protein, pos_NaN] <- 0
      }else if(length(pos_NaN) > 0){
        proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid)
      }
      resultValue <- getTheMedian(proteins, protein, allPos)
      ##if (switch_var == 0) {
      ##resultValue <- getTheMedian(proteins, protein, allPos)
      ##caso1 <- caso1 + 1
      ##}
      ##if (switch_var == 1) {
      ##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid]))
      ##resultValue <- getTheMedian(proteins, protein, allPos)
      ##caso2 <- caso2 + 1
      ##}
      ##if (switch_var == 2) {
      ##proteins[protein, pos_NaN] <- 0
      #### resultValue <- 0
      ##resultValue <- proteins[protein,   ]
      ##caso3 <- caso3 + 1
      ##}
      ##if (switch_var == 3) {
      ##proteins[protein, pos_NaN] <- 0
      ##resultValue <- 0
      ##caso4 <- caso4 + 1
      ##}
      ## Get the unique value
      proteins[protein, column_name] <- resultValue 
    }
    ## Check if the protein conatin at least one group valid
    if (!ok_protein) {
      protein_erase <- c(protein_erase, protein)
    }
  }
  ## return the list
  if(length(protein_erase) > 0) {
    return(proteins[-protein_erase,])
  }else{
    return(proteins)
  }
}

## Get the median of a group
## proteins matrix All the proteins
## protein int The actual protein
## positions int The number of experiment
## Rodrigo Dorado
getTheMedian <- function(proteins, protein, positions) {
  return(median(as.numeric(proteins[protein, positions])))
}

## Merge two data frames by the id
## proteins Data Frame The first data frame
## p_values Data Frame The second data frame
mergeById <- function(proteins, p_values) {
  proteins$auxiliar <- row.names(proteins)
  p_values$auxiliar <- row.names(p_values)
  proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar')
  row.names(proteins) <- proteins[, 'auxiliar']
  return(proteins[, -1])
}

## Get the p values of the ANOVA
## proteins Data Frame All the data of the experiments
## Rodrigo Dorado

##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)

calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY"
  all_p_values <- c()
  for (i in 1:nrow(proteins)) {
    groupNames <- c()
    for (j in colnames(proteins)) {
      groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE))) 
    }
    MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ])))
    colnames(MToANOVA) <- c('GROUP', 'DATA')
    aov1 <- aov(DATA ~ GROUP, data = MToANOVA)
    all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1])
  }
  p_values <- data.frame(all_p_values)
  row.names(p_values) <- row.names(proteins)
  return(p_values)
}

## Select the data that has less than 0.05 the p_value; Replace NA by 0
## selected_columns_proteins Matrix the selected columns of the data
## Rodrigo Dorado
selectSignificanProteinsExpression <- function(selected_columns_proteins) {
  protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0)
  selected_columns_proteins$QValue <- protein_Qvalues$qvalues
  selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)]
  all_row_names <- row.names(selectedSignificantExpr)
  selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric))
  row.names(selectedSignificantExpr) <- all_row_names
  ##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0
  return(selectedSignificantExpr)
}

## Get Z Values of the proteins Expression
## selectedSignificantExpr Matrix The values of the selected proteins
## Rodrigo Dorado
getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) {
  return(as.data.frame(t(scale(t(selectedSignificantExpr)))))
}

## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters
## ZValues Matrix The z values to cluster
## numOfClusters int The max number of clusters
## filename String The name of the csv fil
## Rodrigo Dorado
executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) {
  print("Clustering")
  print(numOfClusters)
  clusterResult <- Mclust(ZValues, G = 2:numOfClusters)
  print(summary(clusterResult))
  Num_clusters <- max(clusterResult$classification, na.rm=TRUE)
  if (Num_clusters >= numOfClusters) {
    newNumOfClusters <- numOfClusters + 30
    executeMClust(ZValues, filename, newNumOfClusters)
  } else {
    clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL)
    table(clusters$Cluster)
    setwd(directoryCSV)
    #write.csv(clusters, filename, row.names = FALSE, quote = FALSE)
    write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t")
    return(clusters) 
  }
}

## Prepare data structure
## ZValues Matrix the Z Values of the proteins selected
## clusters Matrix The clusters of the proteins
## Rodrigo Dorado
getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) {
  timesCond <- row.names(orderCondition)
  ZValues_melt <- ZValues
  ZValues_melt$ProteinID <- rownames(ZValues_melt)
  ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID')
  ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster'))
  colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression')
  ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster)
  for(cond in 1:nrow(orderCondition)) {
    ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
  }
  return(ZValues_melt)
}

## Test the cluster function with diferent sizes
## zvalues data Frame The Z values of the data to get clustered
## number_times array The sizes of the test
## Rodrigo Dorado
testCluster <- function (zvalues, number_times) {
  for (i in number_times) {
    print("***********")
    print(i)
    cluster_result <- Mclust(zvalues, G = 2:i)
    print(summary(cluster_result))
    print("-----------")
  }
}


## Set the color of the background of the image
## Rodrigo dorado
setBackGroundColors <- function() {
  rects <- as.data.frame(matrix(ncol = 3,nrow=2))
  colnames(rects) <- c('xstart','xend','col')
  rects[1,] <- c(-49,0,'gray0')
  rects[2,] <- c(0,73,'white')
  rects$xend <- as.numeric(rects$xend)
  rects$xstart <- as.numeric(rects$xstart)
  return(rects)
}

## Create new folder.
## mainDir String The directory of the new folder.
## folder String The name of the new folder.
createFolder <- function(mainDir, folder) {
  if (file.exists(folder)){
    setwd(file.path(mainDir, folder))
  } else {
    dir.create(file.path(mainDir, folder))
    setwd(file.path(mainDir, folder))
  }
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

## Save all the plots in a pdf file
## name String The name of the file
## data Array All the plots
## Rodrigo Dorado
savePlotPDF <- function(name, data) {
  pdf(name)
  invisible(lapply(data, print))
  dev.off()
}

## Order the conditions per group
## conditions Data Frame All the conditions
## Rodrigo Dorado
orderConditionData <- function(conditions) {
  experimentNames <- unique(conditions[,2])
  maxSizeId <- tail(names(sort(table(conditions[,2]))), 1)
  maxSize <- nrow(conditions[conditions[,2] == maxSizeId,])
  result <- matrix(nrow = length(experimentNames), ncol = maxSize)
  row.names(result) <- experimentNames
  colnames(result) <- c(1:maxSize)
  for(i in 1:length(experimentNames)) {
    for (j in 1:maxSize) {
      result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j]
    }
  }
  return(result)
}

## Get the times or names of the conditions.
## conditions data frame All the text of the control file.
## separaLine Int The line where the flag was founded.
## Rodrigo Dorado
getNamesOfConditions <- function(conditions, separateLine) {
  columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ]
  colnames(columnsNameTime) <- columnsNameTime[1,]
  columnsNameTime <- columnsNameTime[-1,]
  row.names(columnsNameTime) <- columnsNameTime[,1]
  ##columnsNameTime <- columnsNameTime[,-1]
  return(columnsNameTime)
}

## Get the directory of a file
## file String File name and directory
## Rodrigo Dorado
getDirectoryOfFile <- function(file) {
  dir <- file
  allDir <- strsplit(dir, "/")[[1]]
  allEnters<-length(allDir) - 1
  directory<-allDir[1]
  for(i in 2:allEnters) {
    directory<-paste(directory, allDir[i], sep = "/")
  }
  return(list("directory" = directory, "file" = allDir[length(allDir)]) )  
}

spec = matrix(c(
  'filename', 'f', 1, "character",
  'controlFile', 'c', 1, "character",
  #'projectName', 'p', 1, "character",
  'SpaceString', 's', 1, "character",
  'idColumn', 'i', 1, "integer",
  'firstExperiment', 'e', 1, "integer",
  'csvFile', 'v', 1, "character",
  'pdfFile', 'd', 1, "character"
), byrow=TRUE, ncol=4)

opt = getopt(spec) 

filename <- opt$filename
controlFileName <- opt$controlFile
#project_name <- opt$projectName
SpaceString <- opt$SpaceString
id_column <- opt$idColumn
first_experiment <- opt$firstExperiment
csvFile <- opt$csvFile
pdfFile <- opt$pdfFile


directoryF <- getDirectoryOfFile(filename)
directoryC <- getDirectoryOfFile(controlFileName)
directoryCSV <- getDirectoryOfFile(csvFile)
directoryPDF <- getDirectoryOfFile(pdfFile)


#directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file
#filename <- "EXP11_paraR_log.txt" ##File name
#project_name <- "june_20_2018" ##Project name
#controlFileName <- "EXP11_paraR_log_properties" ##Control File
#id_column <- 31 ##Column number of the id of the proteins
#first_experiment <- 0 ##Colmn number where the Experiments start
#SpaceString <- "."
ANOVA_type <- "ONE-WAY"
#ANOVA_type <- "TOW_WAY"

All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString)
conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString)
separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',])
columnsNameTime <- getNamesOfConditions(conditions, separateLine)
conditions <- conditions[1:(as.numeric(separateLine) - 1), ]
printInfo(conditions)
conditions[,1] <- gsub(" ", SpaceString, conditions[,1])
printInfo(conditions)
Column_division <- nrow(conditions)
orderCondition <- orderConditionData(conditions)
printInfo(orderCondition)
printInfo(All_proteins)
#createFolder(directory, project_name)
proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division)
printInfo(proteins_complete)
proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition)
printInfo(proteins_New_column)
proteins_for_ANOVA <- proteins_New_column[,1:Column_division]
proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)]
printInfo(proteins_for_ANOVA)
printInfo(proteins)
#write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE)
#write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE)
p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type)
printInfo(p_values)
proteins <- mergeById(proteins, p_values)
printInfo(proteins)
selectedSignificantExpr <- selectSignificanProteinsExpression(proteins)
printInfo(selectedSignificantExpr)
Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr)
printInfo(Sig_Pro_Exp_ZValues)

clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv'
printInfo(clusters)
Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime)

rects <- setBackGroundColors()

##plot the result. note: make function(s)
plots_withBackground <- list()
plots_withOutBackground <- list()
for (clusterNumber in 1:max(clusters$Cluster)){
  averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1))
  timesCond <- row.names(orderCondition)
  colnames(averageProfile) <- timesCond
  for (cond in 1:nrow(orderCondition)) {
    averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]])
  }
  
  zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile))))
  zval_DEGenes_cluster$ClusterNumber <- clusterNumber
  zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber")
  zval_DEGenes_cluster_melt$ClusterNumber <- NULL
  colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression")
  zval_DEGenes_cluster_melt$ProteinID <- "Average"
  
  for(cond in 1:nrow(orderCondition)) {
    zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2])
  }
  
  numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"]))
  
  outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".")
  outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".")
  
  p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) + 
    geom_line(col = "lightgray") + 
    geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) +
    theme_bw() +
    xlab("Time point") +
    ylab("Relative expression") +
    ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) +
    annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " "))
  ##ggsave(filename = outFilePlot1, plot = p)
  plots_withOutBackground[[clusterNumber]] <- p
}
setwd(directoryPDF$directory)
savePlotPDF(directoryPDF$file, plots_withOutBackground)