Mercurial > repos > labis-app > galaxy_proteomics
view MClust_all.R @ 1:f3fa21cda5f5 draft default tip
planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty
author | labis-app |
---|---|
date | Fri, 10 Aug 2018 15:45:44 -0400 |
parents | |
children |
line wrap: on
line source
### # title: "" # author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia" # date: "" # output: pdf ### ### # Description ### ### Installers ### # install.packages("qvalue") # install.packages("ggplot2") # install.packages("mclust") # install.packages("reshape2") # install.packages('getopt') rm(list=ls()) #Load required packages suppressMessages(library(qvalue)) suppressMessages(library(ggplot2)) suppressMessages(library(mclust)) suppressMessages(library(reshape2)) suppressMessages(library(getopt)) ### # Set Functions # Set The directory to start the execution and read a csv file # directoryName String Name of the directory. # filename String Name of the csv file. # Rodrigo Dorado readCsvDirectory<-function(directoryName, filename, SpaceString) { setwd(directoryName) fileData <- read.csv(filename, sep = "\t", header=TRUE, na.strings = 'NaN', dec=SpaceString, stringsAsFactors=FALSE) return(fileData) } # Print the dim, columns name and the first 5 rows of a variable # dataToInform Object Name of the varaible. # Rodrigo Dorado printInfo <- function(dataToInform) { print('Length') print(length(dataToInform)) print('---------------') print('Dim') print(dim(dataToInform)) print('***************') print('Column Names') print(colnames(dataToInform)) print('_______________') print('Firsts rows') print(head(dataToInform)) print('_______________') } ## Put the id of protein as row name, get only the data of the experiments ## proteins matrix All the proteins data ## column_id int The number of the column where the is the id ## init_column int The number of the colmun of the initial experiment ## number_exp int The number of experiments ## Rodrigo Dorado getDataofInfo <- function(proteins, column_id, init_column, number_exp) { selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)] id_proteins_column <- proteins[1:nrow(proteins), column_id] ids <- matrix(, ncol = 1, nrow = nrow(proteins)) j <- 1 for(i in id_proteins_column){ id_proteins_aux <- strsplit(i, ";") unlist(id_proteins_aux) ids[j] <- id_proteins_aux[[1]][1] j <- j + 1 } row.names(selected_columns) <- ids return(selected_columns) } ## Verify if the protein contain at least one group with all the values different to NaN ## proteins matrix All the proteins data ## orderCondition int The division per experiment ## Rodrigo Dorado VarifyAComplteGroup <- function(proteins, orderCondition) { protein_erase <- c() col_names <- colnames(proteins) ## Get every protein count <- 0 for (protein in 1:nrow(proteins)) { ok_protein <- FALSE num_div <- ncol(orderCondition) columnNumber <- 1 ## Move by a experiment for (result_exp in seq(1, nrow(orderCondition))) { ok_group <- TRUE pos_NaN <- c() pos_valid <- c() ## Get the name of the experiment column_name <- row.names(orderCondition)[result_exp] ## Move into the divisions for (div_pos in 1:num_div) { if (is.na(orderCondition[result_exp, div_pos])){ break } pos <- columnNumber columnNumber <- columnNumber + 1 ## pos <- result_exp + (div_pos - 1) ## check if is NaN if(is.na(proteins[protein, pos])){ ok_group <- FALSE pos_NaN <- c(pos_NaN, pos) } else { pos_valid <- c(pos_valid, pos) } } ##Check if in the group, if the three are valid if (ok_group) { ok_protein <- TRUE } switch_var <- length(pos_NaN) allPos <- c(pos_valid, pos_NaN) ## Execute conditions if(length(pos_valid) == 0){ proteins[protein, pos_NaN] <- 0 }else if(length(pos_NaN) > 0){ proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid) } resultValue <- getTheMedian(proteins, protein, allPos) ##if (switch_var == 0) { ##resultValue <- getTheMedian(proteins, protein, allPos) ##caso1 <- caso1 + 1 ##} ##if (switch_var == 1) { ##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid])) ##resultValue <- getTheMedian(proteins, protein, allPos) ##caso2 <- caso2 + 1 ##} ##if (switch_var == 2) { ##proteins[protein, pos_NaN] <- 0 #### resultValue <- 0 ##resultValue <- proteins[protein, ] ##caso3 <- caso3 + 1 ##} ##if (switch_var == 3) { ##proteins[protein, pos_NaN] <- 0 ##resultValue <- 0 ##caso4 <- caso4 + 1 ##} ## Get the unique value proteins[protein, column_name] <- resultValue } ## Check if the protein conatin at least one group valid if (!ok_protein) { protein_erase <- c(protein_erase, protein) } } ## return the list if(length(protein_erase) > 0) { return(proteins[-protein_erase,]) }else{ return(proteins) } } ## Get the median of a group ## proteins matrix All the proteins ## protein int The actual protein ## positions int The number of experiment ## Rodrigo Dorado getTheMedian <- function(proteins, protein, positions) { return(median(as.numeric(proteins[protein, positions]))) } ## Merge two data frames by the id ## proteins Data Frame The first data frame ## p_values Data Frame The second data frame mergeById <- function(proteins, p_values) { proteins$auxiliar <- row.names(proteins) p_values$auxiliar <- row.names(p_values) proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar') row.names(proteins) <- proteins[, 'auxiliar'] return(proteins[, -1]) } ## Get the p values of the ANOVA ## proteins Data Frame All the data of the experiments ## Rodrigo Dorado ##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type) calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY" all_p_values <- c() for (i in 1:nrow(proteins)) { groupNames <- c() for (j in colnames(proteins)) { groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE))) } MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ]))) colnames(MToANOVA) <- c('GROUP', 'DATA') aov1 <- aov(DATA ~ GROUP, data = MToANOVA) all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1]) } p_values <- data.frame(all_p_values) row.names(p_values) <- row.names(proteins) return(p_values) } ## Select the data that has less than 0.05 the p_value; Replace NA by 0 ## selected_columns_proteins Matrix the selected columns of the data ## Rodrigo Dorado selectSignificanProteinsExpression <- function(selected_columns_proteins) { protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0) selected_columns_proteins$QValue <- protein_Qvalues$qvalues selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)] all_row_names <- row.names(selectedSignificantExpr) selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric)) row.names(selectedSignificantExpr) <- all_row_names ##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0 return(selectedSignificantExpr) } ## Get Z Values of the proteins Expression ## selectedSignificantExpr Matrix The values of the selected proteins ## Rodrigo Dorado getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) { return(as.data.frame(t(scale(t(selectedSignificantExpr))))) } ## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters ## ZValues Matrix The z values to cluster ## numOfClusters int The max number of clusters ## filename String The name of the csv fil ## Rodrigo Dorado executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) { print("Clustering") print(numOfClusters) clusterResult <- Mclust(ZValues, G = 2:numOfClusters) print(summary(clusterResult)) Num_clusters <- max(clusterResult$classification, na.rm=TRUE) if (Num_clusters >= numOfClusters) { newNumOfClusters <- numOfClusters + 30 executeMClust(ZValues, filename, newNumOfClusters) } else { clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL) table(clusters$Cluster) setwd(directoryCSV) #write.csv(clusters, filename, row.names = FALSE, quote = FALSE) write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t") return(clusters) } } ## Prepare data structure ## ZValues Matrix the Z Values of the proteins selected ## clusters Matrix The clusters of the proteins ## Rodrigo Dorado getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) { timesCond <- row.names(orderCondition) ZValues_melt <- ZValues ZValues_melt$ProteinID <- rownames(ZValues_melt) ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID') ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster')) colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression') ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster) for(cond in 1:nrow(orderCondition)) { ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2]) } return(ZValues_melt) } ## Test the cluster function with diferent sizes ## zvalues data Frame The Z values of the data to get clustered ## number_times array The sizes of the test ## Rodrigo Dorado testCluster <- function (zvalues, number_times) { for (i in number_times) { print("***********") print(i) cluster_result <- Mclust(zvalues, G = 2:i) print(summary(cluster_result)) print("-----------") } } ## Set the color of the background of the image ## Rodrigo dorado setBackGroundColors <- function() { rects <- as.data.frame(matrix(ncol = 3,nrow=2)) colnames(rects) <- c('xstart','xend','col') rects[1,] <- c(-49,0,'gray0') rects[2,] <- c(0,73,'white') rects$xend <- as.numeric(rects$xend) rects$xstart <- as.numeric(rects$xstart) return(rects) } ## Create new folder. ## mainDir String The directory of the new folder. ## folder String The name of the new folder. createFolder <- function(mainDir, folder) { if (file.exists(folder)){ setwd(file.path(mainDir, folder)) } else { dir.create(file.path(mainDir, folder)) setwd(file.path(mainDir, folder)) } } # Multiple plot function # # ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) # - cols: Number of columns in layout # - layout: A matrix specifying the layout. If present, 'cols' is ignored. # # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), # then plot 1 will go in the upper left, 2 will go in the upper right, and # 3 will go all the way across the bottom. # http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { library(grid) # Make a list from the ... arguments and plotlist plots <- c(list(...), plotlist) numPlots = length(plots) # If layout is NULL, then use 'cols' to determine layout if (is.null(layout)) { # Make the panel # ncol: Number of columns of plots # nrow: Number of rows needed, calculated from # of cols layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), ncol = cols, nrow = ceiling(numPlots/cols)) } if (numPlots==1) { print(plots[[1]]) } else { # Set up the page grid.newpage() pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) # Make each plot, in the correct location for (i in 1:numPlots) { # Get the i,j matrix positions of the regions that contain this subplot matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) } } } ## Save all the plots in a pdf file ## name String The name of the file ## data Array All the plots ## Rodrigo Dorado savePlotPDF <- function(name, data) { pdf(name) invisible(lapply(data, print)) dev.off() } ## Order the conditions per group ## conditions Data Frame All the conditions ## Rodrigo Dorado orderConditionData <- function(conditions) { experimentNames <- unique(conditions[,2]) maxSizeId <- tail(names(sort(table(conditions[,2]))), 1) maxSize <- nrow(conditions[conditions[,2] == maxSizeId,]) result <- matrix(nrow = length(experimentNames), ncol = maxSize) row.names(result) <- experimentNames colnames(result) <- c(1:maxSize) for(i in 1:length(experimentNames)) { for (j in 1:maxSize) { result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j] } } return(result) } ## Get the times or names of the conditions. ## conditions data frame All the text of the control file. ## separaLine Int The line where the flag was founded. ## Rodrigo Dorado getNamesOfConditions <- function(conditions, separateLine) { columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ] colnames(columnsNameTime) <- columnsNameTime[1,] columnsNameTime <- columnsNameTime[-1,] row.names(columnsNameTime) <- columnsNameTime[,1] ##columnsNameTime <- columnsNameTime[,-1] return(columnsNameTime) } ## Get the directory of a file ## file String File name and directory ## Rodrigo Dorado getDirectoryOfFile <- function(file) { dir <- file allDir <- strsplit(dir, "/")[[1]] allEnters<-length(allDir) - 1 directory<-allDir[1] for(i in 2:allEnters) { directory<-paste(directory, allDir[i], sep = "/") } return(list("directory" = directory, "file" = allDir[length(allDir)]) ) } spec = matrix(c( 'filename', 'f', 1, "character", 'controlFile', 'c', 1, "character", #'projectName', 'p', 1, "character", 'SpaceString', 's', 1, "character", 'idColumn', 'i', 1, "integer", 'firstExperiment', 'e', 1, "integer", 'csvFile', 'v', 1, "character", 'pdfFile', 'd', 1, "character" ), byrow=TRUE, ncol=4) opt = getopt(spec) filename <- opt$filename controlFileName <- opt$controlFile #project_name <- opt$projectName SpaceString <- opt$SpaceString id_column <- opt$idColumn first_experiment <- opt$firstExperiment csvFile <- opt$csvFile pdfFile <- opt$pdfFile directoryF <- getDirectoryOfFile(filename) directoryC <- getDirectoryOfFile(controlFileName) directoryCSV <- getDirectoryOfFile(csvFile) directoryPDF <- getDirectoryOfFile(pdfFile) #directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file #filename <- "EXP11_paraR_log.txt" ##File name #project_name <- "june_20_2018" ##Project name #controlFileName <- "EXP11_paraR_log_properties" ##Control File #id_column <- 31 ##Column number of the id of the proteins #first_experiment <- 0 ##Colmn number where the Experiments start #SpaceString <- "." ANOVA_type <- "ONE-WAY" #ANOVA_type <- "TOW_WAY" All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString) conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString) separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',]) columnsNameTime <- getNamesOfConditions(conditions, separateLine) conditions <- conditions[1:(as.numeric(separateLine) - 1), ] printInfo(conditions) conditions[,1] <- gsub(" ", SpaceString, conditions[,1]) printInfo(conditions) Column_division <- nrow(conditions) orderCondition <- orderConditionData(conditions) printInfo(orderCondition) printInfo(All_proteins) #createFolder(directory, project_name) proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division) printInfo(proteins_complete) proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition) printInfo(proteins_New_column) proteins_for_ANOVA <- proteins_New_column[,1:Column_division] proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)] printInfo(proteins_for_ANOVA) printInfo(proteins) #write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE) #write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE) p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type) printInfo(p_values) proteins <- mergeById(proteins, p_values) printInfo(proteins) selectedSignificantExpr <- selectSignificanProteinsExpression(proteins) printInfo(selectedSignificantExpr) Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr) printInfo(Sig_Pro_Exp_ZValues) clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv' printInfo(clusters) Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime) rects <- setBackGroundColors() ##plot the result. note: make function(s) plots_withBackground <- list() plots_withOutBackground <- list() for (clusterNumber in 1:max(clusters$Cluster)){ averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1)) timesCond <- row.names(orderCondition) colnames(averageProfile) <- timesCond for (cond in 1:nrow(orderCondition)) { averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]]) } zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile)))) zval_DEGenes_cluster$ClusterNumber <- clusterNumber zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber") zval_DEGenes_cluster_melt$ClusterNumber <- NULL colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression") zval_DEGenes_cluster_melt$ProteinID <- "Average" for(cond in 1:nrow(orderCondition)) { zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2]) } numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"])) outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".") outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".") p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) + geom_line(col = "lightgray") + geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) + theme_bw() + xlab("Time point") + ylab("Relative expression") + ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) + annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " ")) ##ggsave(filename = outFilePlot1, plot = p) plots_withOutBackground[[clusterNumber]] <- p } setwd(directoryPDF$directory) savePlotPDF(directoryPDF$file, plots_withOutBackground)