Mercurial > repos > labis-app > galaxy_proteomics
diff MClust_all.R @ 1:f3fa21cda5f5 draft default tip
planemo upload commit 13e72e84c523bda22bda792bbebf4720d28542d5-dirty
author | labis-app |
---|---|
date | Fri, 10 Aug 2018 15:45:44 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MClust_all.R Fri Aug 10 15:45:44 2018 -0400 @@ -0,0 +1,534 @@ +### +# title: "" +# author: "Diego M. Riano-Pachon, Rodrigo R. Dorado Goitia" +# date: "" +# output: pdf +### + +### +# Description +### + +### Installers ### +# install.packages("qvalue") +# install.packages("ggplot2") +# install.packages("mclust") +# install.packages("reshape2") +# install.packages('getopt') + +rm(list=ls()) + +#Load required packages +suppressMessages(library(qvalue)) +suppressMessages(library(ggplot2)) +suppressMessages(library(mclust)) +suppressMessages(library(reshape2)) +suppressMessages(library(getopt)) + +### + +# Set Functions + +# Set The directory to start the execution and read a csv file +# directoryName String Name of the directory. +# filename String Name of the csv file. +# Rodrigo Dorado +readCsvDirectory<-function(directoryName, filename, SpaceString) { + setwd(directoryName) + fileData <- read.csv(filename, + sep = "\t", + header=TRUE, + na.strings = 'NaN', + dec=SpaceString, + stringsAsFactors=FALSE) + return(fileData) +} + +# Print the dim, columns name and the first 5 rows of a variable +# dataToInform Object Name of the varaible. +# Rodrigo Dorado +printInfo <- function(dataToInform) { + print('Length') + print(length(dataToInform)) + print('---------------') + print('Dim') + print(dim(dataToInform)) + print('***************') + print('Column Names') + print(colnames(dataToInform)) + print('_______________') + print('Firsts rows') + print(head(dataToInform)) + print('_______________') +} + +## Put the id of protein as row name, get only the data of the experiments +## proteins matrix All the proteins data +## column_id int The number of the column where the is the id +## init_column int The number of the colmun of the initial experiment +## number_exp int The number of experiments +## Rodrigo Dorado +getDataofInfo <- function(proteins, column_id, init_column, number_exp) { + selected_columns <- proteins[1:nrow(proteins), c(init_column:number_exp)] + id_proteins_column <- proteins[1:nrow(proteins), column_id] + ids <- matrix(, ncol = 1, nrow = nrow(proteins)) + j <- 1 + for(i in id_proteins_column){ + id_proteins_aux <- strsplit(i, ";") + unlist(id_proteins_aux) + ids[j] <- id_proteins_aux[[1]][1] + j <- j + 1 + } + row.names(selected_columns) <- ids + return(selected_columns) +} + +## Verify if the protein contain at least one group with all the values different to NaN +## proteins matrix All the proteins data +## orderCondition int The division per experiment +## Rodrigo Dorado +VarifyAComplteGroup <- function(proteins, orderCondition) { + protein_erase <- c() + col_names <- colnames(proteins) + ## Get every protein + count <- 0 + for (protein in 1:nrow(proteins)) { + ok_protein <- FALSE + num_div <- ncol(orderCondition) + columnNumber <- 1 + ## Move by a experiment + for (result_exp in seq(1, nrow(orderCondition))) { + ok_group <- TRUE + pos_NaN <- c() + pos_valid <- c() + ## Get the name of the experiment + column_name <- row.names(orderCondition)[result_exp] + ## Move into the divisions + for (div_pos in 1:num_div) { + if (is.na(orderCondition[result_exp, div_pos])){ + break + } + pos <- columnNumber + columnNumber <- columnNumber + 1 + ## pos <- result_exp + (div_pos - 1) + ## check if is NaN + if(is.na(proteins[protein, pos])){ + ok_group <- FALSE + pos_NaN <- c(pos_NaN, pos) + } else { + pos_valid <- c(pos_valid, pos) + } + } + ##Check if in the group, if the three are valid + if (ok_group) { + ok_protein <- TRUE + } + switch_var <- length(pos_NaN) + allPos <- c(pos_valid, pos_NaN) + ## Execute conditions + + if(length(pos_valid) == 0){ + proteins[protein, pos_NaN] <- 0 + }else if(length(pos_NaN) > 0){ + proteins[protein, pos_NaN] <- getTheMedian(proteins, protein, pos_valid) + } + resultValue <- getTheMedian(proteins, protein, allPos) + ##if (switch_var == 0) { + ##resultValue <- getTheMedian(proteins, protein, allPos) + ##caso1 <- caso1 + 1 + ##} + ##if (switch_var == 1) { + ##proteins[protein, pos_NaN] <- mean(as.numeric(proteins[protein, pos_valid])) + ##resultValue <- getTheMedian(proteins, protein, allPos) + ##caso2 <- caso2 + 1 + ##} + ##if (switch_var == 2) { + ##proteins[protein, pos_NaN] <- 0 + #### resultValue <- 0 + ##resultValue <- proteins[protein, ] + ##caso3 <- caso3 + 1 + ##} + ##if (switch_var == 3) { + ##proteins[protein, pos_NaN] <- 0 + ##resultValue <- 0 + ##caso4 <- caso4 + 1 + ##} + ## Get the unique value + proteins[protein, column_name] <- resultValue + } + ## Check if the protein conatin at least one group valid + if (!ok_protein) { + protein_erase <- c(protein_erase, protein) + } + } + ## return the list + if(length(protein_erase) > 0) { + return(proteins[-protein_erase,]) + }else{ + return(proteins) + } +} + +## Get the median of a group +## proteins matrix All the proteins +## protein int The actual protein +## positions int The number of experiment +## Rodrigo Dorado +getTheMedian <- function(proteins, protein, positions) { + return(median(as.numeric(proteins[protein, positions]))) +} + +## Merge two data frames by the id +## proteins Data Frame The first data frame +## p_values Data Frame The second data frame +mergeById <- function(proteins, p_values) { + proteins$auxiliar <- row.names(proteins) + p_values$auxiliar <- row.names(p_values) + proteins <- merge(proteins, p_values, by.x = 'auxiliar', by.y = 'auxiliar') + row.names(proteins) <- proteins[, 'auxiliar'] + return(proteins[, -1]) +} + +## Get the p values of the ANOVA +## proteins Data Frame All the data of the experiments +## Rodrigo Dorado + +##p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type) + +calculateANOVA <- function(proteins, orderCondition, type = "ONE-WAY") { #"ONE-WAY", "TOW_WAY" + all_p_values <- c() + for (i in 1:nrow(proteins)) { + groupNames <- c() + for (j in colnames(proteins)) { + groupNames <- c(groupNames, row.names(which(orderCondition == j, arr.ind = TRUE))) + } + MToANOVA <- data.frame(groupNames, as.numeric(t(proteins[i, ]))) + colnames(MToANOVA) <- c('GROUP', 'DATA') + aov1 <- aov(DATA ~ GROUP, data = MToANOVA) + all_p_values <- c(all_p_values, summary(aov1)[[1]][["Pr(>F)"]][1]) + } + p_values <- data.frame(all_p_values) + row.names(p_values) <- row.names(proteins) + return(p_values) +} + +## Select the data that has less than 0.05 the p_value; Replace NA by 0 +## selected_columns_proteins Matrix the selected columns of the data +## Rodrigo Dorado +selectSignificanProteinsExpression <- function(selected_columns_proteins) { + protein_Qvalues <- qvalue(selected_columns_proteins$all_p_values, lambda=0) + selected_columns_proteins$QValue <- protein_Qvalues$qvalues + selectedSignificantExpr <- selected_columns_proteins[which(selected_columns_proteins$QValue < 0.05), c(1:10)] + all_row_names <- row.names(selectedSignificantExpr) + selectedSignificantExpr <- as.data.frame(sapply(selectedSignificantExpr, as.numeric)) + row.names(selectedSignificantExpr) <- all_row_names + ##selectedSignificantExpr[is.na(selectedSignificantExpr)] <- 0 + return(selectedSignificantExpr) +} + +## Get Z Values of the proteins Expression +## selectedSignificantExpr Matrix The values of the selected proteins +## Rodrigo Dorado +getZValueSgnificantProteinExpression <- function(selectedSignificantExpr) { + return(as.data.frame(t(scale(t(selectedSignificantExpr))))) +} + +## Execute cluster function, save the result in a csv file. Note: add the condition of max clusters +## ZValues Matrix The z values to cluster +## numOfClusters int The max number of clusters +## filename String The name of the csv fil +## Rodrigo Dorado +executeMClust <- function(ZValues, directoryCSV, filename, numOfClusters = 60) { + print("Clustering") + print(numOfClusters) + clusterResult <- Mclust(ZValues, G = 2:numOfClusters) + print(summary(clusterResult)) + Num_clusters <- max(clusterResult$classification, na.rm=TRUE) + if (Num_clusters >= numOfClusters) { + newNumOfClusters <- numOfClusters + 30 + executeMClust(ZValues, filename, newNumOfClusters) + } else { + clusters <- data.frame(ProteinID = names(clusterResult$classification), Cluster = clusterResult$classification, row.names = NULL) + table(clusters$Cluster) + setwd(directoryCSV) + #write.csv(clusters, filename, row.names = FALSE, quote = FALSE) + write.table(clusters, file = filename, row.names = FALSE, quote = FALSE, sep = "\t") + return(clusters) + } +} + +## Prepare data structure +## ZValues Matrix the Z Values of the proteins selected +## clusters Matrix The clusters of the proteins +## Rodrigo Dorado +getZvaluesMelt <- function(ZValues, clusters, orderCondition, columnsNameTime) { + timesCond <- row.names(orderCondition) + ZValues_melt <- ZValues + ZValues_melt$ProteinID <- rownames(ZValues_melt) + ZValues_melt <- merge(ZValues_melt, clusters, by.x = 'ProteinID', by.y = 'ProteinID') + ZValues_melt <- melt(ZValues_melt, id.vars = c('ProteinID','Cluster')) + colnames(ZValues_melt) <- c('ProteinID', 'Cluster', 'Time', 'RelativeExpression') + ZValues_melt$Cluster <- as.factor(ZValues_melt$Cluster) + for(cond in 1:nrow(orderCondition)) { + ZValues_melt$Time2[ZValues_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2]) + } + return(ZValues_melt) +} + +## Test the cluster function with diferent sizes +## zvalues data Frame The Z values of the data to get clustered +## number_times array The sizes of the test +## Rodrigo Dorado +testCluster <- function (zvalues, number_times) { + for (i in number_times) { + print("***********") + print(i) + cluster_result <- Mclust(zvalues, G = 2:i) + print(summary(cluster_result)) + print("-----------") + } +} + + +## Set the color of the background of the image +## Rodrigo dorado +setBackGroundColors <- function() { + rects <- as.data.frame(matrix(ncol = 3,nrow=2)) + colnames(rects) <- c('xstart','xend','col') + rects[1,] <- c(-49,0,'gray0') + rects[2,] <- c(0,73,'white') + rects$xend <- as.numeric(rects$xend) + rects$xstart <- as.numeric(rects$xstart) + return(rects) +} + +## Create new folder. +## mainDir String The directory of the new folder. +## folder String The name of the new folder. +createFolder <- function(mainDir, folder) { + if (file.exists(folder)){ + setwd(file.path(mainDir, folder)) + } else { + dir.create(file.path(mainDir, folder)) + setwd(file.path(mainDir, folder)) + } +} + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + library(grid) + + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} + +## Save all the plots in a pdf file +## name String The name of the file +## data Array All the plots +## Rodrigo Dorado +savePlotPDF <- function(name, data) { + pdf(name) + invisible(lapply(data, print)) + dev.off() +} + +## Order the conditions per group +## conditions Data Frame All the conditions +## Rodrigo Dorado +orderConditionData <- function(conditions) { + experimentNames <- unique(conditions[,2]) + maxSizeId <- tail(names(sort(table(conditions[,2]))), 1) + maxSize <- nrow(conditions[conditions[,2] == maxSizeId,]) + result <- matrix(nrow = length(experimentNames), ncol = maxSize) + row.names(result) <- experimentNames + colnames(result) <- c(1:maxSize) + for(i in 1:length(experimentNames)) { + for (j in 1:maxSize) { + result[experimentNames[i],j] <- conditions[conditions[,2] == experimentNames[i],][,1][j] + } + } + return(result) +} + +## Get the times or names of the conditions. +## conditions data frame All the text of the control file. +## separaLine Int The line where the flag was founded. +## Rodrigo Dorado +getNamesOfConditions <- function(conditions, separateLine) { + columnsNameTime <- conditions[(as.numeric(separateLine) + 1):nrow(conditions), ] + colnames(columnsNameTime) <- columnsNameTime[1,] + columnsNameTime <- columnsNameTime[-1,] + row.names(columnsNameTime) <- columnsNameTime[,1] + ##columnsNameTime <- columnsNameTime[,-1] + return(columnsNameTime) +} + +## Get the directory of a file +## file String File name and directory +## Rodrigo Dorado +getDirectoryOfFile <- function(file) { + dir <- file + allDir <- strsplit(dir, "/")[[1]] + allEnters<-length(allDir) - 1 + directory<-allDir[1] + for(i in 2:allEnters) { + directory<-paste(directory, allDir[i], sep = "/") + } + return(list("directory" = directory, "file" = allDir[length(allDir)]) ) +} + +spec = matrix(c( + 'filename', 'f', 1, "character", + 'controlFile', 'c', 1, "character", + #'projectName', 'p', 1, "character", + 'SpaceString', 's', 1, "character", + 'idColumn', 'i', 1, "integer", + 'firstExperiment', 'e', 1, "integer", + 'csvFile', 'v', 1, "character", + 'pdfFile', 'd', 1, "character" +), byrow=TRUE, ncol=4) + +opt = getopt(spec) + +filename <- opt$filename +controlFileName <- opt$controlFile +#project_name <- opt$projectName +SpaceString <- opt$SpaceString +id_column <- opt$idColumn +first_experiment <- opt$firstExperiment +csvFile <- opt$csvFile +pdfFile <- opt$pdfFile + + +directoryF <- getDirectoryOfFile(filename) +directoryC <- getDirectoryOfFile(controlFileName) +directoryCSV <- getDirectoryOfFile(csvFile) +directoryPDF <- getDirectoryOfFile(pdfFile) + + +#directory <- "/home/rodrigodorado/MClustDirectory/" ##Directory of the file +#filename <- "EXP11_paraR_log.txt" ##File name +#project_name <- "june_20_2018" ##Project name +#controlFileName <- "EXP11_paraR_log_properties" ##Control File +#id_column <- 31 ##Column number of the id of the proteins +#first_experiment <- 0 ##Colmn number where the Experiments start +#SpaceString <- "." +ANOVA_type <- "ONE-WAY" +#ANOVA_type <- "TOW_WAY" + +All_proteins <- readCsvDirectory(directoryF$directory, directoryF$file, SpaceString) +conditions <- readCsvDirectory(directoryC$directory, directoryC$file, SpaceString) +separateLine <- row.names(conditions[conditions[,1] == '-- SEPARATE --',]) +columnsNameTime <- getNamesOfConditions(conditions, separateLine) +conditions <- conditions[1:(as.numeric(separateLine) - 1), ] +printInfo(conditions) +conditions[,1] <- gsub(" ", SpaceString, conditions[,1]) +printInfo(conditions) +Column_division <- nrow(conditions) +orderCondition <- orderConditionData(conditions) +printInfo(orderCondition) +printInfo(All_proteins) +#createFolder(directory, project_name) +proteins_complete <- getDataofInfo(All_proteins, id_column, first_experiment, Column_division) +printInfo(proteins_complete) +proteins_New_column <- VarifyAComplteGroup(proteins_complete, orderCondition) +printInfo(proteins_New_column) +proteins_for_ANOVA <- proteins_New_column[,1:Column_division] +proteins <- proteins_New_column[,(Column_division + 1):ncol(proteins_New_column)] +printInfo(proteins_for_ANOVA) +printInfo(proteins) +#write.csv(proteins_for_ANOVA, "Proteins For Anova.csv",row.names = TRUE, quote = FALSE) +#write.csv(proteins, "Proteins With Conditions.csv",row.names = TRUE, quote = FALSE) +p_values <- calculateANOVA(proteins_for_ANOVA, orderCondition, ANOVA_type) +printInfo(p_values) +proteins <- mergeById(proteins, p_values) +printInfo(proteins) +selectedSignificantExpr <- selectSignificanProteinsExpression(proteins) +printInfo(selectedSignificantExpr) +Sig_Pro_Exp_ZValues <- getZValueSgnificantProteinExpression(selectedSignificantExpr) +printInfo(Sig_Pro_Exp_ZValues) + +clusters <- executeMClust(Sig_Pro_Exp_ZValues, directoryCSV$directory, directoryCSV$file)#'clusters.csv' +printInfo(clusters) +Sig_Pro_Exp_ZValues_melt <- getZvaluesMelt(Sig_Pro_Exp_ZValues, clusters, orderCondition, columnsNameTime) + +rects <- setBackGroundColors() + +##plot the result. note: make function(s) +plots_withBackground <- list() +plots_withOutBackground <- list() +for (clusterNumber in 1:max(clusters$Cluster)){ + averageProfile <- as.data.frame(matrix(ncol = nrow(orderCondition),nrow=1)) + timesCond <- row.names(orderCondition) + colnames(averageProfile) <- timesCond + for (cond in 1:nrow(orderCondition)) { + averageProfile[timesCond[cond]] <- mean(selectedSignificantExpr[row.names(selectedSignificantExpr) %in% clusters[which(clusters$Cluster == clusterNumber),"ProteinID"], timesCond[cond]]) + } + + zval_DEGenes_cluster <- as.data.frame(t(scale(t(averageProfile)))) + zval_DEGenes_cluster$ClusterNumber <- clusterNumber + zval_DEGenes_cluster_melt <- melt(zval_DEGenes_cluster, id.vars="ClusterNumber") + zval_DEGenes_cluster_melt$ClusterNumber <- NULL + colnames(zval_DEGenes_cluster_melt) <- c("Time","RelativeExpression") + zval_DEGenes_cluster_melt$ProteinID <- "Average" + + for(cond in 1:nrow(orderCondition)) { + zval_DEGenes_cluster_melt$Time2[zval_DEGenes_cluster_melt$Time == timesCond[cond]] <- as.numeric(columnsNameTime[timesCond[cond], 2]) + } + + numberGenes = length(unique(clusters[which(clusters$Cluster == clusterNumber),"ProteinID"])) + + outFilePlot1 = paste("cluster1",clusterNumber,"graphWithoutBackground","pdf",sep=".") + outFilePlot2 = paste("cluster1",clusterNumber,"graphWithBackground","pdf",sep=".") + + p <- ggplot(Sig_Pro_Exp_ZValues_melt[ which(Sig_Pro_Exp_ZValues_melt$Cluster == clusterNumber),], aes(x = Time2, y = RelativeExpression, group = ProteinID)) + + geom_line(col = "lightgray") + + geom_line(data = zval_DEGenes_cluster_melt, aes(x = Time2, y = RelativeExpression)) + + theme_bw() + + xlab("Time point") + + ylab("Relative expression") + + ggtitle(paste("Cluster 1", clusterNumber, sep = ".")) + + annotate("text", x = 3, y = 1, label = paste(numberGenes,"proteins", sep = " ")) + ##ggsave(filename = outFilePlot1, plot = p) + plots_withOutBackground[[clusterNumber]] <- p +} +setwd(directoryPDF$directory) +savePlotPDF(directoryPDF$file, plots_withOutBackground) \ No newline at end of file