Mercurial > repos > nicolas > oghma
changeset 94:f90c741e2a32 draft
Uploaded
author | nicolas |
---|---|
date | Mon, 31 Oct 2016 05:50:25 -0400 |
parents | 3cdfa1e4b1c1 |
children | 5a0b751594e6 |
files | evaluate_aggregation.R |
diffstat | 1 files changed, 451 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/evaluate_aggregation.R Mon Oct 31 05:50:25 2016 -0400 @@ -0,0 +1,451 @@ +######################################################## +# +# creation date : 10/10/16 +# last modification : 29/10/16 +# author : Dr Nicolas Beaume +# +######################################################## + +suppressWarnings(suppressMessages(library(GA))) +library("miscTools") +library(rpart) +suppressWarnings(suppressMessages(library(randomForest))) +library(e1071) +suppressWarnings(suppressMessages(library(glmnet))) +library(rrBLUP) +options(warn=-1) +############################ helper functions ####################### + +##### classifiers +prediction <- function(genotype, model, classifier="unknown") { + # run prediction according to the classifier + switch(classifier, + rrBLUP={ + predictions <- as.matrix(genotype) %*% as.matrix(model$u); + predictions <- predictions[,1]+model$beta; + }, + rf={ + predictions <- predict(model, genotype); + }, + svm={ + predictions <- predict(model, genotype); + }, + lasso={ + predictions <- predict(model, as.matrix(genotype), type = "response"); + }, + {warning("unkonwn classifier, please choose among the following : rrBLUP, rf, svm, lasso")}) + return(predictions) +} + +# extract parameter from a model, excluding rrBLUP which auto-optimize +extractParameter <- function(model, classifierName) { + param <- NULL + switch(classifierName, + # random forest + rf={ + param <- model$ntree + param <- c(param, list(model$mtry)) + names(param) <- c("ntree", "mtry") + }, + # svm + svm={ + param <- as.numeric(model$cost) + param <- c(param, list(model$gamma)) + param <- c(param, list(model$coef0)) + param <- c(param, list(model$degree)) + param <- c(param, list(model$kernel)) + names(param) <- c("c", "g", "coef", "d", "kernel") + switch((model$kernel+1), + param$kernel <- "linear", + param$kernel <- "polynomial", + param$kernel <- "radial", + param$kernel <- "sigmoid" + ) + }, + # lasso + lasso={ + param <- as.list(model$lambda) + names(param) <- "lambda" + }, + {print(paste("unknown classifier, please choose among rf, svm, lasso")); + stop()} + ) + return(param) +} + +##### Genetic algorithm + +# compute r2 by computing the classic formula +# compare the sum of square difference from target to prediciton +# to the sum of square difference from target to the mean of the target +r2 <- function(target, prediction) { + sst <- sum((target-mean(target))^2) + ssr <- sum((target-prediction)^2) + return(1-ssr/sst) +} + +optimizeOneIndividual <- function(values, trueValue) { + # change the value into a function + f <- function(w) {sum(values * w/sum(w))} + fitness <- function(x) {1/abs(trueValue-f(x))} + resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), + maxiter = 1000, monitor = NULL, keepBest = T) + resp@solution <- resp@solution/sum(resp@solution) + return(resp) +} + +optimizeWeight <- function(values, trueValue, n=1000) { + fitnessAll <- function(w) { + predicted <- apply(values, 1, weightedPrediction.vec, w) + return(mean(r2(trueValue, predicted))) + #return(mean(1/abs(trueValue-predicted))) + } + resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), + maxiter = n, monitor = NULL, keepBest = T) + resp@solution <- resp@solution/sum(resp@solution) + return(resp) +} + +weightedPrediction <- function(classifiers, w) { + if(length(w) > ncol(classifiers)) { + warning("more weights than classifiers, extra weigths are ignored") + w <- w[1:ncol(classifiers)] + } else if(length(w) < ncol(classifiers)) { + warning("less weights than classifiers, extra classifiers are ignored") + classifiers <- classifiers[,1:length(w)] + } + prediction <- NULL + prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) + return(prediction) +} + +weightedPrediction.vec <- function(values, w) { + return(sum(values * w/sum(w))) +} + +##### meta-decision tree + +tuneTree <- function(data, target) { + data <- data.frame(data, target=target) + size <- nrow(data) + xerror <- NULL + split <- 1:ceiling(size/5) + leafSize <- 1:ceiling(size/10) + xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) + cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) + for(i in 1:length(split)) { + for(j in 1:length(leafSize)) { + op <- list(minsplit=split[i], minbucket=leafSize[j]) + tree <- rpart(target ~., data=data, control=op, method="anova") + xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] + cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] + } + } + index <- which(xerror==min(xerror), arr.ind = T) + op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) + return(op) +} + +###### meta-LASSO +# create fold by picking at random row indexes +createFolds <- function(nbObs, n) { + # pick indexes + index <- sample(1:n, size=nbObs, replace = T) + # populate folds + folds <- NULL + for(i in 1:n) { + folds <- c(folds, list(which(index==i))) + } + return(folds) +} + +searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { + folds <- createFolds(nrow(genotype), n = n) + acc <- NULL + indexAlpha <- 1 + for(a in alpha) { + curAcc <- NULL + for(i in 1:n) { + train <- genotype[-folds[[i]],] + test <- genotype[folds[[i]],] + phenoTrain <- phenotype[-folds[[i]]] + phenoTest <- phenotype[folds[[i]]] + cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) + model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) + pred <- predict(model, test, type = "response") + curAcc <- c(curAcc, r2(phenoTest, pred)) + } + acc <- c(acc, mean(curAcc)) + } + names(acc) <- alpha + return(as.numeric(names(acc)[which.max(acc)])) +} + +###### meta-random forest + +searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { + n <- ceiling(nrow(genotype)/3) + indexTest <- sample(1:nrow(genotype), size=n) + train <- genotype[-indexTest,] + test <- genotype[indexTest,] + phenoTrain <- phenotype[-indexTest] + phenoTest <- phenotype[indexTest] + acc <- NULL + indexNtree <- 1 + for(ntree in rangeNtree) { + model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) + pred <- predict(model, test) + acc <- c(acc, r2(phenoTest, pred)) + } + names(acc) <- rangeNtree + best <- which.max(acc) + return(as.numeric(names(acc)[best])) +} + +###### meta-SVM +searchParamSVM <- function(train, target, kernel="radial") { + # tuning parameters then train + model <- NULL + switch(kernel, + sigmoid={ + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); + g <- tune$best.parameters[[1]]; + c <- tune$best.parameters[[2]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, + linear={ + tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); + c <- tune$best.parameters[[1]]; + model <- svm(x=train, y=target, cost = c, kernel = "linear")}, + polynomial={ + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); + d <- tune$best.parameters[[1]]; + g <- tune$best.parameters[[2]]; + coef <- tune$best.parameters[[3]]; + c <- tune$best.parameters[[4]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, + { + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); + g <- tune$best.parameters[[1]]; + c <- tune$best.parameters[[2]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} + ) + return(model) +} + +#################### upper level functions ##################### + +aggregateDT <- function(train, test, target, folds) { + r2Aggreg <- NULL + for (i in 1:length(folds)) { + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + treeParam <- tuneTree(train[[i]], target[trainIndex]) + data <- data.frame(train[[i]], target=target[trainIndex]) + model <- rpart(target ~., data=data, method = "anova", control = treeParam) + model <- prune(model, cp=treeParam["cp"]) + pred <- predict(model, data.frame(test[[i]])) + r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) + } + return(r2Aggreg) +} + +aggregateGeneticMean <- function(train, test, target, folds) { + r2Aggreg <- NULL + for (i in 1:length(folds)) { + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex]) + pred <- weightedPrediction(test[[i]], opt@solution) + r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) + } + return(r2Aggreg) +} + +aggregateLASSO <- function(train, test, target, folds) { + r2Aggreg <- NULL + for (i in 1:length(folds)) { + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + alpha <- searchParamLASSO(train[[i]], target[trainIndex]) + cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha) + model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se) + pred <- predict(model, test[[i]]) + r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) + } + return(r2Aggreg) +} + +aggregateRF <- function(train, test, target, folds) { + r2Aggreg <- NULL + for (i in 1:length(folds)) { + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex], + rangeNtree = seq(100, 1000, 100)) + model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex], + ntree = ntree, mtry = ncol(train[[i]])) + pred <- predict(model, test[[i]]) + r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) + } + return(r2Aggreg) +} + +aggregateSVM <- function(train, test, target, folds, kernel="linear") { + r2Aggreg <- NULL + for (i in 1:length(folds)) { + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel) + pred <- predict(model, test[[i]]) + r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) + } + return(r2Aggreg) +} + +################################### main ############################# +# # load argument +cmd <- commandArgs(T) +source(cmd[1]) +# load folds +con = file(folds) +folds <- readLines(con = con, n = 1, ok=T) +close(con) +folds <- readRDS(folds) +# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve +phenotype <- read.table(phenotype, sep="\t", h=T)[,1] +# load genotype +con = file(genotype) +genotype <- readLines(con = con, n = 1, ok=T) +close(con) +genotype <- read.table(genotype, sep="\t", h=T) +# find which classifiers will be used for aggregation +classifNames <- NULL +if(lassoModel !="None"){ + classifNames <- c(classifNames, "lasso") + con = file(lassoModel) + lassoModel <- readLines(con = con, n = 1, ok=T) + close(con) + lassoModel <- readRDS(lassoModel) +} +if(rrBLUPModel !="None"){ + classifNames <- c(classifNames, "rrBLUP") + con = file(rrBLUPModel) + rrBLUPModel <- readLines(con = con, n = 1, ok=T) + close(con) + rrBLUPModel <- readRDS(rrBLUPModel) +} +if(rfModel !="None"){ + classifNames <- c(classifNames, "rf") + con = file(rfModel) + rfModel <- readLines(con = con, n = 1, ok=T) + close(con) + rfModel <- readRDS(rfModel) +} +if(svmModel !="None"){ + classifNames <- c(classifNames, "svm") + con = file(svmModel) + svmModel <- readLines(con = con, n = 1, ok=T) + close(con) + svmModel <- readRDS(svmModel) +} + +# compute prediction of the training set and test set for each fold and each classifiers +# train predictions and test prediction are stored in separate lists +# where each element of the list represent a folds +predictionTrain.list <- NULL +predictionTest.list <- NULL +r2Classif.list <- NULL +for(i in 1:length(folds)) { + # for the current fold, create training set and test set + trainIndex <- unlist(folds[-i]) + testIndex <- folds[[i]] + train <- genotype[trainIndex,] + phenoTrain <- phenotype[trainIndex] + test <- genotype[testIndex,] + phenoTest <- phenotype[testIndex] + # only to intialize data frame containing predictions + predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)), + ncol=length(classifNames)) + colnames(predictionTrain) <- classifNames + predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)), + ncol=length(classifNames)) + colnames(predictionTest) <- classifNames + r2Classif <- NULL + # for each classifiers, compute prediction on both sets + # and evaluate r2 to find the best classifier + for(j in 1:length(classifNames)) { + switch(classifNames[j], + # random forest + rf={ + # predict train and test + param <- extractParameter(rfModel, "rf") + model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry, + ntree = param$ntree); + predictionTrain[,j] <- prediction(train, model, classifier = "rf"); + predictionTest[,j] <- prediction(test, model, classifier = "rf"); + r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))}, + # svm + svm={ + # predict train and test + param <- extractParameter(svmModel, "svm"); + model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c, + gamma=param$g, degree = param$d, coef0 = param$coef, scale = F) + predictionTrain[,j] <- prediction(train, model, classifier = "svm"); + predictionTest[,j] <- prediction(test, model, classifier = "svm"); + r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))}, + # lasso + lasso={ + # predict train and test + param <- extractParameter(lassoModel, "lasso"); + model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda); + predictionTrain[,j] <- prediction(train, model, classifier = "lasso"); + predictionTest[,j] <- prediction(test, model, classifier = "lasso"); + r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))}, + # rrBLUP + rrBLUP={ + # predict train and test + model <- mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F); + predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP"); + predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP"); + r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))}, + {print(paste("unknown classifier, please choose among rf, svm, lasso, rrBLUP")); + stop()} + ) + } + predictionTrain.list <- c(predictionTrain.list, list(predictionTrain)) + predictionTest.list <- c(predictionTest.list, list(predictionTest)) + r2Classif.list <- c(r2Classif.list, list(r2Classif)) +} +# aggregate ! +switch(method, + geneticMean={ + aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list, + target = phenotype, folds=folds) + }, + dt={ + aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list, + target = phenotype, folds=folds) + }, + lasso={ + aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list, + target = phenotype, folds=folds) + }, + rf={ + aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list, + target = phenotype, folds=folds) + }, + # svm, by default + {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list, + target = phenotype, folds=folds, kernel=kernel)} +) +# determine best classifier +# first, transform list into a matrix +r2Classif.list <- t(data.frame(r2Classif.list)) +# then, compute the mean r2 for each classifier +meanR2Classif <- apply(r2Classif.list, 2, mean) +# choose the best one +bestClassif <- which.max(meanR2Classif) +# compare aggregation and best classifiers +finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg, + diff=(aggreg-r2Classif.list[,bestClassif])) +print(apply(finalRes, 2, mean))