########################################################
#
# file : evaluate_aggregation.R
# creation date : 10/10/16
# last modification : 29/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
library(rrBLUP)
options(warn=-1)

############################ helper functions #######################

##### classifiers

prediction <- function(genotype, model, classifier="unknown") {
  # run the prediction according to the classifier
  switch(classifier,
         rrBLUP={
           predictions <- as.matrix(genotype) %*% as.matrix(model$u)
           predictions <- predictions[,1]+model$beta
         },
         rf={
           predictions <- predict(model, genotype)
         },
         svm={
           predictions <- predict(model, genotype)
         },
         lasso={
           predictions <- predict(model, as.matrix(genotype), type = "response")
         },
         {warning("unknown classifier, please choose among the following : rrBLUP, rf, svm, lasso")})
  return(predictions)
}

# extract parameters from a model, excluding rrBLUP which auto-optimizes
extractParameter <- function(model, classifierName) {
  param <- NULL
  switch(classifierName,
         # random forest
         rf={
           param <- model$ntree
           param <- c(param, list(model$mtry))
           names(param) <- c("ntree", "mtry")
         },
         # svm
         svm={
           param <- as.numeric(model$cost)
           param <- c(param, list(model$gamma))
           param <- c(param, list(model$coef0))
           param <- c(param, list(model$degree))
           param <- c(param, list(model$kernel))
           names(param) <- c("c", "g", "coef", "d", "kernel")
           # e1071 stores the kernel as an integer code ; translate it back into its name
           switch((model$kernel+1),
                  param$kernel <- "linear",
                  param$kernel <- "polynomial",
                  param$kernel <- "radial",
                  param$kernel <- "sigmoid"
           )
         },
         # lasso
         lasso={
           param <- as.list(model$lambda)
           names(param) <- "lambda"
         },
         {stop("unknown classifier, please choose among rf, svm, lasso")}
  )
  return(param)
}

##### Genetic algorithm

# compute r2 with the classic formula :
# compare the sum of squared differences from target to prediction
# to the sum of squared differences from target to the mean of the target
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}

optimizeOneIndividual <- function(values, trueValue) {
  # turn the values into a weighted-mean function of the weights
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness,
             min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    # alternative fitness : return(mean(1/abs(trueValue-predicted)))
    return(mean(r2(trueValue, predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll,
             min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- NULL
  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
  return(prediction)
}
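# --- illustration only (not part of the original tool) --------------
# A minimal sketch of how the GA helpers above fit together : given a
# matrix of per-classifier predictions (one column per classifier),
# optimizeWeight() searches for weights that maximize r2 against the
# true values, and weightedPrediction() applies them. The toy data
# below is made up purely for illustration.
if(FALSE) {
  set.seed(42)
  truth <- rnorm(20)
  # two fake "classifiers" : noisy copies of the truth
  toyPred <- cbind(c1=truth+rnorm(20, sd=0.1), c2=truth+rnorm(20, sd=0.5))
  opt <- optimizeWeight(values=toyPred, trueValue=truth, n=100)
  combined <- weightedPrediction(toyPred, opt@solution)
  r2(truth, combined) # expected close to 1, dominated by the c1 weight
}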
weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # keep the first optimum if several cells reach the minimal xerror
  index <- which(xerror==min(xerror), arr.ind = T)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}

###### meta-LASSO

# create folds by picking row indexes at random
createFolds <- function(nbObs, n) {
  # pick indexes
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}

###### meta-SVM

searchParamSVM <- function(train, target, kernel="radial") {
  # tune parameters, then train
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           c <- tune$best.parameters[[1]]
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2),
                            degree = 0:4, coef0 = 0:3, kernel="polynomial")
           d <- tune$best.parameters[[1]]
           g <- tune$best.parameters[[2]]
           coef <- tune$best.parameters[[3]]
           c <- tune$best.parameters[[4]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial",
                        degree = d, coef0 = coef)},
         # radial, by default
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}

#################### upper level functions #####################

aggregateDT <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    treeParam <- tuneTree(train[[i]], target[trainIndex])
    data <- data.frame(train[[i]], target=target[trainIndex])
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam$cp)
    pred <- predict(model, data.frame(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateGeneticMean <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
    pred <- weightedPrediction(test[[i]], opt@solution)
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateLASSO <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateRF <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
                          ntree = ntree, mtry = ncol(train[[i]]))
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateSVM <- function(train, test, target, folds, kernel="linear") {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

################################### main #############################
#
# load arguments : the first command-line argument is an R config file
cmd <- commandArgs(T)
source(cmd[1])
# load folds
con = file(folds)
folds <- readLines(con = con, n = 1, ok=T)
close(con)
folds <- readRDS(folds)
# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# load genotype
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# find which classifiers will be used for aggregation
classifNames <- NULL
if(lassoModel !="None"){
  classifNames <- c(classifNames, "lasso")
  con = file(lassoModel)
  lassoModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  lassoModel <- readRDS(lassoModel)
}
if(rrBLUPModel !="None"){
  classifNames <- c(classifNames, "rrBLUP")
  con = file(rrBLUPModel)
  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rrBLUPModel <- readRDS(rrBLUPModel)
}
if(rfModel !="None"){
  classifNames <- c(classifNames, "rf")
  con = file(rfModel)
  rfModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rfModel <- readRDS(rfModel)
}
if(svmModel !="None"){
  classifNames <- c(classifNames, "svm")
  con = file(svmModel)
  svmModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  svmModel <- readRDS(svmModel)
}
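# --- illustration only (not part of the original tool) --------------
# The config file passed as first command-line argument (cmd[1]) is
# expected to define the variables used above and below. A hypothetical
# example, with made-up file names :
#
#   folds       <- "folds_pointer.txt"    # first line : path to an RDS file containing the folds
#   genotype    <- "geno_pointer.txt"     # first line : path to a tab-delimited genotype table
#   phenotype   <- "phenotype.tab"        # tab-delimited table, read directly ; first column used
#   lassoModel  <- "lasso_pointer.txt"    # set to "None" to exclude a classifier
#   rrBLUPModel <- "None"
#   rfModel     <- "rf_pointer.txt"
#   svmModel    <- "svm_pointer.txt"
#   method      <- "geneticMean"          # or "dt", "lasso", "rf" ; any other value falls back to svm
#   kernel      <- "linear"               # only used when aggregating with svm
#
# The script would then be run as : Rscript evaluate_aggregation.R config.R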
# compute predictions of the training set and test set for each fold and each classifier
# train predictions and test predictions are stored in separate lists
# where each element of the list represents a fold
predictionTrain.list <- NULL
predictionTest.list <- NULL
r2Classif.list <- NULL
for(i in 1:length(folds)) {
  # for the current fold, create training set and test set
  trainIndex <- unlist(folds[-i])
  testIndex <- folds[[i]]
  train <- genotype[trainIndex,]
  phenoTrain <- phenotype[trainIndex]
  test <- genotype[testIndex,]
  phenoTest <- phenotype[testIndex]
  # only to initialize the matrices containing the predictions
  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)), ncol=length(classifNames))
  colnames(predictionTrain) <- classifNames
  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)), ncol=length(classifNames))
  colnames(predictionTest) <- classifNames
  r2Classif <- NULL
  # for each classifier, compute predictions on both sets
  # and evaluate r2 to find the best classifier
  for(j in 1:length(classifNames)) {
    switch(classifNames[j],
           # random forest
           rf={
             # predict train and test
             param <- extractParameter(rfModel, "rf")
             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry, ntree = param$ntree)
             predictionTrain[,j] <- prediction(train, model, classifier = "rf")
             predictionTest[,j] <- prediction(test, model, classifier = "rf")
             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
           # svm
           svm={
             # predict train and test
             param <- extractParameter(svmModel, "svm")
             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
                          gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "svm")
             predictionTest[,j] <- prediction(test, model, classifier = "svm")
             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
           # lasso
           lasso={
             # predict train and test
             param <- extractParameter(lassoModel, "lasso")
             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda)
             predictionTrain[,j] <- prediction(train, model, classifier = "lasso")
             predictionTest[,j] <- prediction(test, model, classifier = "lasso")
             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
           # rrBLUP
           rrBLUP={
             # predict train and test
             model <- mixed.solve(phenoTrain, Z=train, K=NULL, SE=F, return.Hinv = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP")
             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP")
             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
           {stop("unknown classifier, please choose among rf, svm, lasso, rrBLUP")}
    )
  }
  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
  predictionTest.list <- c(predictionTest.list, list(predictionTest))
  r2Classif.list <- c(r2Classif.list, list(r2Classif))
}
# aggregate !
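# --- illustration only (not part of the original tool) --------------
# At this point each predictionTrain.list[[i]] / predictionTest.list[[i]]
# is a numeric matrix with one column per classifier : the level-one
# features on which the aggregators below are trained and evaluated.
if(FALSE) {
  str(predictionTrain.list[[1]])
  head(predictionTest.list[[1]])
}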
switch(method,
       geneticMean={
         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
                                        target = phenotype, folds=folds)
       },
       dt={
         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       lasso={
         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
                                  target = phenotype, folds=folds)
       },
       rf={
         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       # svm, by default
       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds, kernel=kernel)}
)
# determine the best classifier
# first, transform the list into a matrix (one row per fold)
r2Classif.list <- t(data.frame(r2Classif.list))
# then, compute the mean r2 of each classifier
meanR2Classif <- apply(r2Classif.list, 2, mean)
# choose the best one
bestClassif <- which.max(meanR2Classif)
# compare aggregation and best classifier
finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
                  diff=(aggreg-r2Classif.list[,bestClassif]))
print(apply(finalRes, 2, mean))
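# --- illustration only (not part of the original tool) --------------
# The printed vector summarizes the comparison : mean per-fold r2 of the
# best single classifier ("bestClassif"), mean r2 of the aggregated
# prediction ("aggreg"), and their mean difference ("diff") ; a positive
# diff means aggregation beat the best single classifier on average.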