Mercurial > repos > nicolas > oghma
changeset 93:3cdfa1e4b1c1 draft
Deleted selected files
author | nicolas |
---|---|
date | Mon, 31 Oct 2016 05:50:01 -0400 |
parents | d1e92ce799c1 |
children | f90c741e2a32 |
files | evaluate_aggregation.R |
diffstat | 1 files changed, 0 insertions(+), 452 deletions(-) [+] |
line wrap: on
line diff
--- a/evaluate_aggregation.R Mon Oct 31 04:53:30 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,452 +0,0 @@ -######################################################## -# -# creation date : 10/10/16 -# last modification : 29/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - -suppressWarnings(suppressMessages(library(GA))) -library("miscTools") -library(rpart) -suppressWarnings(suppressMessages(library(randomForest))) -library(e1071) -suppressWarnings(suppressMessages(library(glmnet))) -library(rrBLUP) -options(warn=-1) -############################ helper functions ####################### - -##### classifiers -prediction <- function(genotype, model, classifier="unknown") { - # run prediction according to the classifier - switch(classifier, - rrBLUP={ - predictions <- as.matrix(genotype) %*% as.matrix(model$u); - predictions <- predictions[,1]+model$beta; - }, - rf={ - predictions <- predict(model, genotype); - }, - svm={ - predictions <- predict(model, genotype); - }, - lasso={ - predictions <- predict(model, as.matrix(genotype), type = "response"); - }, - {warning("unkonwn classifier, please choose among the following : rrBLUP, rf, svm, lasso")}) - return(predictions) -} - -# extract parameter from a model, excluding rrBLUP which auto-optimize -extractParameter <- function(model, classifierName) { - param <- NULL - switch(classifierName, - # random forest - rf={ - param <- model$ntree - param <- c(param, list(model$mtry)) - names(param) <- c("ntree", "mtry") - }, - # svm - svm={ - param <- as.numeric(model$cost) - param <- c(param, list(model$gamma)) - param <- c(param, list(model$coef0)) - param <- c(param, list(model$degree)) - param <- c(param, list(model$kernel)) - names(param) <- c("c", "g", "coef", "d", "kernel") - switch((model$kernel+1), - param$kernel <- "linear", - param$kernel <- "polynomial", - param$kernel <- "radial", - param$kernel <- "sigmoid" - ) - }, - # lasso - lasso={ - param <- as.list(model$lambda) - names(param) <- "lambda" - }, - {print(paste("unknown classifier, please choose among rf, svm, lasso")); - stop()} - ) - return(param) -} - -##### Genetic algorithm - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} - -optimizeOneIndividual <- function(values, trueValue) { - # change the value into a function - f <- function(w) {sum(values * w/sum(w))} - fitness <- function(x) {1/abs(trueValue-f(x))} - resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), - maxiter = 1000, monitor = NULL, keepBest = T) - resp@solution <- resp@solution/sum(resp@solution) - return(resp) -} - -optimizeWeight <- function(values, trueValue, n=1000) { - fitnessAll <- function(w) { - predicted <- apply(values, 1, weightedPrediction.vec, w) - return(mean(r2(trueValue, predicted))) - #return(mean(1/abs(trueValue-predicted))) - } - resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), - maxiter = n, monitor = NULL, keepBest = T) - resp@solution <- resp@solution/sum(resp@solution) - return(resp) -} - -weightedPrediction <- function(classifiers, w) { - if(length(w) > ncol(classifiers)) { - warning("more weights than classifiers, extra weigths are ignored") - w <- w[1:ncol(classifiers)] - } else if(length(w) < ncol(classifiers)) { - warning("less weights than classifiers, extra classifiers are ignored") - classifiers <- classifiers[,1:length(w)] - } - prediction <- NULL - prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) - return(prediction) -} - -weightedPrediction.vec <- function(values, w) { - return(sum(values * w/sum(w))) -} - -##### meta-decision tree - -tuneTree <- function(data, target) { - data <- data.frame(data, target=target) - size <- nrow(data) - xerror <- NULL - split <- 1:ceiling(size/5) - leafSize <- 1:ceiling(size/10) - xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) - cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) - for(i in 1:length(split)) { - for(j in 1:length(leafSize)) { - op <- list(minsplit=split[i], minbucket=leafSize[j]) - tree <- rpart(target ~., data=data, control=op, method="anova") - xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] - cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] - } - } - index <- which(xerror==min(xerror), arr.ind = T) - op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) - return(op) -} - -###### meta-LASSO -# create fold by picking at random row indexes -createFolds <- function(nbObs, n) { - # pick indexes - index <- sample(1:n, size=nbObs, replace = T) - # populate folds - folds <- NULL - for(i in 1:n) { - folds <- c(folds, list(which(index==i))) - } - return(folds) -} - -searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { - folds <- createFolds(nrow(genotype), n = n) - acc <- NULL - indexAlpha <- 1 - for(a in alpha) { - curAcc <- NULL - for(i in 1:n) { - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) - model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) - pred <- predict(model, test, type = "response") - curAcc <- c(curAcc, r2(phenoTest, pred)) - } - acc <- c(acc, mean(curAcc)) - } - names(acc) <- alpha - return(as.numeric(names(acc)[which.max(acc)])) -} - -###### meta-random forest - -searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - acc <- NULL - indexNtree <- 1 - for(ntree in rangeNtree) { - model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) - pred <- predict(model, test) - acc <- c(acc, r2(phenoTest, pred)) - } - names(acc) <- rangeNtree - best <- which.max(acc) - return(as.numeric(names(acc)[best])) -} - -###### meta-SVM -searchParamSVM <- function(train, target, kernel="radial") { - # tuning parameters then train - model <- NULL - switch(kernel, - sigmoid={ - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, - linear={ - tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); - c <- tune$best.parameters[[1]]; - model <- svm(x=train, y=target, cost = c, kernel = "linear")}, - polynomial={ - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); - d <- tune$best.parameters[[1]]; - g <- tune$best.parameters[[2]]; - coef <- tune$best.parameters[[3]]; - c <- tune$best.parameters[[4]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, - { - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} - ) - return(model) -} - -#################### upper level functions ##################### - -aggregateDT <- function(train, test, target, folds) { - r2Aggreg <- NULL - for (i in 1:length(folds)) { - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - treeParam <- tuneTree(train[[i]], target[trainIndex]) - data <- data.frame(train[[i]], target=target[trainIndex]) - model <- rpart(target ~., data=data, method = "anova", control = treeParam) - model <- prune(model, cp=treeParam["cp"]) - pred <- predict(model, data.frame(test[[i]])) - r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) - } - return(r2Aggreg) -} - -aggregateGeneticMean <- function(train, test, target, folds) { - r2Aggreg <- NULL - for (i in 1:length(folds)) { - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex]) - pred <- weightedPrediction(test[[i]], opt@solution) - r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) - } - return(r2Aggreg) -} - -aggregateLASSO <- function(train, test, target, folds) { - r2Aggreg <- NULL - for (i in 1:length(folds)) { - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - alpha <- searchParamLASSO(train[[i]], target[trainIndex]) - cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha) - model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se) - pred <- predict(model, test[[i]]) - r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) - } - return(r2Aggreg) -} - -aggregateRF <- function(train, test, target, folds) { - r2Aggreg <- NULL - for (i in 1:length(folds)) { - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex], - rangeNtree = seq(100, 1000, 100)) - model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex], - ntree = ntree, mtry = ncol(train[[i]])) - pred <- predict(model, test[[i]]) - r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) - } - return(r2Aggreg) -} - -aggregateSVM <- function(train, test, target, folds, kernel="linear") { - r2Aggreg <- NULL - for (i in 1:length(folds)) { - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel) - pred <- predict(model, test[[i]]) - r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) - } - return(r2Aggreg) -} - -################################### main ############################# -# # load argument -cmd <- commandArgs(T) -source(cmd[1]) -# load folds -con = file(folds) -folds <- readLines(con = con, n = 1, ok=T) -close(con) -folds <- readRDS(folds) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# load genotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -# find which classifiers will be used for aggregation -classifNames <- NULL -if(lassoModel !="None"){ - classifNames <- c(classifNames, "lasso") - con = file(lassoModel) - lassoModel <- readLines(con = con, n = 1, ok=T) - close(con) - lassoModel <- readRDS(lassoModel) -} -if(rrBLUPModel !="None"){ - classifNames <- c(classifNames, "rrBLUP") - con = file(rrBLUPModel) - rrBLUPModel <- readLines(con = con, n = 1, ok=T) - close(con) - rrBLUPModel <- readRDS(rrBLUPModel) -} -if(rfModel !="None"){ - classifNames <- c(classifNames, "rf") - con = file(rfModel) - rfModel <- readLines(con = con, n = 1, ok=T) - close(con) - rfModel <- readRDS(rfModel) -} -if(svmModel !="None"){ - classifNames <- c(classifNames, "svm") - con = file(svmModel) - svmModel <- readLines(con = con, n = 1, ok=T) - close(con) - svmModel <- readRDS(svmModel) -} - -# compute prediction of the training set and test set for each fold and each classifiers -# train predictions and test prediction are stored in separate lists -# where each element of the list represent a folds -predictionTrain.list <- NULL -predictionTest.list <- NULL -r2Classif.list <- NULL -for(i in 1:length(folds)) { - # for the current fold, create training set and test set - trainIndex <- unlist(folds[-i]) - testIndex <- folds[[i]] - train <- genotype[trainIndex,] - phenoTrain <- phenotype[trainIndex] - test <- genotype[testIndex,] - phenoTest <- phenotype[testIndex] - # only to intialize data frame containing predictions - predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)), - ncol=length(classifNames)) - colnames(predictionTrain) <- classifNames - predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)), - ncol=length(classifNames)) - colnames(predictionTest) <- classifNames - r2Classif <- NULL - # for each classifiers, compute prediction on both sets - # and evaluate r2 to find the best classifier - for(j in 1:length(classifNames)) { - switch(classifNames[j], - # random forest - rf={ - # predict train and test - param <- extractParameter(rfModel, "rf") - model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry, - ntree = param$ntree); - predictionTrain[,j] <- prediction(train, model, classifier = "rf"); - predictionTest[,j] <- prediction(test, model, classifier = "rf"); - r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))}, - # svm - svm={ - # predict train and test - param <- extractParameter(svmModel, "svm"); - model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c, - gamma=param$g, degree = param$d, coef0 = param$coef, scale = F) - predictionTrain[,j] <- prediction(train, model, classifier = "svm"); - predictionTest[,j] <- prediction(test, model, classifier = "svm"); - r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))}, - # lasso - lasso={ - # predict train and test - param <- extractParameter(lassoModel, "lasso"); - model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda); - predictionTrain[,j] <- prediction(train, model, classifier = "lasso"); - predictionTest[,j] <- prediction(test, model, classifier = "lasso"); - r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))}, - # rrBLUP - rrBLUP={ - # predict train and test - model <- mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F); - predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP"); - predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP"); - r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))}, - {print(paste("unknown classifier, please choose among rf, svm, lasso, rrBLUP")); - stop()} - ) - } - predictionTrain.list <- c(predictionTrain.list, list(predictionTrain)) - predictionTest.list <- c(predictionTest.list, list(predictionTest)) - r2Classif.list <- c(r2Classif.list, list(r2Classif)) -} -# aggregate ! -switch(method, - geneticMean={ - aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list, - target = phenotype, folds=folds) - }, - dt={ - aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list, - target = phenotype, folds=folds) - }, - lasso={ - aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list, - target = phenotype, folds=folds) - }, - rf={ - aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list, - target = phenotype, folds=folds) - }, - # svm, by default - {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list, - target = phenotype, folds=folds, kernel=kernel)} -) -# determine best classifier -# first, transform list into a matrix -saveRDS(r2Classif.list, "/Users/nbeaume/Desktop/r2Classif.rds") -r2Classif.list <- t(data.frame(r2Classif.list)) -# then, compute the mean r2 for each classifier -meanR2Classif <- apply(r2Classif.list, 2, mean) -# choose the best one -bestClassif <- which.max(meanR2Classif) -# compare aggregation and best classifiers -finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg, - diff=(aggreg-r2Classif.list[,bestClassif])) -print(apply(finalRes, 2, mean))