Mercurial > repos > nicolas > oghma

--- a/evaluate_aggregation.R	Mon Oct 31 04:53:30 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,452 +0,0 @@
-########################################################
-#
-# creation date : 10/10/16
-# last modification : 29/10/16
-# author : Dr Nicolas Beaume
-#
-########################################################
-
-suppressWarnings(suppressMessages(library(GA)))
-library("miscTools")
-library(rpart)
-suppressWarnings(suppressMessages(library(randomForest)))
-library(e1071)
-suppressWarnings(suppressMessages(library(glmnet)))
-library(rrBLUP)
-options(warn=-1)
-############################ helper functions #######################
-
-##### classifiers
-prediction <- function(genotype, model, classifier="unknown") {
-  # run prediction according to the classifier
-  switch(classifier,
-         rrBLUP={
-           predictions <- as.matrix(genotype) %*% as.matrix(model$u);
-           predictions <- predictions[,1]+model$beta;
-         },
-         rf={
-           predictions <- predict(model, genotype);
-         },
-         svm={
-           predictions <- predict(model, genotype);
-         },
-         lasso={
-           predictions <- predict(model, as.matrix(genotype), type = "response");
-         },
-         {warning("unkonwn classifier, please choose among the following : rrBLUP, rf, svm, lasso")})
-  return(predictions)
-}
-
-# extract parameter from a model, excluding rrBLUP which auto-optimize
-extractParameter <- function(model, classifierName) {
-  param <- NULL
-  switch(classifierName,
-         # random forest
-         rf={
-          param <- model$ntree
-          param <- c(param, list(model$mtry))
-          names(param) <- c("ntree", "mtry")
-           },
-         # svm
-         svm={
-          param <- as.numeric(model$cost)
-          param <- c(param, list(model$gamma))
-          param <- c(param, list(model$coef0))
-          param <- c(param, list(model$degree))
-          param <- c(param, list(model$kernel))
-          names(param) <- c("c", "g", "coef", "d", "kernel")
-          switch((model$kernel+1),
-                 param$kernel <- "linear",
-                 param$kernel <- "polynomial",
-                 param$kernel <- "radial",
-                 param$kernel <- "sigmoid"
-                 )
-           },
-         # lasso
-         lasso={
-           param <- as.list(model$lambda)
-           names(param) <- "lambda"
-          },
-         {print(paste("unknown classifier, please choose among rf, svm, lasso"));
-           stop()}
-  )
-  return(param)
-}
-
-##### Genetic algorithm
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-r2 <- function(target, prediction) {
-  sst <- sum((target-mean(target))^2)
-  ssr <- sum((target-prediction)^2)
-  return(1-ssr/sst)
-}
-
-optimizeOneIndividual <- function(values, trueValue) {
-  # change the value into a function
-  f <- function(w) {sum(values * w/sum(w))}
-  fitness <- function(x) {1/abs(trueValue-f(x))}
-  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
-             maxiter = 1000, monitor = NULL, keepBest = T)
-  resp@solution <- resp@solution/sum(resp@solution)
-  return(resp)
-}
-
-optimizeWeight <- function(values, trueValue, n=1000) {
-  fitnessAll <- function(w) {
-    predicted <- apply(values, 1, weightedPrediction.vec, w)
-    return(mean(r2(trueValue, predicted)))
-    #return(mean(1/abs(trueValue-predicted)))
-  }
-  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
-             maxiter = n, monitor = NULL, keepBest = T)
-  resp@solution <- resp@solution/sum(resp@solution)
-  return(resp)
-}
-
-weightedPrediction <- function(classifiers, w) {
-  if(length(w) > ncol(classifiers)) {
-    warning("more weights than classifiers, extra weigths are ignored")
-    w <- w[1:ncol(classifiers)]
-  } else if(length(w) < ncol(classifiers)) {
-    warning("less weights than classifiers, extra classifiers are ignored")
-    classifiers <- classifiers[,1:length(w)]
-  }
-  prediction <- NULL
-  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
-  return(prediction)
-}
-
-weightedPrediction.vec <- function(values, w) {
-  return(sum(values * w/sum(w)))
-}
-
-##### meta-decision tree
-
-tuneTree <- function(data, target) {
-  data <- data.frame(data, target=target)
-  size <-  nrow(data)
-  xerror <-  NULL
-  split <-  1:ceiling(size/5)
-  leafSize <-  1:ceiling(size/10)
-  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
-  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
-  for(i in 1:length(split)) {
-    for(j in 1:length(leafSize)) {
-      op <- list(minsplit=split[i], minbucket=leafSize[j])
-      tree <- rpart(target ~., data=data, control=op, method="anova")
-      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
-      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
-    }
-  }
-  index <- which(xerror==min(xerror), arr.ind = T)
-  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
-  return(op)
-}
-
-###### meta-LASSO
-# create fold by picking at random row indexes
-createFolds <- function(nbObs, n) {
-  # pick indexes
-  index <- sample(1:n, size=nbObs, replace = T)
-  # populate folds
-  folds <- NULL
-  for(i in 1:n) {
-    folds <- c(folds, list(which(index==i)))
-  }
-  return(folds)
-}
-
-searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
-  folds <- createFolds(nrow(genotype), n = n)
-  acc <- NULL
-  indexAlpha <- 1
-  for(a in alpha) {
-    curAcc <- NULL
-    for(i in 1:n) {
-      train <- genotype[-folds[[i]],]
-      test <- genotype[folds[[i]],]
-      phenoTrain <- phenotype[-folds[[i]]]
-      phenoTest <- phenotype[folds[[i]]]
-      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
-      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
-      pred <- predict(model, test, type = "response")
-      curAcc <- c(curAcc, r2(phenoTest, pred))
-    }
-    acc <- c(acc, mean(curAcc))
-  }
-  names(acc) <- alpha
-  return(as.numeric(names(acc)[which.max(acc)]))
-}
-
-###### meta-random forest
-
-searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
-  n <- ceiling(nrow(genotype)/3)
-  indexTest <- sample(1:nrow(genotype), size=n)
-  train <- genotype[-indexTest,]
-  test <- genotype[indexTest,]
-  phenoTrain <- phenotype[-indexTest]
-  phenoTest <- phenotype[indexTest]
-  acc <- NULL
-  indexNtree <- 1
-  for(ntree in rangeNtree) {
-    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
-    pred <- predict(model, test)
-    acc <- c(acc, r2(phenoTest, pred))
-  }
-  names(acc) <- rangeNtree
-  best <- which.max(acc)
-  return(as.numeric(names(acc)[best]))
-}
-
-###### meta-SVM
-searchParamSVM <- function(train, target, kernel="radial") {
-  # tuning parameters then train
-  model <- NULL
-  switch(kernel,
-         sigmoid={
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
-           g <- tune$best.parameters[[1]];
-           c <- tune$best.parameters[[2]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
-         linear={
-           tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
-           c <- tune$best.parameters[[1]];
-           model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
-         polynomial={
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
-           d <- tune$best.parameters[[1]];
-           g <- tune$best.parameters[[2]];
-           coef <- tune$best.parameters[[3]];
-           c <- tune$best.parameters[[4]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
-         {
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
-           g <- tune$best.parameters[[1]];
-           c <- tune$best.parameters[[2]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
-  )
-  return(model)
-}
-
-#################### upper level functions #####################
-
-aggregateDT <- function(train, test, target, folds) {
-  r2Aggreg <- NULL
-  for (i in 1:length(folds)) {
-    trainIndex <- unlist(folds[-i])
-    testIndex <- folds[[i]]
-    treeParam <- tuneTree(train[[i]], target[trainIndex])
-    data <- data.frame(train[[i]], target=target[trainIndex])
-    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
-    model <- prune(model, cp=treeParam["cp"])
-    pred <- predict(model, data.frame(test[[i]]))
-    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
-  }
-  return(r2Aggreg)
-}
-
-aggregateGeneticMean <- function(train, test, target, folds) {
-  r2Aggreg <- NULL
-  for (i in 1:length(folds)) {
-    trainIndex <- unlist(folds[-i])
-    testIndex <- folds[[i]]
-    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
-    pred <- weightedPrediction(test[[i]], opt@solution)
-    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
-  }
-  return(r2Aggreg)
-}
-
-aggregateLASSO <- function(train, test, target, folds) {
-  r2Aggreg <- NULL
-  for (i in 1:length(folds)) {
-    trainIndex <- unlist(folds[-i])
-    testIndex <- folds[[i]]
-    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
-    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
-    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
-    pred <- predict(model, test[[i]])
-    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
-  }
-  return(r2Aggreg)
-}
-
-aggregateRF <- function(train, test, target, folds) {
-  r2Aggreg <- NULL
-  for (i in 1:length(folds)) {
-    trainIndex <- unlist(folds[-i])
-    testIndex <- folds[[i]]
-    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
-                           rangeNtree = seq(100, 1000, 100))
-    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
-                          ntree = ntree, mtry = ncol(train[[i]]))
-    pred <- predict(model, test[[i]])
-    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
-  }
-  return(r2Aggreg)
-}
-
-aggregateSVM <- function(train, test, target, folds, kernel="linear") {
-  r2Aggreg <- NULL
-  for (i in 1:length(folds)) {
-    trainIndex <- unlist(folds[-i])
-    testIndex <- folds[[i]]
-    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
-    pred <- predict(model, test[[i]])
-    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
-  }
-  return(r2Aggreg)
-}
-
-################################### main #############################
-# # load argument
-cmd <- commandArgs(T)
-source(cmd[1])
-# load folds
-con = file(folds)
-folds <- readLines(con = con, n = 1, ok=T)
-close(con)
-folds <- readRDS(folds)
-# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# load genotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-# find which classifiers will be used for aggregation
-classifNames <- NULL
-if(lassoModel !="None"){
-   classifNames <- c(classifNames, "lasso")
-   con = file(lassoModel)
-   lassoModel <- readLines(con = con, n = 1, ok=T)
-   close(con)
-   lassoModel <- readRDS(lassoModel)
-}
-if(rrBLUPModel !="None"){
-  classifNames <- c(classifNames, "rrBLUP")
-  con = file(rrBLUPModel)
-  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
-  close(con)
-  rrBLUPModel <- readRDS(rrBLUPModel)
-}
-if(rfModel !="None"){
-  classifNames <- c(classifNames, "rf")
-  con = file(rfModel)
-  rfModel <- readLines(con = con, n = 1, ok=T)
-  close(con)
-  rfModel <- readRDS(rfModel)
-}
-if(svmModel !="None"){
-  classifNames <- c(classifNames, "svm")
-  con = file(svmModel)
-  svmModel <- readLines(con = con, n = 1, ok=T)
-  close(con)
-  svmModel <- readRDS(svmModel)
-}
-
-# compute prediction of the training set and test set for each fold and each classifiers
-# train predictions and test prediction are stored in separate lists
-# where each element of the list represent a folds
-predictionTrain.list <- NULL
-predictionTest.list <- NULL
-r2Classif.list <- NULL
-for(i in 1:length(folds)) {
-  # for the current fold, create training set and test set
-  trainIndex <- unlist(folds[-i])
-  testIndex <- folds[[i]]
-  train <- genotype[trainIndex,]
-  phenoTrain <- phenotype[trainIndex]
-  test <- genotype[testIndex,]
-  phenoTest <- phenotype[testIndex]
-  # only to intialize data frame containing predictions
-  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)),
-                            ncol=length(classifNames))
-  colnames(predictionTrain) <- classifNames
-  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)),
-                           ncol=length(classifNames))
-  colnames(predictionTest) <- classifNames
-  r2Classif <- NULL
-  # for each classifiers, compute prediction on both sets
-  # and evaluate r2 to find the best classifier
-  for(j in 1:length(classifNames)) {
-    switch(classifNames[j],
-           # random forest
-           rf={
-            # predict train and test
-            param <- extractParameter(rfModel, "rf")
-            model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry,
-                                  ntree = param$ntree);
-            predictionTrain[,j] <- prediction(train, model, classifier = "rf");
-            predictionTest[,j] <- prediction(test, model, classifier = "rf");
-            r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
-           # svm
-           svm={
-            # predict train and test
-            param <- extractParameter(svmModel, "svm");
-            model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
-                         gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
-            predictionTrain[,j] <- prediction(train, model, classifier = "svm");
-            predictionTest[,j] <- prediction(test, model, classifier = "svm");
-            r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
-           # lasso
-           lasso={
-            # predict train and test
-            param <- extractParameter(lassoModel, "lasso");
-            model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda);
-            predictionTrain[,j] <- prediction(train, model, classifier = "lasso");
-            predictionTest[,j] <- prediction(test, model, classifier = "lasso");
-            r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
-           # rrBLUP
-           rrBLUP={
-            # predict train and test
-            model <- mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F);
-            predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP");
-            predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP");
-            r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
-           {print(paste("unknown classifier, please choose among rf, svm, lasso, rrBLUP"));
-             stop()}
-      )
-  }
-  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
-  predictionTest.list <- c(predictionTest.list, list(predictionTest))
-  r2Classif.list <- c(r2Classif.list, list(r2Classif))
-}
-# aggregate !
-switch(method,
-       geneticMean={
-         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
-                                        target = phenotype, folds=folds)
-       },
-       dt={
-         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
-                               target = phenotype, folds=folds)
-       },
-       lasso={
-         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
-                               target = phenotype, folds=folds)
-       },
-       rf={
-         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
-                               target = phenotype, folds=folds)
-       },
-       # svm, by default
-       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
-                               target = phenotype, folds=folds, kernel=kernel)}
-)
-# determine best classifier
-# first, transform list into a matrix
-saveRDS(r2Classif.list, "/Users/nbeaume/Desktop/r2Classif.rds")
-r2Classif.list <- t(data.frame(r2Classif.list))
-# then, compute the mean r2 for each classifier
-meanR2Classif <- apply(r2Classif.list, 2, mean)
-# choose the best one
-bestClassif <- which.max(meanR2Classif)
-# compare aggregation and best classifiers
-finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
-                  diff=(aggreg-r2Classif.list[,bestClassif]))
-print(apply(finalRes, 2, mean))