| 
91
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 10/10/16
 | 
| 
 | 
     4 # last modification : 29/10/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 #
 | 
| 
 | 
     7 ########################################################
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 suppressWarnings(suppressMessages(library(GA)))
 | 
| 
 | 
    10 library("miscTools")
 | 
| 
 | 
    11 library(rpart)
 | 
| 
 | 
    12 suppressWarnings(suppressMessages(library(randomForest)))
 | 
| 
 | 
    13 library(e1071)
 | 
| 
 | 
    14 suppressWarnings(suppressMessages(library(glmnet)))
 | 
| 
 | 
    15 library(rrBLUP)
 | 
| 
 | 
    16 options(warn=-1)
 | 
| 
 | 
    17 ############################ helper functions #######################
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 ##### classifiers
 | 
| 
 | 
    20 prediction <- function(genotype, model, classifier="unknown") {
 | 
| 
 | 
    21   # run prediction according to the classifier
 | 
| 
 | 
    22   switch(classifier,
 | 
| 
 | 
    23          rrBLUP={
 | 
| 
 | 
    24            predictions <- as.matrix(genotype) %*% as.matrix(model$u);
 | 
| 
 | 
    25            predictions <- predictions[,1]+model$beta;
 | 
| 
 | 
    26          },
 | 
| 
 | 
    27          rf={
 | 
| 
 | 
    28            predictions <- predict(model, genotype);
 | 
| 
 | 
    29          },
 | 
| 
 | 
    30          svm={
 | 
| 
 | 
    31            predictions <- predict(model, genotype);
 | 
| 
 | 
    32          },
 | 
| 
 | 
    33          lasso={
 | 
| 
 | 
    34            predictions <- predict(model, as.matrix(genotype), type = "response");
 | 
| 
 | 
    35          },
 | 
| 
 | 
    36          {warning("unkonwn classifier, please choose among the following : rrBLUP, rf, svm, lasso")})
 | 
| 
 | 
    37   return(predictions)
 | 
| 
 | 
    38 }
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 # extract parameter from a model, excluding rrBLUP which auto-optimize
 | 
| 
 | 
    41 extractParameter <- function(model, classifierName) {
 | 
| 
 | 
    42   param <- NULL
 | 
| 
 | 
    43   switch(classifierName,
 | 
| 
 | 
    44          # random forest
 | 
| 
 | 
    45          rf={
 | 
| 
 | 
    46           param <- model$ntree
 | 
| 
 | 
    47           param <- c(param, list(model$mtry))
 | 
| 
 | 
    48           names(param) <- c("ntree", "mtry")
 | 
| 
 | 
    49            },
 | 
| 
 | 
    50          # svm
 | 
| 
 | 
    51          svm={
 | 
| 
 | 
    52           param <- as.numeric(model$cost)
 | 
| 
 | 
    53           param <- c(param, list(model$gamma))
 | 
| 
 | 
    54           param <- c(param, list(model$coef0))
 | 
| 
 | 
    55           param <- c(param, list(model$degree))
 | 
| 
 | 
    56           param <- c(param, list(model$kernel))
 | 
| 
 | 
    57           names(param) <- c("c", "g", "coef", "d", "kernel")
 | 
| 
 | 
    58           switch((model$kernel+1),
 | 
| 
 | 
    59                  param$kernel <- "linear",
 | 
| 
 | 
    60                  param$kernel <- "polynomial",
 | 
| 
 | 
    61                  param$kernel <- "radial",
 | 
| 
 | 
    62                  param$kernel <- "sigmoid"
 | 
| 
 | 
    63                  )
 | 
| 
 | 
    64            },
 | 
| 
 | 
    65          # lasso
 | 
| 
 | 
    66          lasso={
 | 
| 
 | 
    67            param <- as.list(model$lambda)
 | 
| 
 | 
    68            names(param) <- "lambda"
 | 
| 
 | 
    69           },
 | 
| 
 | 
    70          {print(paste("unknown classifier, please choose among rf, svm, lasso"));
 | 
| 
 | 
    71            stop()}
 | 
| 
 | 
    72   )
 | 
| 
 | 
    73   return(param)
 | 
| 
 | 
    74 }
 | 
| 
 | 
    75 
 | 
| 
 | 
    76 ##### Genetic algorithm
 | 
| 
 | 
    77 
 | 
| 
 | 
    78 # compute r2 by computing the classic formula
 | 
| 
 | 
    79 # compare the sum of square difference from target to prediciton
 | 
| 
 | 
    80 # to the sum of square difference from target to the mean of the target
 | 
| 
 | 
    81 r2 <- function(target, prediction) {
 | 
| 
 | 
    82   sst <- sum((target-mean(target))^2)
 | 
| 
 | 
    83   ssr <- sum((target-prediction)^2)
 | 
| 
 | 
    84   return(1-ssr/sst)
 | 
| 
 | 
    85 }
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 optimizeOneIndividual <- function(values, trueValue) {
 | 
| 
 | 
    88   # change the value into a function
 | 
| 
 | 
    89   f <- function(w) {sum(values * w/sum(w))}
 | 
| 
 | 
    90   fitness <- function(x) {1/abs(trueValue-f(x))}
 | 
| 
 | 
    91   resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), 
 | 
| 
 | 
    92              maxiter = 1000, monitor = NULL, keepBest = T)
 | 
| 
 | 
    93   resp@solution <- resp@solution/sum(resp@solution)
 | 
| 
 | 
    94   return(resp)
 | 
| 
 | 
    95 }
 | 
| 
 | 
    96 
 | 
| 
 | 
    97 optimizeWeight <- function(values, trueValue, n=1000) {
 | 
| 
 | 
    98   fitnessAll <- function(w) {
 | 
| 
 | 
    99     predicted <- apply(values, 1, weightedPrediction.vec, w)
 | 
| 
 | 
   100     return(mean(r2(trueValue, predicted)))
 | 
| 
 | 
   101     #return(mean(1/abs(trueValue-predicted)))
 | 
| 
 | 
   102   }
 | 
| 
 | 
   103   resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), 
 | 
| 
 | 
   104              maxiter = n, monitor = NULL, keepBest = T)
 | 
| 
 | 
   105   resp@solution <- resp@solution/sum(resp@solution)
 | 
| 
 | 
   106   return(resp)
 | 
| 
 | 
   107 }
 | 
| 
 | 
   108 
 | 
| 
 | 
   109 weightedPrediction <- function(classifiers, w) {
 | 
| 
 | 
   110   if(length(w) > ncol(classifiers)) {
 | 
| 
 | 
   111     warning("more weights than classifiers, extra weigths are ignored")
 | 
| 
 | 
   112     w <- w[1:ncol(classifiers)]
 | 
| 
 | 
   113   } else if(length(w) < ncol(classifiers)) {
 | 
| 
 | 
   114     warning("less weights than classifiers, extra classifiers are ignored")
 | 
| 
 | 
   115     classifiers <- classifiers[,1:length(w)]
 | 
| 
 | 
   116   }
 | 
| 
 | 
   117   prediction <- NULL
 | 
| 
 | 
   118   prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
 | 
| 
 | 
   119   return(prediction)
 | 
| 
 | 
   120 }
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 weightedPrediction.vec <- function(values, w) {
 | 
| 
 | 
   123   return(sum(values * w/sum(w)))
 | 
| 
 | 
   124 }
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 ##### meta-decision tree
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 tuneTree <- function(data, target) {
 | 
| 
 | 
   129   data <- data.frame(data, target=target)
 | 
| 
 | 
   130   size <-  nrow(data)
 | 
| 
 | 
   131   xerror <-  NULL
 | 
| 
 | 
   132   split <-  1:ceiling(size/5)
 | 
| 
 | 
   133   leafSize <-  1:ceiling(size/10)
 | 
| 
 | 
   134   xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
 | 
| 
 | 
   135   cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
 | 
| 
 | 
   136   for(i in 1:length(split)) {
 | 
| 
 | 
   137     for(j in 1:length(leafSize)) {
 | 
| 
 | 
   138       op <- list(minsplit=split[i], minbucket=leafSize[j])
 | 
| 
 | 
   139       tree <- rpart(target ~., data=data, control=op, method="anova")
 | 
| 
 | 
   140       xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
 | 
| 
 | 
   141       cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
 | 
| 
 | 
   142     }
 | 
| 
 | 
   143   }
 | 
| 
 | 
   144   index <- which(xerror==min(xerror), arr.ind = T)
 | 
| 
 | 
   145   op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
 | 
| 
 | 
   146   return(op)
 | 
| 
 | 
   147 }
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 ###### meta-LASSO
 | 
| 
 | 
   150 # create fold by picking at random row indexes
 | 
| 
 | 
   151 createFolds <- function(nbObs, n) {
 | 
| 
 | 
   152   # pick indexes
 | 
| 
 | 
   153   index <- sample(1:n, size=nbObs, replace = T)
 | 
| 
 | 
   154   # populate folds
 | 
| 
 | 
   155   folds <- NULL
 | 
| 
 | 
   156   for(i in 1:n) {
 | 
| 
 | 
   157     folds <- c(folds, list(which(index==i)))
 | 
| 
 | 
   158   }
 | 
| 
 | 
   159   return(folds)
 | 
| 
 | 
   160 }
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
 | 
| 
 | 
   163   folds <- createFolds(nrow(genotype), n = n)
 | 
| 
 | 
   164   acc <- NULL
 | 
| 
 | 
   165   indexAlpha <- 1
 | 
| 
 | 
   166   for(a in alpha) {
 | 
| 
 | 
   167     curAcc <- NULL
 | 
| 
 | 
   168     for(i in 1:n) {
 | 
| 
 | 
   169       train <- genotype[-folds[[i]],]
 | 
| 
 | 
   170       test <- genotype[folds[[i]],]
 | 
| 
 | 
   171       phenoTrain <- phenotype[-folds[[i]]]
 | 
| 
 | 
   172       phenoTest <- phenotype[folds[[i]]]
 | 
| 
 | 
   173       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
 | 
| 
 | 
   174       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
 | 
| 
 | 
   175       pred <- predict(model, test, type = "response")
 | 
| 
 | 
   176       curAcc <- c(curAcc, r2(phenoTest, pred))
 | 
| 
 | 
   177     }
 | 
| 
 | 
   178     acc <- c(acc, mean(curAcc))
 | 
| 
 | 
   179   }
 | 
| 
 | 
   180   names(acc) <- alpha
 | 
| 
 | 
   181   return(as.numeric(names(acc)[which.max(acc)]))
 | 
| 
 | 
   182 }
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 ###### meta-random forest
 | 
| 
 | 
   185 
 | 
| 
 | 
   186 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
 | 
| 
 | 
   187   n <- ceiling(nrow(genotype)/3)
 | 
| 
 | 
   188   indexTest <- sample(1:nrow(genotype), size=n)
 | 
| 
 | 
   189   train <- genotype[-indexTest,]
 | 
| 
 | 
   190   test <- genotype[indexTest,]
 | 
| 
 | 
   191   phenoTrain <- phenotype[-indexTest]
 | 
| 
 | 
   192   phenoTest <- phenotype[indexTest]
 | 
| 
 | 
   193   acc <- NULL
 | 
| 
 | 
   194   indexNtree <- 1
 | 
| 
 | 
   195   for(ntree in rangeNtree) {
 | 
| 
 | 
   196     model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
 | 
| 
 | 
   197     pred <- predict(model, test)
 | 
| 
 | 
   198     acc <- c(acc, r2(phenoTest, pred))
 | 
| 
 | 
   199   }
 | 
| 
 | 
   200   names(acc) <- rangeNtree
 | 
| 
 | 
   201   best <- which.max(acc)
 | 
| 
 | 
   202   return(as.numeric(names(acc)[best]))
 | 
| 
 | 
   203 }
 | 
| 
 | 
   204 
 | 
| 
 | 
   205 ###### meta-SVM
 | 
| 
 | 
   206 searchParamSVM <- function(train, target, kernel="radial") {
 | 
| 
 | 
   207   # tuning parameters then train
 | 
| 
 | 
   208   model <- NULL
 | 
| 
 | 
   209   switch(kernel,
 | 
| 
 | 
   210          sigmoid={
 | 
| 
 | 
   211            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
 | 
| 
 | 
   212            g <- tune$best.parameters[[1]];
 | 
| 
 | 
   213            c <- tune$best.parameters[[2]];
 | 
| 
 | 
   214            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
 | 
| 
 | 
   215          linear={
 | 
| 
 | 
   216            tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
 | 
| 
 | 
   217            c <- tune$best.parameters[[1]];
 | 
| 
 | 
   218            model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
 | 
| 
 | 
   219          polynomial={
 | 
| 
 | 
   220            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
 | 
| 
 | 
   221            d <- tune$best.parameters[[1]];
 | 
| 
 | 
   222            g <- tune$best.parameters[[2]];
 | 
| 
 | 
   223            coef <- tune$best.parameters[[3]];
 | 
| 
 | 
   224            c <- tune$best.parameters[[4]];
 | 
| 
 | 
   225            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
 | 
| 
 | 
   226          {
 | 
| 
 | 
   227            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
 | 
| 
 | 
   228            g <- tune$best.parameters[[1]];
 | 
| 
 | 
   229            c <- tune$best.parameters[[2]];
 | 
| 
 | 
   230            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
 | 
| 
 | 
   231   )
 | 
| 
 | 
   232   return(model)
 | 
| 
 | 
   233 }
 | 
| 
 | 
   234 
 | 
| 
 | 
   235 #################### upper level functions #####################
 | 
| 
 | 
   236 
 | 
| 
 | 
   237 aggregateDT <- function(train, test, target, folds) {
 | 
| 
 | 
   238   r2Aggreg <- NULL
 | 
| 
 | 
   239   for (i in 1:length(folds)) {
 | 
| 
 | 
   240     trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   241     testIndex <- folds[[i]]
 | 
| 
 | 
   242     treeParam <- tuneTree(train[[i]], target[trainIndex])
 | 
| 
 | 
   243     data <- data.frame(train[[i]], target=target[trainIndex])
 | 
| 
 | 
   244     model <- rpart(target ~., data=data, method = "anova", control = treeParam)
 | 
| 
 | 
   245     model <- prune(model, cp=treeParam["cp"])
 | 
| 
 | 
   246     pred <- predict(model, data.frame(test[[i]]))
 | 
| 
 | 
   247     r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
 | 
| 
 | 
   248   }
 | 
| 
 | 
   249   return(r2Aggreg)
 | 
| 
 | 
   250 }
 | 
| 
 | 
   251 
 | 
| 
 | 
   252 aggregateGeneticMean <- function(train, test, target, folds) {
 | 
| 
 | 
   253   r2Aggreg <- NULL
 | 
| 
 | 
   254   for (i in 1:length(folds)) {
 | 
| 
 | 
   255     trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   256     testIndex <- folds[[i]]
 | 
| 
 | 
   257     opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
 | 
| 
 | 
   258     pred <- weightedPrediction(test[[i]], opt@solution)
 | 
| 
 | 
   259     r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
 | 
| 
 | 
   260   }
 | 
| 
 | 
   261   return(r2Aggreg)
 | 
| 
 | 
   262 }
 | 
| 
 | 
   263 
 | 
| 
 | 
   264 aggregateLASSO <- function(train, test, target, folds) {
 | 
| 
 | 
   265   r2Aggreg <- NULL
 | 
| 
 | 
   266   for (i in 1:length(folds)) {
 | 
| 
 | 
   267     trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   268     testIndex <- folds[[i]]
 | 
| 
 | 
   269     alpha <- searchParamLASSO(train[[i]], target[trainIndex])
 | 
| 
 | 
   270     cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
 | 
| 
 | 
   271     model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
 | 
| 
 | 
   272     pred <- predict(model, test[[i]])
 | 
| 
 | 
   273     r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
 | 
| 
 | 
   274   }
 | 
| 
 | 
   275   return(r2Aggreg)
 | 
| 
 | 
   276 }
 | 
| 
 | 
   277 
 | 
| 
 | 
   278 aggregateRF <- function(train, test, target, folds) {
 | 
| 
 | 
   279   r2Aggreg <- NULL
 | 
| 
 | 
   280   for (i in 1:length(folds)) {
 | 
| 
 | 
   281     trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   282     testIndex <- folds[[i]]
 | 
| 
 | 
   283     ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
 | 
| 
 | 
   284                            rangeNtree = seq(100, 1000, 100))
 | 
| 
 | 
   285     model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
 | 
| 
 | 
   286                           ntree = ntree, mtry = ncol(train[[i]]))
 | 
| 
 | 
   287     pred <- predict(model, test[[i]])
 | 
| 
 | 
   288     r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
 | 
| 
 | 
   289   }
 | 
| 
 | 
   290   return(r2Aggreg)
 | 
| 
 | 
   291 }
 | 
| 
 | 
   292 
 | 
| 
 | 
   293 aggregateSVM <- function(train, test, target, folds, kernel="linear") {
 | 
| 
 | 
   294   r2Aggreg <- NULL
 | 
| 
 | 
   295   for (i in 1:length(folds)) {
 | 
| 
 | 
   296     trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   297     testIndex <- folds[[i]]
 | 
| 
 | 
   298     model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
 | 
| 
 | 
   299     pred <- predict(model, test[[i]])
 | 
| 
 | 
   300     r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
 | 
| 
 | 
   301   }
 | 
| 
 | 
   302   return(r2Aggreg)
 | 
| 
 | 
   303 }
 | 
| 
 | 
   304 
 | 
| 
 | 
   305 ################################### main #############################
 | 
| 
 | 
   306 # # load argument
 | 
| 
 | 
   307 cmd <- commandArgs(T)
 | 
| 
 | 
   308 source(cmd[1])
 | 
| 
 | 
   309 # load folds
 | 
| 
 | 
   310 con = file(folds)
 | 
| 
 | 
   311 folds <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   312 close(con)
 | 
| 
 | 
   313 folds <- readRDS(folds)
 | 
| 
 | 
   314 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
 | 
| 
 | 
   315 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
 | 
| 
 | 
   316 # load genotype
 | 
| 
 | 
   317 con = file(genotype)
 | 
| 
 | 
   318 genotype <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   319 close(con)
 | 
| 
 | 
   320 genotype <- read.table(genotype, sep="\t", h=T)
 | 
| 
 | 
   321 # find which classifiers will be used for aggregation
 | 
| 
 | 
   322 classifNames <- NULL
 | 
| 
 | 
   323 if(lassoModel !="None"){
 | 
| 
 | 
   324    classifNames <- c(classifNames, "lasso")
 | 
| 
 | 
   325    con = file(lassoModel)
 | 
| 
 | 
   326    lassoModel <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   327    close(con)
 | 
| 
 | 
   328    lassoModel <- readRDS(lassoModel)
 | 
| 
 | 
   329 }
 | 
| 
 | 
   330 if(rrBLUPModel !="None"){
 | 
| 
 | 
   331   classifNames <- c(classifNames, "rrBLUP")
 | 
| 
 | 
   332   con = file(rrBLUPModel)
 | 
| 
 | 
   333   rrBLUPModel <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   334   close(con)
 | 
| 
 | 
   335   rrBLUPModel <- readRDS(rrBLUPModel)
 | 
| 
 | 
   336 }
 | 
| 
 | 
   337 if(rfModel !="None"){
 | 
| 
 | 
   338   classifNames <- c(classifNames, "rf")
 | 
| 
 | 
   339   con = file(rfModel)
 | 
| 
 | 
   340   rfModel <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   341   close(con)
 | 
| 
 | 
   342   rfModel <- readRDS(rfModel)
 | 
| 
 | 
   343 }
 | 
| 
 | 
   344 if(svmModel !="None"){
 | 
| 
 | 
   345   classifNames <- c(classifNames, "svm")
 | 
| 
 | 
   346   con = file(svmModel)
 | 
| 
 | 
   347   svmModel <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   348   close(con)
 | 
| 
 | 
   349   svmModel <- readRDS(svmModel)
 | 
| 
 | 
   350 }
 | 
| 
 | 
   351 
 | 
| 
 | 
   352 # compute prediction of the training set and test set for each fold and each classifiers
 | 
| 
 | 
   353 # train predictions and test prediction are stored in separate lists 
 | 
| 
 | 
   354 # where each element of the list represent a folds
 | 
| 
 | 
   355 predictionTrain.list <- NULL
 | 
| 
 | 
   356 predictionTest.list <- NULL
 | 
| 
 | 
   357 r2Classif.list <- NULL 
 | 
| 
 | 
   358 for(i in 1:length(folds)) {
 | 
| 
 | 
   359   # for the current fold, create training set and test set
 | 
| 
 | 
   360   trainIndex <- unlist(folds[-i])
 | 
| 
 | 
   361   testIndex <- folds[[i]]
 | 
| 
 | 
   362   train <- genotype[trainIndex,]
 | 
| 
 | 
   363   phenoTrain <- phenotype[trainIndex]
 | 
| 
 | 
   364   test <- genotype[testIndex,]
 | 
| 
 | 
   365   phenoTest <- phenotype[testIndex]
 | 
| 
 | 
   366   # only to intialize data frame containing predictions
 | 
| 
 | 
   367   predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)), 
 | 
| 
 | 
   368                             ncol=length(classifNames))
 | 
| 
 | 
   369   colnames(predictionTrain) <- classifNames
 | 
| 
 | 
   370   predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)), 
 | 
| 
 | 
   371                            ncol=length(classifNames))
 | 
| 
 | 
   372   colnames(predictionTest) <- classifNames
 | 
| 
 | 
   373   r2Classif <- NULL
 | 
| 
 | 
   374   # for each classifiers, compute prediction on both sets 
 | 
| 
 | 
   375   # and evaluate r2 to find the best classifier
 | 
| 
 | 
   376   for(j in 1:length(classifNames)) {
 | 
| 
 | 
   377     switch(classifNames[j],
 | 
| 
 | 
   378            # random forest
 | 
| 
 | 
   379            rf={
 | 
| 
 | 
   380             # predict train and test
 | 
| 
 | 
   381             param <- extractParameter(rfModel, "rf")
 | 
| 
 | 
   382             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry, 
 | 
| 
 | 
   383                                   ntree = param$ntree); 
 | 
| 
 | 
   384             predictionTrain[,j] <- prediction(train, model, classifier = "rf");
 | 
| 
 | 
   385             predictionTest[,j] <- prediction(test, model, classifier = "rf");
 | 
| 
 | 
   386             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
 | 
| 
 | 
   387            # svm
 | 
| 
 | 
   388            svm={
 | 
| 
 | 
   389             # predict train and test
 | 
| 
 | 
   390             param <- extractParameter(svmModel, "svm");
 | 
| 
 | 
   391             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
 | 
| 
 | 
   392                          gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
 | 
| 
 | 
   393             predictionTrain[,j] <- prediction(train, model, classifier = "svm");
 | 
| 
 | 
   394             predictionTest[,j] <- prediction(test, model, classifier = "svm");
 | 
| 
 | 
   395             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
 | 
| 
 | 
   396            # lasso
 | 
| 
 | 
   397            lasso={
 | 
| 
 | 
   398             # predict train and test
 | 
| 
 | 
   399             param <- extractParameter(lassoModel, "lasso");
 | 
| 
 | 
   400             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda);
 | 
| 
 | 
   401             predictionTrain[,j] <- prediction(train, model, classifier = "lasso");
 | 
| 
 | 
   402             predictionTest[,j] <- prediction(test, model, classifier = "lasso");
 | 
| 
 | 
   403             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
 | 
| 
 | 
   404            # rrBLUP
 | 
| 
 | 
   405            rrBLUP={
 | 
| 
 | 
   406             # predict train and test
 | 
| 
 | 
   407             model <- mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F);
 | 
| 
 | 
   408             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP");
 | 
| 
 | 
   409             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP");
 | 
| 
 | 
   410             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
 | 
| 
 | 
   411            {print(paste("unknown classifier, please choose among rf, svm, lasso, rrBLUP"));
 | 
| 
 | 
   412              stop()}
 | 
| 
 | 
   413       )
 | 
| 
 | 
   414   }
 | 
| 
 | 
   415   predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
 | 
| 
 | 
   416   predictionTest.list <- c(predictionTest.list, list(predictionTest))
 | 
| 
 | 
   417   r2Classif.list <- c(r2Classif.list, list(r2Classif))
 | 
| 
 | 
   418 }
 | 
| 
 | 
   419 # aggregate !
 | 
| 
 | 
   420 switch(method,
 | 
| 
 | 
   421        geneticMean={
 | 
| 
 | 
   422          aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
 | 
| 
 | 
   423                                         target = phenotype, folds=folds)
 | 
| 
 | 
   424        },
 | 
| 
 | 
   425        dt={
 | 
| 
 | 
   426          aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
 | 
| 
 | 
   427                                target = phenotype, folds=folds)
 | 
| 
 | 
   428        },
 | 
| 
 | 
   429        lasso={
 | 
| 
 | 
   430          aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
 | 
| 
 | 
   431                                target = phenotype, folds=folds)
 | 
| 
 | 
   432        },
 | 
| 
 | 
   433        rf={
 | 
| 
 | 
   434          aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
 | 
| 
 | 
   435                                target = phenotype, folds=folds)
 | 
| 
 | 
   436        },
 | 
| 
 | 
   437        # svm, by default
 | 
| 
 | 
   438        {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
 | 
| 
 | 
   439                                target = phenotype, folds=folds, kernel=kernel)}
 | 
| 
 | 
   440 )
 | 
| 
 | 
   441 # determine best classifier
 | 
| 
 | 
   442 # first, transform list into a matrix
 | 
| 
 | 
   443 saveRDS(r2Classif.list, "/Users/nbeaume/Desktop/r2Classif.rds")
 | 
| 
 | 
   444 r2Classif.list <- t(data.frame(r2Classif.list))
 | 
| 
 | 
   445 # then, compute the mean r2 for each classifier
 | 
| 
 | 
   446 meanR2Classif <- apply(r2Classif.list, 2, mean)
 | 
| 
 | 
   447 # choose the best one
 | 
| 
 | 
   448 bestClassif <- which.max(meanR2Classif)
 | 
| 
 | 
   449 # compare aggregation and best classifiers
 | 
| 
 | 
   450 finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
 | 
| 
 | 
   451                   diff=(aggreg-r2Classif.list[,bestClassif]))
 | 
| 
 | 
   452 print(apply(finalRes, 2, mean))
 |