| 
68
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 25/10/16
 | 
| 
 | 
     4 # last modification : 25/10/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 #
 | 
| 
 | 
     7 ########################################################
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 suppressWarnings(suppressMessages(library(GA)))
 | 
| 
 | 
    10 library("miscTools")
 | 
| 
 | 
    11 library(rpart)
 | 
| 
 | 
    12 suppressWarnings(suppressMessages(library(randomForest)))
 | 
| 
 | 
    13 library(e1071)
 | 
| 
 | 
    14 suppressWarnings(suppressMessages(library(glmnet)))
 | 
| 
 | 
    15 options(warn=-1)
 | 
| 
 | 
    16 ############################ helper functions #######################
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 ##### Genetic algorithm
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 # compute r2 by computing the classic formula
 | 
| 
 | 
    21 # compare the sum of square difference from target to prediciton
 | 
| 
 | 
    22 # to the sum of square difference from target to the mean of the target
 | 
| 
 | 
    23 r2 <- function(target, prediction) {
 | 
| 
 | 
    24   sst <- sum((target-mean(target))^2)
 | 
| 
 | 
    25   ssr <- sum((target-prediction)^2)
 | 
| 
 | 
    26   return(1-ssr/sst)
 | 
| 
 | 
    27 }
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 optimizeOneIndividual <- function(values, trueValue) {
 | 
| 
 | 
    30   # change the value into a function
 | 
| 
 | 
    31   f <- function(w) {sum(values * w/sum(w))}
 | 
| 
 | 
    32   fitness <- function(x) {1/abs(trueValue-f(x))}
 | 
| 
 | 
    33   resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), 
 | 
| 
 | 
    34              maxiter = 1000, monitor = NULL, keepBest = T)
 | 
| 
 | 
    35   resp@solution <- resp@solution/sum(resp@solution)
 | 
| 
 | 
    36   return(resp)
 | 
| 
 | 
    37 }
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 optimizeWeight <- function(values, trueValue, n=1000) {
 | 
| 
 | 
    40   fitnessAll <- function(w) {
 | 
| 
 | 
    41     predicted <- apply(values, 1, weightedPrediction.vec, w)
 | 
| 
 | 
    42     return(mean(r2(trueValue, predicted)))
 | 
| 
 | 
    43     #return(mean(1/abs(trueValue-predicted)))
 | 
| 
 | 
    44   }
 | 
| 
 | 
    45   resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), 
 | 
| 
 | 
    46              maxiter = n, monitor = NULL, keepBest = T)
 | 
| 
 | 
    47   resp@solution <- resp@solution/sum(resp@solution)
 | 
| 
 | 
    48   return(resp)
 | 
| 
 | 
    49 }
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 weightedPrediction <- function(classifiers, w) {
 | 
| 
 | 
    52   if(length(w) > ncol(classifiers)) {
 | 
| 
 | 
    53     warning("more weights than classifiers, extra weigths are ignored")
 | 
| 
 | 
    54     w <- w[1:ncol(classifiers)]
 | 
| 
 | 
    55   } else if(length(w) < ncol(classifiers)) {
 | 
| 
 | 
    56     warning("less weights than classifiers, extra classifiers are ignored")
 | 
| 
 | 
    57     classifiers <- classifiers[,1:length(w)]
 | 
| 
 | 
    58   }
 | 
| 
 | 
    59   prediction <- NULL
 | 
| 
 | 
    60   prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
 | 
| 
 | 
    61   return(prediction)
 | 
| 
 | 
    62 }
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 weightedPrediction.vec <- function(values, w) {
 | 
| 
 | 
    65   return(sum(values * w/sum(w)))
 | 
| 
 | 
    66 }
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 ##### meta-decision tree
 | 
| 
 | 
    69 
 | 
| 
 | 
    70 tuneTree <- function(data, target) {
 | 
| 
 | 
    71   data <- data.frame(data, target=target)
 | 
| 
 | 
    72   size <-  nrow(data)
 | 
| 
 | 
    73   xerror <-  NULL
 | 
| 
 | 
    74   split <-  1:ceiling(size/5)
 | 
| 
 | 
    75   leafSize <-  1:ceiling(size/10)
 | 
| 
 | 
    76   xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
 | 
| 
 | 
    77   cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
 | 
| 
 | 
    78   for(i in 1:length(split)) {
 | 
| 
 | 
    79     for(j in 1:length(leafSize)) {
 | 
| 
 | 
    80       op <- list(minsplit=split[i], minbucket=leafSize[j])
 | 
| 
 | 
    81       tree <- rpart(target ~., data=data, control=op, method="anova")
 | 
| 
 | 
    82       xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
 | 
| 
 | 
    83       cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
 | 
| 
 | 
    84     }
 | 
| 
 | 
    85   }
 | 
| 
 | 
    86   index <- which(xerror==min(xerror), arr.ind = T)
 | 
| 
 | 
    87   op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
 | 
| 
 | 
    88   return(op)
 | 
| 
 | 
    89 }
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 ###### meta-LASSO
 | 
| 
 | 
    92 # create fold by picking at random row indexes
 | 
| 
 | 
    93 createFolds <- function(nbObs, n) {
 | 
| 
 | 
    94   # pick indexes
 | 
| 
 | 
    95   index <- sample(1:n, size=nbObs, replace = T)
 | 
| 
 | 
    96   # populate folds
 | 
| 
 | 
    97   folds <- NULL
 | 
| 
 | 
    98   for(i in 1:n) {
 | 
| 
 | 
    99     folds <- c(folds, list(which(index==i)))
 | 
| 
 | 
   100   }
 | 
| 
 | 
   101   return(folds)
 | 
| 
 | 
   102 }
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
 | 
| 
 | 
   105   folds <- createFolds(nrow(genotype), n = n)
 | 
| 
 | 
   106   acc <- NULL
 | 
| 
 | 
   107   indexAlpha <- 1
 | 
| 
 | 
   108   for(a in alpha) {
 | 
| 
 | 
   109     curAcc <- NULL
 | 
| 
 | 
   110     for(i in 1:n) {
 | 
| 
 | 
   111       train <- genotype[-folds[[i]],]
 | 
| 
 | 
   112       test <- genotype[folds[[i]],]
 | 
| 
 | 
   113       phenoTrain <- phenotype[-folds[[i]]]
 | 
| 
 | 
   114       phenoTest <- phenotype[folds[[i]]]
 | 
| 
 | 
   115       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
 | 
| 
 | 
   116       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
 | 
| 
 | 
   117       pred <- predict(model, test, type = "response")
 | 
| 
 | 
   118       curAcc <- c(curAcc, r2(phenoTest, pred))
 | 
| 
 | 
   119     }
 | 
| 
 | 
   120     acc <- c(acc, mean(curAcc))
 | 
| 
 | 
   121   }
 | 
| 
 | 
   122   names(acc) <- alpha
 | 
| 
 | 
   123   return(as.numeric(names(acc)[which.max(acc)]))
 | 
| 
 | 
   124 }
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 ###### meta-random forest
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
 | 
| 
 | 
   129   n <- ceiling(nrow(genotype)/3)
 | 
| 
 | 
   130   indexTest <- sample(1:nrow(genotype), size=n)
 | 
| 
 | 
   131   train <- genotype[-indexTest,]
 | 
| 
 | 
   132   test <- genotype[indexTest,]
 | 
| 
 | 
   133   phenoTrain <- phenotype[-indexTest]
 | 
| 
 | 
   134   phenoTest <- phenotype[indexTest]
 | 
| 
 | 
   135   acc <- NULL
 | 
| 
 | 
   136   indexNtree <- 1
 | 
| 
 | 
   137   for(ntree in rangeNtree) {
 | 
| 
 | 
   138     model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
 | 
| 
 | 
   139     pred <- predict(model, test)
 | 
| 
 | 
   140     acc <- c(acc, r2(phenoTest, pred))
 | 
| 
 | 
   141   }
 | 
| 
 | 
   142   names(acc) <- rangeNtree
 | 
| 
 | 
   143   best <- which.max(acc)
 | 
| 
 | 
   144   return(as.numeric(names(acc)[best]))
 | 
| 
 | 
   145 }
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 ###### meta-SVM
 | 
| 
 | 
   148 searchParamSVM <- function(train, target, kernel="radial") {
 | 
| 
 | 
   149   # tuning parameters then train
 | 
| 
 | 
   150   model <- NULL
 | 
| 
 | 
   151   switch(kernel,
 | 
| 
 | 
   152          sigmoid={
 | 
| 
 | 
   153            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
 | 
| 
 | 
   154            g <- tune$best.parameters[[1]];
 | 
| 
 | 
   155            c <- tune$best.parameters[[2]];
 | 
| 
 | 
   156            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
 | 
| 
 | 
   157          linear={
 | 
| 
 | 
   158            tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
 | 
| 
 | 
   159            c <- tune$best.parameters[[1]];
 | 
| 
 | 
   160            model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
 | 
| 
 | 
   161          polynomial={
 | 
| 
 | 
   162            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
 | 
| 
 | 
   163            d <- tune$best.parameters[[1]];
 | 
| 
 | 
   164            g <- tune$best.parameters[[2]];
 | 
| 
 | 
   165            coef <- tune$best.parameters[[3]];
 | 
| 
 | 
   166            c <- tune$best.parameters[[4]];
 | 
| 
 | 
   167            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
 | 
| 
 | 
   168          {
 | 
| 
 | 
   169            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
 | 
| 
 | 
   170            g <- tune$best.parameters[[1]];
 | 
| 
 | 
   171            c <- tune$best.parameters[[2]];
 | 
| 
 | 
   172            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
 | 
| 
 | 
   173   )
 | 
| 
 | 
   174   return(model)
 | 
| 
 | 
   175 }
 | 
| 
 | 
   176 
 | 
| 
 | 
   177 #################### upper level functions #####################
 | 
| 
 | 
   178 
 | 
| 
 | 
   179 aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
 | 
| 
 | 
   180   if(!prediction) {
 | 
| 
 | 
   181     treeParam <- tuneTree(classifiers, target)
 | 
| 
 | 
   182     data <- data.frame(classifiers, target)
 | 
| 
 | 
   183     model <- rpart(target ~., data=data, method = "anova", control = treeParam)
 | 
| 
 | 
   184     model <- prune(model, cp=treeParam["cp"])
 | 
| 
 | 
   185     out <- paste(out, ".rds", sep = "")
 | 
| 
 | 
   186     saveRDS(model, out)
 | 
| 
 | 
   187     return(out)
 | 
| 
 | 
   188   } else {
 | 
| 
 | 
   189     return(predict(model, classifiers))
 | 
| 
 | 
   190   }
 | 
| 
 | 
   191 }
 | 
| 
 | 
   192 
 | 
| 
 | 
   193 aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
 | 
| 
 | 
   194   if(!prediction) {
 | 
| 
 | 
   195     opt <- optimizeWeight(values = classifiers, trueValue = target)
 | 
| 
 | 
   196     out <- paste(out, ".rds", sep = "")
 | 
| 
 | 
   197     saveRDS(opt@solution, out)
 | 
| 
 | 
   198     return(out)
 | 
| 
 | 
   199   } else {
 | 
| 
 | 
   200     return(weightedPrediction.vec(classifiers, model))
 | 
| 
 | 
   201   }
 | 
| 
 | 
   202 }
 | 
| 
 | 
   203 
 | 
| 
 | 
   204 aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
 | 
| 
 | 
   205   if(!prediction) {
 | 
| 
 | 
   206     alpha <- searchParamLASSO(classifiers, target)
 | 
| 
 | 
   207     cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
 | 
| 
 | 
   208     model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
 | 
| 
 | 
   209     out <- paste(out, ".rds", sep = "")
 | 
| 
 | 
   210     saveRDS(model, out)
 | 
| 
 | 
   211     return(out)
 | 
| 
 | 
   212   } else {
 | 
| 
 | 
   213     return(predict(model, classifiers))
 | 
| 
 | 
   214   }
 | 
| 
 | 
   215 }
 | 
| 
 | 
   216 
 | 
| 
 | 
   217 aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
 | 
| 
 | 
   218   if(!prediction) {
 | 
| 
 | 
   219    ntree <- searchParamRF(genotype = classifiers, phenotype = target,
 | 
| 
 | 
   220                                               rangeNtree = seq(100, 1000, 100))
 | 
| 
 | 
   221     model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
 | 
| 
 | 
   222     out <- paste(out, ".rds", sep = "")
 | 
| 
 | 
   223     saveRDS(model, out)
 | 
| 
 | 
   224     return(out)
 | 
| 
 | 
   225   } else {
 | 
| 
 | 
   226     return(predict(model, classifiers))
 | 
| 
 | 
   227   }
 | 
| 
 | 
   228 }
 | 
| 
 | 
   229 
 | 
| 
 | 
   230 aggregateSVM <- function(classifiers, target=NULL, prediction=F, 
 | 
| 
 | 
   231                          model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
 | 
| 
 | 
   232   if(!prediction) {
 | 
| 
 | 
   233       model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
 | 
| 
 | 
   234       out <- paste(out, ".rds", sep="")
 | 
| 
 | 
   235       saveRDS(model, out)
 | 
| 
 | 
   236       return(out)
 | 
| 
 | 
   237   } else {
 | 
| 
 | 
   238     return(predict(model, classifiers))
 | 
| 
 | 
   239   }
 | 
| 
 | 
   240 }
 | 
| 
 | 
   241 
 | 
| 
 | 
   242 ################################### main #############################
 | 
| 
 | 
   243 # # load argument
 | 
| 
 | 
   244 cmd <- commandArgs(T)
 | 
| 
 | 
   245 source(cmd[1])
 | 
| 
 | 
   246 # check if evaluation is required
 | 
| 
 | 
   247 evaluation <- F
 | 
| 
 | 
   248 if(as.integer(doEvaluation) == 1) {
 | 
| 
 | 
   249   evaluation <- T
 | 
| 
 | 
   250   con = file(folds)
 | 
| 
 | 
   251   folds <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   252   close(con)
 | 
| 
 | 
   253   folds <- readRDS(folds)
 | 
| 
 | 
   254 }
 | 
| 
 | 
   255 # check for model
 | 
| 
 | 
   256 if(model == "None") {
 | 
| 
 | 
   257   model <- NULL
 | 
| 
 | 
   258   prediction <- F
 | 
| 
 | 
   259 } else {
 | 
| 
 | 
   260   prediction <- T
 | 
| 
 | 
   261   con = file(model)
 | 
| 
 | 
   262   model <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   263   close(con)
 | 
| 
 | 
   264   model <- readRDS(model)
 | 
| 
 | 
   265 }
 | 
| 
 | 
   266 # load classifiers and phenotype
 | 
| 
 | 
   267 classifiers <- NULL
 | 
| 
 | 
   268 classifNames <- NULL
 | 
| 
 | 
   269 if(lassoPred !="None"){
 | 
| 
 | 
   270   classifiers <- c(classifiers, lassoPred)
 | 
| 
 | 
   271   classifNames <- c(classifNames, "lasso")
 | 
| 
 | 
   272 }
 | 
| 
 | 
   273 if(rrBLUPPred !="None"){
 | 
| 
 | 
   274   classifiers <- c(classifiers, rrBLUPPred)
 | 
| 
 | 
   275   classifNames <- c(classifNames, "rrBLUP")
 | 
| 
 | 
   276 }
 | 
| 
 | 
   277 if(rfPred !="None"){
 | 
| 
 | 
   278   classifiers <- c(classifiers, rfPred)
 | 
| 
 | 
   279   classifNames <- c(classifNames, "rf")
 | 
| 
 | 
   280 }
 | 
| 
 | 
   281 if(svmPred !="None"){
 | 
| 
 | 
   282   classifiers <- c(classifiers, svmPred)
 | 
| 
 | 
   283   classifNames <- c(classifNames, "svm")
 | 
| 
 | 
   284 }
 | 
| 
 | 
   285 classifPrediction <- NULL
 | 
| 
 | 
   286 for(classif in classifiers) {
 | 
| 
 | 
   287   classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
 | 
| 
 | 
   288 }
 | 
| 
 | 
   289 classifPrediction <- data.frame(classifPrediction)
 | 
| 
 | 
   290 colnames(classifPrediction) <- classifNames
 | 
| 
 | 
   291 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
 | 
| 
 | 
   292 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
 | 
| 
 | 
   293 # aggregate !
 | 
| 
 | 
   294 switch(method,
 | 
| 
 | 
   295        geneticMean={
 | 
| 
 | 
   296          res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
 | 
| 
 | 
   297                               out = out, prediction = prediction, model=model)
 | 
| 
 | 
   298        },
 | 
| 
 | 
   299        dt={
 | 
| 
 | 
   300          res <- aggregateDT(classifiers = classifPrediction, target = phenotype,
 | 
| 
 | 
   301                      out = out, prediction = prediction, model=model)
 | 
| 
 | 
   302        },
 | 
| 
 | 
   303        lasso={
 | 
| 
 | 
   304          res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
 | 
| 
 | 
   305                      out = out, prediction = prediction, model=model)
 | 
| 
 | 
   306        },
 | 
| 
 | 
   307        rf={
 | 
| 
 | 
   308          res <- aggregateRF(classifiers = classifPrediction, target = phenotype,
 | 
| 
 | 
   309                      out = out, prediction = prediction, model=model)
 | 
| 
 | 
   310        },
 | 
| 
 | 
   311        # svm
 | 
| 
 | 
   312        {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
 | 
| 
 | 
   313                      out = out, prediction = prediction, model = model)}
 | 
| 
 | 
   314        )
 | 
| 
 | 
   315 if(prediction) {
 | 
| 
 | 
   316   write.table(data.frame(lines=rownames(classifPrediction), res), out,
 | 
| 
 | 
   317                          sep="\t", row.names = F)
 | 
| 
 | 
   318 } else {
 | 
| 
 | 
   319   cat(paste(out, ".rds", sep=""), "\n", sep="")
 | 
| 
 | 
   320 } |