| 50 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 25/10/16 | 
|  | 4 # last modification : 25/10/16 | 
|  | 5 # author : Dr Nicolas Beaume | 
|  | 6 # | 
|  | 7 ######################################################## | 
|  | 8 | 
|  | 9 suppressWarnings(suppressMessages(library(GA))) | 
|  | 10 library("miscTools") | 
|  | 11 library(rpart) | 
|  | 12 suppressWarnings(suppressMessages(library(randomForest))) | 
|  | 13 library(e1071) | 
|  | 14 suppressWarnings(suppressMessages(library(glmnet))) | 
|  | 15 options(warn=-1) | 
|  | 16 ############################ helper functions ####################### | 
|  | 17 | 
|  | 18 ##### Genetic algorithm | 
|  | 19 | 
|  | 20 # compute r2 by computing the classic formula | 
|  | 21 # compare the sum of square difference from target to prediciton | 
|  | 22 # to the sum of square difference from target to the mean of the target | 
|  | 23 r2 <- function(target, prediction) { | 
|  | 24   sst <- sum((target-mean(target))^2) | 
|  | 25   ssr <- sum((target-prediction)^2) | 
|  | 26   return(1-ssr/sst) | 
|  | 27 } | 
|  | 28 | 
|  | 29 optimizeOneIndividual <- function(values, trueValue) { | 
|  | 30   # change the value into a function | 
|  | 31   f <- function(w) {sum(values * w/sum(w))} | 
|  | 32   fitness <- function(x) {1/abs(trueValue-f(x))} | 
|  | 33   resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), | 
|  | 34              maxiter = 1000, monitor = NULL, keepBest = T) | 
|  | 35   resp@solution <- resp@solution/sum(resp@solution) | 
|  | 36   return(resp) | 
|  | 37 } | 
|  | 38 | 
|  | 39 optimizeWeight <- function(values, trueValue, n=1000) { | 
|  | 40   fitnessAll <- function(w) { | 
|  | 41     predicted <- apply(values, 1, weightedPrediction.vec, w) | 
|  | 42     return(mean(r2(trueValue, predicted))) | 
|  | 43     #return(mean(1/abs(trueValue-predicted))) | 
|  | 44   } | 
|  | 45   resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), | 
|  | 46              maxiter = n, monitor = NULL, keepBest = T) | 
|  | 47   resp@solution <- resp@solution/sum(resp@solution) | 
|  | 48   return(resp) | 
|  | 49 } | 
|  | 50 | 
|  | 51 weightedPrediction <- function(classifiers, w) { | 
|  | 52   if(length(w) > ncol(classifiers)) { | 
|  | 53     warning("more weights than classifiers, extra weigths are ignored") | 
|  | 54     w <- w[1:ncol(classifiers)] | 
|  | 55   } else if(length(w) < ncol(classifiers)) { | 
|  | 56     warning("less weights than classifiers, extra classifiers are ignored") | 
|  | 57     classifiers <- classifiers[,1:length(w)] | 
|  | 58   } | 
|  | 59   prediction <- NULL | 
|  | 60   prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) | 
|  | 61   return(prediction) | 
|  | 62 } | 
|  | 63 | 
|  | 64 weightedPrediction.vec <- function(values, w) { | 
|  | 65   return(sum(values * w/sum(w))) | 
|  | 66 } | 
|  | 67 | 
|  | 68 ##### meta-decision tree | 
|  | 69 | 
|  | 70 tuneTree <- function(data, target) { | 
|  | 71   data <- data.frame(data, target=target) | 
|  | 72   size <-  nrow(data) | 
|  | 73   xerror <-  NULL | 
|  | 74   split <-  1:ceiling(size/5) | 
|  | 75   leafSize <-  1:ceiling(size/10) | 
|  | 76   xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | 
|  | 77   cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | 
|  | 78   for(i in 1:length(split)) { | 
|  | 79     for(j in 1:length(leafSize)) { | 
|  | 80       op <- list(minsplit=split[i], minbucket=leafSize[j]) | 
|  | 81       tree <- rpart(target ~., data=data, control=op, method="anova") | 
|  | 82       xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] | 
|  | 83       cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] | 
|  | 84     } | 
|  | 85   } | 
|  | 86   index <- which(xerror==min(xerror), arr.ind = T) | 
|  | 87   op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) | 
|  | 88   return(op) | 
|  | 89 } | 
|  | 90 | 
|  | 91 ###### meta-LASSO | 
|  | 92 # create fold by picking at random row indexes | 
|  | 93 createFolds <- function(nbObs, n) { | 
|  | 94   # pick indexes | 
|  | 95   index <- sample(1:n, size=nbObs, replace = T) | 
|  | 96   # populate folds | 
|  | 97   folds <- NULL | 
|  | 98   for(i in 1:n) { | 
|  | 99     folds <- c(folds, list(which(index==i))) | 
|  | 100   } | 
|  | 101   return(folds) | 
|  | 102 } | 
|  | 103 | 
|  | 104 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { | 
|  | 105   folds <- createFolds(nrow(genotype), n = n) | 
|  | 106   acc <- NULL | 
|  | 107   indexAlpha <- 1 | 
|  | 108   for(a in alpha) { | 
|  | 109     curAcc <- NULL | 
|  | 110     for(i in 1:n) { | 
|  | 111       train <- genotype[-folds[[i]],] | 
|  | 112       test <- genotype[folds[[i]],] | 
|  | 113       phenoTrain <- phenotype[-folds[[i]]] | 
|  | 114       phenoTest <- phenotype[folds[[i]]] | 
|  | 115       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | 
|  | 116       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | 
|  | 117       pred <- predict(model, test, type = "response") | 
|  | 118       curAcc <- c(curAcc, r2(phenoTest, pred)) | 
|  | 119     } | 
|  | 120     acc <- c(acc, mean(curAcc)) | 
|  | 121   } | 
|  | 122   names(acc) <- alpha | 
|  | 123   return(as.numeric(names(acc)[which.max(acc)])) | 
|  | 124 } | 
|  | 125 | 
|  | 126 ###### meta-random forest | 
|  | 127 | 
|  | 128 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { | 
|  | 129   n <- ceiling(nrow(genotype)/3) | 
|  | 130   indexTest <- sample(1:nrow(genotype), size=n) | 
|  | 131   train <- genotype[-indexTest,] | 
|  | 132   test <- genotype[indexTest,] | 
|  | 133   phenoTrain <- phenotype[-indexTest] | 
|  | 134   phenoTest <- phenotype[indexTest] | 
|  | 135   acc <- NULL | 
|  | 136   indexNtree <- 1 | 
|  | 137   for(ntree in rangeNtree) { | 
|  | 138     model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) | 
|  | 139     pred <- predict(model, test) | 
|  | 140     acc <- c(acc, r2(phenoTest, pred)) | 
|  | 141   } | 
|  | 142   names(acc) <- rangeNtree | 
|  | 143   best <- which.max(acc) | 
|  | 144   return(as.numeric(names(acc)[best])) | 
|  | 145 } | 
|  | 146 | 
|  | 147 ###### meta-SVM | 
|  | 148 searchParamSVM <- function(train, target, kernel="radial") { | 
|  | 149   # tuning parameters then train | 
|  | 150   model <- NULL | 
|  | 151   switch(kernel, | 
|  | 152          sigmoid={ | 
|  | 153            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); | 
|  | 154            g <- tune$best.parameters[[1]]; | 
|  | 155            c <- tune$best.parameters[[2]]; | 
|  | 156            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, | 
|  | 157          linear={ | 
|  | 158            tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear"); | 
|  | 159            c <- tune$best.parameters[[1]]; | 
|  | 160            model <-  svm(x=train, y=target, cost = c, kernel = "linear")}, | 
|  | 161          polynomial={ | 
|  | 162            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); | 
|  | 163            d <- tune$best.parameters[[1]]; | 
|  | 164            g <- tune$best.parameters[[2]]; | 
|  | 165            coef <- tune$best.parameters[[3]]; | 
|  | 166            c <- tune$best.parameters[[4]]; | 
|  | 167            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, | 
|  | 168          { | 
|  | 169            tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); | 
|  | 170            g <- tune$best.parameters[[1]]; | 
|  | 171            c <- tune$best.parameters[[2]]; | 
|  | 172            model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} | 
|  | 173   ) | 
|  | 174   return(model) | 
|  | 175 } | 
|  | 176 | 
|  | 177 #################### upper level functions ##################### | 
|  | 178 | 
|  | 179 aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) { | 
|  | 180   if(!prediction) { | 
|  | 181     treeParam <- tuneTree(classifiers, target) | 
|  | 182     data <- data.frame(classifiers, target) | 
|  | 183     model <- rpart(target ~., data=data, method = "anova", control = treeParam) | 
|  | 184     model <- prune(model, cp=treeParam["cp"]) | 
|  | 185     saveRDS(model, out) | 
|  | 186   } else { | 
|  | 187     saveRDS(predict(model, data.frame(classifiers)), out) | 
|  | 188   } | 
|  | 189 } | 
|  | 190 | 
|  | 191 aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){ | 
|  | 192   if(!prediction) { | 
|  | 193     opt <- optimizeWeight(values = classifiers, trueValue = target) | 
|  | 194     saveRDS(opt@solution, out) | 
|  | 195     # evaluation of the method | 
|  | 196   } else { | 
|  | 197     saveRDS(weightedPrediction.vec(classifiers, model), out) | 
|  | 198   } | 
|  | 199 } | 
|  | 200 | 
|  | 201 aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) { | 
|  | 202   if(!prediction) { | 
|  | 203     alpha <- searchParamLASSO(classifiers, target) | 
|  | 204     cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha) | 
|  | 205     model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se) | 
|  | 206     saveRDS(model, out) | 
|  | 207   } else { | 
|  | 208     saveRDS(predict(model, classifiers), out) | 
|  | 209   } | 
|  | 210 } | 
|  | 211 | 
|  | 212 aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) { | 
|  | 213   if(!prediction) { | 
|  | 214    ntree <- searchParamRF(genotype = classifiers, phenotype = target, | 
|  | 215                                               rangeNtree = seq(100, 1000, 100)) | 
|  | 216     model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers)) | 
|  | 217     saveRDS(model, out) | 
|  | 218   } else { | 
|  | 219     saveRDS(predict(model, classifiers), out) | 
|  | 220   } | 
|  | 221 } | 
|  | 222 | 
|  | 223 aggregateSVM <- function(classifiers, target=NULL, prediction=F, | 
|  | 224                          model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) { | 
|  | 225   if(!prediction) { | 
|  | 226       model <- searchParamSVM(train = classifiers, target = target, kernel = kernel) | 
|  | 227       saveRDS(model, out) | 
|  | 228   } else { | 
|  | 229     saveRDS(predict(model, classifiers), out) | 
|  | 230   } | 
|  | 231 } | 
|  | 232 | 
|  | 233 ################################### main ############################# | 
|  | 234 # # load argument | 
|  | 235 cmd <- commandArgs(T) | 
|  | 236 source(cmd[1]) | 
|  | 237 # check if evaluation is required | 
|  | 238 evaluation <- F | 
|  | 239 if(as.integer(doEvaluation) == 1) { | 
|  | 240   evaluation <- T | 
|  | 241   con = file(folds) | 
|  | 242   folds <- readLines(con = con, n = 1, ok=T) | 
|  | 243   close(con) | 
|  | 244   folds <- readRDS(folds) | 
|  | 245 } | 
|  | 246 # check for model | 
|  | 247 if(model == "None") { | 
|  | 248   model <- NULL | 
|  | 249   prediction <- F | 
|  | 250 } else { | 
|  | 251   prediction <- T | 
|  | 252   con = file(model) | 
|  | 253   model <- readLines(con = con, n = 1, ok=T) | 
|  | 254   close(con) | 
|  | 255   model <- readRDS(model) | 
|  | 256 } | 
|  | 257 # load classifiers and phenotype | 
|  | 258 classifiers <- NULL | 
|  | 259 classifNames <- NULL | 
|  | 260 if(lassoPred !="None"){ | 
|  | 261   classifiers <- c(classifiers, lassoPred) | 
|  | 262   classifNames <- c(classifNames, "lasso") | 
|  | 263 } | 
|  | 264 if(rrBLUPPred !="None"){ | 
|  | 265   classifiers <- c(classifiers, rrBLUPPred) | 
|  | 266   classifNames <- c(classifNames, "rrBLUP") | 
|  | 267 } | 
|  | 268 if(rfPred !="None"){ | 
|  | 269   classifiers <- c(classifiers, rfPred) | 
|  | 270   classifNames <- c(classifNames, "rf") | 
|  | 271 } | 
|  | 272 if(svmPred !="None"){ | 
|  | 273   classifiers <- c(classifiers, svmPred) | 
|  | 274   classifNames <- c(classifNames, "svm") | 
|  | 275 } | 
|  | 276 classifPrediction <- NULL | 
|  | 277 for(classif in classifiers) { | 
|  | 278   classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T))) | 
|  | 279 } | 
|  | 280 classifPrediction <- data.frame(classifPrediction) | 
|  | 281 colnames(classifPrediction) <- classifNames | 
|  | 282 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | 
|  | 283 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | 
|  | 284 out <- paste(out, ".rds", sep = "") | 
|  | 285 # aggregate ! | 
|  | 286 switch(method, | 
|  | 287        geneticMean={ | 
|  | 288          aggregateGeneticMean(classifiers = classifPrediction, target = phenotype, | 
|  | 289                               out = out, prediction = prediction, model=model) | 
|  | 290        }, | 
|  | 291        dt={ | 
|  | 292          aggregateDT(classifiers = classifPrediction, target = phenotype, | 
|  | 293                      out = out, prediction = prediction, model=model) | 
|  | 294        }, | 
|  | 295        lasso={ | 
|  | 296          aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype, | 
|  | 297                      out = out, prediction = prediction, model=model) | 
|  | 298        }, | 
|  | 299        rf={ | 
|  | 300          aggregateRF(classifiers = classifPrediction, target = phenotype, | 
|  | 301                      out = out, prediction = prediction, model=model) | 
|  | 302        }, | 
|  | 303        # svm | 
|  | 304        {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel, | 
|  | 305                      out = out, prediction = prediction, model = model)} | 
|  | 306        ) | 
|  | 307 # return path of the result file to galaxy | 
|  | 308 cat(paste(out, "\n", sep="")) |