########################################################
#
# creation date : 10/10/16
# last modification : 29/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
library(rrBLUP)
options(warn=-1)
############################ helper functions #######################

##### classifiers
prediction <- function(genotype, model, classifier="unknown") {
  # run the prediction according to the classifier
  switch(classifier,
         rrBLUP={
           predictions <- as.matrix(genotype) %*% as.matrix(model$u);
           predictions <- predictions[,1]+model$beta;
         },
         rf={
           predictions <- predict(model, genotype);
         },
         svm={
           predictions <- predict(model, genotype);
         },
         lasso={
           predictions <- predict(model, as.matrix(genotype), type = "response");
         },
         {warning("unknown classifier, please choose among the following: rrBLUP, rf, svm, lasso")})
  return(predictions)
}
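
# Usage sketch (hypothetical objects, for illustration only): `geno` is a
# numeric marker matrix and `fit` an rrBLUP model returned by mixed.solve(),
# as in the main section below.
# fit  <- mixed.solve(pheno, Z = geno)
# yhat <- prediction(geno, fit, classifier = "rrBLUP")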

# extract parameters from a model, excluding rrBLUP which auto-optimizes
extractParameter <- function(model, classifierName) {
  param <- NULL
  switch(classifierName,
         # random forest
         rf={
           param <- model$ntree
           param <- c(param, list(model$mtry))
           names(param) <- c("ntree", "mtry")
         },
         # svm
         svm={
           param <- as.numeric(model$cost)
           param <- c(param, list(model$gamma))
           param <- c(param, list(model$coef0))
           param <- c(param, list(model$degree))
           param <- c(param, list(model$kernel))
           names(param) <- c("c", "g", "coef", "d", "kernel")
           # e1071 stores the kernel as a 0-based integer code
           switch((model$kernel+1),
                  param$kernel <- "linear",
                  param$kernel <- "polynomial",
                  param$kernel <- "radial",
                  param$kernel <- "sigmoid"
           )
         },
         # lasso
         lasso={
           param <- as.list(model$lambda)
           names(param) <- "lambda"
         },
         {stop("unknown classifier, please choose among rf, svm, lasso")}
  )
  return(param)
}

##### Genetic algorithm

# compute r2 using the classic formula:
# compare the sum of squared differences from target to prediction
# to the sum of squared differences from target to the mean of the target
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
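
# Worked toy example (made-up numbers, for illustration only):
# r2(c(1, 2, 3, 4), c(1.1, 1.9, 3.2, 3.8))
# here sst = 5 and ssr = 0.1, so r2 returns 1 - 0.1/5 = 0.98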

optimizeOneIndividual <- function(values, trueValue) {
  # turn the predicted values into a weighted-mean function of the weights
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
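
# Usage sketch (made-up data, for illustration only): `preds` holds one row
# per individual and one column per classifier; the GA searches for the
# weights whose weighted mean best fits the observed phenotype.
# preds <- cbind(rrBLUP = rnorm(50), lasso = rnorm(50))
# opt   <- optimizeWeight(values = preds, trueValue = rnorm(50))
# opt@solution  # normalized weights, one per classifier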

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- NULL
  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}
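
# Worked toy example (made-up numbers, for illustration only): two
# classifiers predicting 10 and 20, with weights 0.25 and 0.75:
# weightedPrediction.vec(c(10, 20), c(0.25, 0.75))  # = 17.5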

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # in case of ties, keep the first optimum
  index <- which(xerror==min(xerror), arr.ind = T)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
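
# Usage sketch (hypothetical objects, for illustration only):
# op   <- tuneTree(predsTrain, phenoTrain)
# tree <- rpart(target ~ ., data = data.frame(predsTrain, target = phenoTrain),
#               control = op, method = "anova")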

###### meta-LASSO
# create folds by assigning row indexes to folds at random
createFolds <- function(nbObs, n) {
  # pick a fold index for each observation
  # (sampling with replacement, so folds are only approximately balanced)
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
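
# Usage sketch (for illustration only): split 100 observations into 7 folds.
# folds <- createFolds(100, n = 7)
# str(folds)  # list of 7 integer vectors of row indexes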

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}
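
# Usage sketch (hypothetical objects, for illustration only): pick the
# elastic-net mixing parameter by cross-validated r2 on the training set.
# bestAlpha <- searchParamLASSO(genoTrain, phenoTrain)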

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  # hold out a third of the data to evaluate each forest size
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}
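
# Usage sketch (hypothetical objects, for illustration only):
# ntree <- searchParamRF(genoTrain, phenoTrain, rangeNtree = seq(100, 1000, 100))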

###### meta-SVM
searchParamSVM <- function(train, target, kernel="radial") {
  # tune the parameters, then train a model with the best combination
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear");
           c <- tune$best.parameters[[1]];
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
           d <- tune$best.parameters[[1]];
           g <- tune$best.parameters[[2]];
           coef <- tune$best.parameters[[3]];
           c <- tune$best.parameters[[4]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # radial, by default
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}
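
# Usage sketch (hypothetical objects, for illustration only):
# model <- searchParamSVM(predsTrain, phenoTrain, kernel = "linear")
# yhat  <- predict(model, predsTest)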

#################### upper level functions #####################

aggregateDT <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    treeParam <- tuneTree(train[[i]], target[trainIndex])
    data <- data.frame(train[[i]], target=target[trainIndex])
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam$cp)
    pred <- predict(model, data.frame(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateGeneticMean <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
    pred <- weightedPrediction(test[[i]], opt@solution)
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateLASSO <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
    pred <- predict(model, as.matrix(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateRF <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
                          ntree = ntree, mtry = ncol(train[[i]]))
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateSVM <- function(train, test, target, folds, kernel="linear") {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}
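
# Usage sketch (hypothetical objects, for illustration only): `predsTrain`
# and `predsTest` are lists holding one per-fold matrix of classifier
# predictions each, as built in the main section below.
# r2PerFold <- aggregateRF(train = predsTrain, test = predsTest,
#                          target = phenotype, folds = folds)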

################################### main #############################
# load arguments
cmd <- commandArgs(T)
source(cmd[1])
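
# The sourced configuration file is expected to define the variables used
# below. A minimal sketch (hypothetical paths, for illustration only):
# folds       <- "folds.txt"      # first line gives the path to an RDS of folds
# phenotype   <- "phenotype.tsv"  # tab-separated phenotype table
# genotype    <- "genotype.txt"   # first line gives the path to the genotype table
# lassoModel  <- "lasso.txt"      # per-classifier model pointer files, or "None" to skip
# rrBLUPModel <- "None"
# rfModel     <- "rf.txt"
# svmModel    <- "svm.txt"
# method      <- "geneticMean"    # or "dt", "lasso", "rf"; anything else means svm
# kernel      <- "linear"         # used only by the svm aggregation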
# load folds: the input file's first line gives the path to an RDS file
con <- file(folds)
folds <- readLines(con = con, n = 1, ok=T)
close(con)
folds <- readRDS(folds)
# the phenotype is written as a table (in columns) but must be passed as a vector to mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# load the genotype, again through a one-line pointer file
con <- file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# find which classifiers will be used for aggregation
classifNames <- NULL
if(lassoModel != "None"){
  classifNames <- c(classifNames, "lasso")
  con <- file(lassoModel)
  lassoModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  lassoModel <- readRDS(lassoModel)
}
if(rrBLUPModel != "None"){
  classifNames <- c(classifNames, "rrBLUP")
  con <- file(rrBLUPModel)
  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rrBLUPModel <- readRDS(rrBLUPModel)
}
if(rfModel != "None"){
  classifNames <- c(classifNames, "rf")
  con <- file(rfModel)
  rfModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rfModel <- readRDS(rfModel)
}
if(svmModel != "None"){
  classifNames <- c(classifNames, "svm")
  con <- file(svmModel)
  svmModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  svmModel <- readRDS(svmModel)
}

# compute predictions on the training and test sets for each fold and each
# classifier; train and test predictions are stored in separate lists
# where each element of the list represents a fold
predictionTrain.list <- NULL
predictionTest.list <- NULL
r2Classif.list <- NULL
for(i in 1:length(folds)) {
  # for the current fold, create the training and test sets
  trainIndex <- unlist(folds[-i])
  testIndex <- folds[[i]]
  train <- genotype[trainIndex,]
  phenoTrain <- phenotype[trainIndex]
  test <- genotype[testIndex,]
  phenoTest <- phenotype[testIndex]
  # initialize the matrices containing the predictions
  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)),
                            ncol=length(classifNames))
  colnames(predictionTrain) <- classifNames
  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)),
                           ncol=length(classifNames))
  colnames(predictionTest) <- classifNames
  r2Classif <- NULL
  # for each classifier, compute predictions on both sets
  # and evaluate r2 to find the best classifier
  for(j in 1:length(classifNames)) {
    switch(classifNames[j],
           # random forest
           rf={
             # predict train and test
             param <- extractParameter(rfModel, "rf")
             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry,
                                   ntree = param$ntree);
             predictionTrain[,j] <- prediction(train, model, classifier = "rf");
             predictionTest[,j] <- prediction(test, model, classifier = "rf");
             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
           # svm
           svm={
             # predict train and test
             param <- extractParameter(svmModel, "svm");
             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
                          gamma = param$g, degree = param$d, coef0 = param$coef, scale = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "svm");
             predictionTest[,j] <- prediction(test, model, classifier = "svm");
             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
           # lasso
           lasso={
             # predict train and test
             param <- extractParameter(lassoModel, "lasso");
             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda);
             predictionTrain[,j] <- prediction(train, model, classifier = "lasso");
             predictionTest[,j] <- prediction(test, model, classifier = "lasso");
             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
           # rrBLUP
           rrBLUP={
             # predict train and test
             model <- mixed.solve(phenoTrain, Z=train, K=NULL, SE=F, return.Hinv = F);
             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP");
             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP");
             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
           {stop("unknown classifier, please choose among rf, svm, lasso, rrBLUP")}
    )
  }
  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
  predictionTest.list <- c(predictionTest.list, list(predictionTest))
  r2Classif.list <- c(r2Classif.list, list(r2Classif))
}
# aggregate!
switch(method,
       geneticMean={
         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
                                        target = phenotype, folds=folds)
       },
       dt={
         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       lasso={
         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
                                  target = phenotype, folds=folds)
       },
       rf={
         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       # svm, by default
       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds, kernel=kernel)}
)
# determine the best single classifier
# first, transform the list into a matrix
r2Classif.list <- t(data.frame(r2Classif.list))
# then, compute the mean r2 of each classifier
meanR2Classif <- apply(r2Classif.list, 2, mean)
# and choose the best one
bestClassif <- which.max(meanR2Classif)
# compare the aggregation to the best single classifier
finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
                  diff=(aggreg-r2Classif.list[,bestClassif]))
print(apply(finalRes, 2, mean))