Mercurial > repos > nicolas > oghma
comparison evaluate_aggregation.R @ 94:f90c741e2a32 draft
Uploaded
author | nicolas |
---|---|
date | Mon, 31 Oct 2016 05:50:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
93:3cdfa1e4b1c1 | 94:f90c741e2a32 |
---|---|
1 ######################################################## | |
2 # | |
3 # creation date : 10/10/16 | |
4 # last modification : 29/10/16 | |
5 # author : Dr Nicolas Beaume | |
6 # | |
7 ######################################################## | |
8 | |
9 suppressWarnings(suppressMessages(library(GA))) | |
10 library("miscTools") | |
11 library(rpart) | |
12 suppressWarnings(suppressMessages(library(randomForest))) | |
13 library(e1071) | |
14 suppressWarnings(suppressMessages(library(glmnet))) | |
15 library(rrBLUP) | |
16 options(warn=-1) | |
17 ############################ helper functions ####################### | |
18 | |
19 ##### classifiers | |
20 prediction <- function(genotype, model, classifier="unknown") { | |
21 # run prediction according to the classifier | |
22 switch(classifier, | |
23 rrBLUP={ | |
24 predictions <- as.matrix(genotype) %*% as.matrix(model$u); | |
25 predictions <- predictions[,1]+model$beta; | |
26 }, | |
27 rf={ | |
28 predictions <- predict(model, genotype); | |
29 }, | |
30 svm={ | |
31 predictions <- predict(model, genotype); | |
32 }, | |
33 lasso={ | |
34 predictions <- predict(model, as.matrix(genotype), type = "response"); | |
35 }, | |
36 {warning("unkonwn classifier, please choose among the following : rrBLUP, rf, svm, lasso")}) | |
37 return(predictions) | |
38 } | |
39 | |
40 # extract parameter from a model, excluding rrBLUP which auto-optimize | |
41 extractParameter <- function(model, classifierName) { | |
42 param <- NULL | |
43 switch(classifierName, | |
44 # random forest | |
45 rf={ | |
46 param <- model$ntree | |
47 param <- c(param, list(model$mtry)) | |
48 names(param) <- c("ntree", "mtry") | |
49 }, | |
50 # svm | |
51 svm={ | |
52 param <- as.numeric(model$cost) | |
53 param <- c(param, list(model$gamma)) | |
54 param <- c(param, list(model$coef0)) | |
55 param <- c(param, list(model$degree)) | |
56 param <- c(param, list(model$kernel)) | |
57 names(param) <- c("c", "g", "coef", "d", "kernel") | |
58 switch((model$kernel+1), | |
59 param$kernel <- "linear", | |
60 param$kernel <- "polynomial", | |
61 param$kernel <- "radial", | |
62 param$kernel <- "sigmoid" | |
63 ) | |
64 }, | |
65 # lasso | |
66 lasso={ | |
67 param <- as.list(model$lambda) | |
68 names(param) <- "lambda" | |
69 }, | |
70 {print(paste("unknown classifier, please choose among rf, svm, lasso")); | |
71 stop()} | |
72 ) | |
73 return(param) | |
74 } | |
75 | |
76 ##### Genetic algorithm | |
77 | |
78 # compute r2 by computing the classic formula | |
79 # compare the sum of square difference from target to prediciton | |
80 # to the sum of square difference from target to the mean of the target | |
81 r2 <- function(target, prediction) { | |
82 sst <- sum((target-mean(target))^2) | |
83 ssr <- sum((target-prediction)^2) | |
84 return(1-ssr/sst) | |
85 } | |
86 | |
87 optimizeOneIndividual <- function(values, trueValue) { | |
88 # change the value into a function | |
89 f <- function(w) {sum(values * w/sum(w))} | |
90 fitness <- function(x) {1/abs(trueValue-f(x))} | |
91 resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), | |
92 maxiter = 1000, monitor = NULL, keepBest = T) | |
93 resp@solution <- resp@solution/sum(resp@solution) | |
94 return(resp) | |
95 } | |
96 | |
97 optimizeWeight <- function(values, trueValue, n=1000) { | |
98 fitnessAll <- function(w) { | |
99 predicted <- apply(values, 1, weightedPrediction.vec, w) | |
100 return(mean(r2(trueValue, predicted))) | |
101 #return(mean(1/abs(trueValue-predicted))) | |
102 } | |
103 resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), | |
104 maxiter = n, monitor = NULL, keepBest = T) | |
105 resp@solution <- resp@solution/sum(resp@solution) | |
106 return(resp) | |
107 } | |
108 | |
109 weightedPrediction <- function(classifiers, w) { | |
110 if(length(w) > ncol(classifiers)) { | |
111 warning("more weights than classifiers, extra weigths are ignored") | |
112 w <- w[1:ncol(classifiers)] | |
113 } else if(length(w) < ncol(classifiers)) { | |
114 warning("less weights than classifiers, extra classifiers are ignored") | |
115 classifiers <- classifiers[,1:length(w)] | |
116 } | |
117 prediction <- NULL | |
118 prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) | |
119 return(prediction) | |
120 } | |
121 | |
122 weightedPrediction.vec <- function(values, w) { | |
123 return(sum(values * w/sum(w))) | |
124 } | |
125 | |
126 ##### meta-decision tree | |
127 | |
128 tuneTree <- function(data, target) { | |
129 data <- data.frame(data, target=target) | |
130 size <- nrow(data) | |
131 xerror <- NULL | |
132 split <- 1:ceiling(size/5) | |
133 leafSize <- 1:ceiling(size/10) | |
134 xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | |
135 cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | |
136 for(i in 1:length(split)) { | |
137 for(j in 1:length(leafSize)) { | |
138 op <- list(minsplit=split[i], minbucket=leafSize[j]) | |
139 tree <- rpart(target ~., data=data, control=op, method="anova") | |
140 xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] | |
141 cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] | |
142 } | |
143 } | |
144 index <- which(xerror==min(xerror), arr.ind = T) | |
145 op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) | |
146 return(op) | |
147 } | |
148 | |
149 ###### meta-LASSO | |
150 # create fold by picking at random row indexes | |
151 createFolds <- function(nbObs, n) { | |
152 # pick indexes | |
153 index <- sample(1:n, size=nbObs, replace = T) | |
154 # populate folds | |
155 folds <- NULL | |
156 for(i in 1:n) { | |
157 folds <- c(folds, list(which(index==i))) | |
158 } | |
159 return(folds) | |
160 } | |
161 | |
162 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { | |
163 folds <- createFolds(nrow(genotype), n = n) | |
164 acc <- NULL | |
165 indexAlpha <- 1 | |
166 for(a in alpha) { | |
167 curAcc <- NULL | |
168 for(i in 1:n) { | |
169 train <- genotype[-folds[[i]],] | |
170 test <- genotype[folds[[i]],] | |
171 phenoTrain <- phenotype[-folds[[i]]] | |
172 phenoTest <- phenotype[folds[[i]]] | |
173 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | |
174 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | |
175 pred <- predict(model, test, type = "response") | |
176 curAcc <- c(curAcc, r2(phenoTest, pred)) | |
177 } | |
178 acc <- c(acc, mean(curAcc)) | |
179 } | |
180 names(acc) <- alpha | |
181 return(as.numeric(names(acc)[which.max(acc)])) | |
182 } | |
183 | |
184 ###### meta-random forest | |
185 | |
186 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { | |
187 n <- ceiling(nrow(genotype)/3) | |
188 indexTest <- sample(1:nrow(genotype), size=n) | |
189 train <- genotype[-indexTest,] | |
190 test <- genotype[indexTest,] | |
191 phenoTrain <- phenotype[-indexTest] | |
192 phenoTest <- phenotype[indexTest] | |
193 acc <- NULL | |
194 indexNtree <- 1 | |
195 for(ntree in rangeNtree) { | |
196 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) | |
197 pred <- predict(model, test) | |
198 acc <- c(acc, r2(phenoTest, pred)) | |
199 } | |
200 names(acc) <- rangeNtree | |
201 best <- which.max(acc) | |
202 return(as.numeric(names(acc)[best])) | |
203 } | |
204 | |
205 ###### meta-SVM | |
206 searchParamSVM <- function(train, target, kernel="radial") { | |
207 # tuning parameters then train | |
208 model <- NULL | |
209 switch(kernel, | |
210 sigmoid={ | |
211 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); | |
212 g <- tune$best.parameters[[1]]; | |
213 c <- tune$best.parameters[[2]]; | |
214 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, | |
215 linear={ | |
216 tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); | |
217 c <- tune$best.parameters[[1]]; | |
218 model <- svm(x=train, y=target, cost = c, kernel = "linear")}, | |
219 polynomial={ | |
220 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); | |
221 d <- tune$best.parameters[[1]]; | |
222 g <- tune$best.parameters[[2]]; | |
223 coef <- tune$best.parameters[[3]]; | |
224 c <- tune$best.parameters[[4]]; | |
225 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, | |
226 { | |
227 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); | |
228 g <- tune$best.parameters[[1]]; | |
229 c <- tune$best.parameters[[2]]; | |
230 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} | |
231 ) | |
232 return(model) | |
233 } | |
234 | |
235 #################### upper level functions ##################### | |
236 | |
237 aggregateDT <- function(train, test, target, folds) { | |
238 r2Aggreg <- NULL | |
239 for (i in 1:length(folds)) { | |
240 trainIndex <- unlist(folds[-i]) | |
241 testIndex <- folds[[i]] | |
242 treeParam <- tuneTree(train[[i]], target[trainIndex]) | |
243 data <- data.frame(train[[i]], target=target[trainIndex]) | |
244 model <- rpart(target ~., data=data, method = "anova", control = treeParam) | |
245 model <- prune(model, cp=treeParam["cp"]) | |
246 pred <- predict(model, data.frame(test[[i]])) | |
247 r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) | |
248 } | |
249 return(r2Aggreg) | |
250 } | |
251 | |
252 aggregateGeneticMean <- function(train, test, target, folds) { | |
253 r2Aggreg <- NULL | |
254 for (i in 1:length(folds)) { | |
255 trainIndex <- unlist(folds[-i]) | |
256 testIndex <- folds[[i]] | |
257 opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex]) | |
258 pred <- weightedPrediction(test[[i]], opt@solution) | |
259 r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) | |
260 } | |
261 return(r2Aggreg) | |
262 } | |
263 | |
264 aggregateLASSO <- function(train, test, target, folds) { | |
265 r2Aggreg <- NULL | |
266 for (i in 1:length(folds)) { | |
267 trainIndex <- unlist(folds[-i]) | |
268 testIndex <- folds[[i]] | |
269 alpha <- searchParamLASSO(train[[i]], target[trainIndex]) | |
270 cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha) | |
271 model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se) | |
272 pred <- predict(model, test[[i]]) | |
273 r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) | |
274 } | |
275 return(r2Aggreg) | |
276 } | |
277 | |
278 aggregateRF <- function(train, test, target, folds) { | |
279 r2Aggreg <- NULL | |
280 for (i in 1:length(folds)) { | |
281 trainIndex <- unlist(folds[-i]) | |
282 testIndex <- folds[[i]] | |
283 ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex], | |
284 rangeNtree = seq(100, 1000, 100)) | |
285 model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex], | |
286 ntree = ntree, mtry = ncol(train[[i]])) | |
287 pred <- predict(model, test[[i]]) | |
288 r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) | |
289 } | |
290 return(r2Aggreg) | |
291 } | |
292 | |
293 aggregateSVM <- function(train, test, target, folds, kernel="linear") { | |
294 r2Aggreg <- NULL | |
295 for (i in 1:length(folds)) { | |
296 trainIndex <- unlist(folds[-i]) | |
297 testIndex <- folds[[i]] | |
298 model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel) | |
299 pred <- predict(model, test[[i]]) | |
300 r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred)) | |
301 } | |
302 return(r2Aggreg) | |
303 } | |
304 | |
305 ################################### main ############################# | |
306 # # load argument | |
307 cmd <- commandArgs(T) | |
308 source(cmd[1]) | |
309 # load folds | |
310 con = file(folds) | |
311 folds <- readLines(con = con, n = 1, ok=T) | |
312 close(con) | |
313 folds <- readRDS(folds) | |
314 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | |
315 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | |
316 # load genotype | |
317 con = file(genotype) | |
318 genotype <- readLines(con = con, n = 1, ok=T) | |
319 close(con) | |
320 genotype <- read.table(genotype, sep="\t", h=T) | |
321 # find which classifiers will be used for aggregation | |
322 classifNames <- NULL | |
323 if(lassoModel !="None"){ | |
324 classifNames <- c(classifNames, "lasso") | |
325 con = file(lassoModel) | |
326 lassoModel <- readLines(con = con, n = 1, ok=T) | |
327 close(con) | |
328 lassoModel <- readRDS(lassoModel) | |
329 } | |
330 if(rrBLUPModel !="None"){ | |
331 classifNames <- c(classifNames, "rrBLUP") | |
332 con = file(rrBLUPModel) | |
333 rrBLUPModel <- readLines(con = con, n = 1, ok=T) | |
334 close(con) | |
335 rrBLUPModel <- readRDS(rrBLUPModel) | |
336 } | |
337 if(rfModel !="None"){ | |
338 classifNames <- c(classifNames, "rf") | |
339 con = file(rfModel) | |
340 rfModel <- readLines(con = con, n = 1, ok=T) | |
341 close(con) | |
342 rfModel <- readRDS(rfModel) | |
343 } | |
344 if(svmModel !="None"){ | |
345 classifNames <- c(classifNames, "svm") | |
346 con = file(svmModel) | |
347 svmModel <- readLines(con = con, n = 1, ok=T) | |
348 close(con) | |
349 svmModel <- readRDS(svmModel) | |
350 } | |
351 | |
352 # compute prediction of the training set and test set for each fold and each classifiers | |
353 # train predictions and test prediction are stored in separate lists | |
354 # where each element of the list represent a folds | |
355 predictionTrain.list <- NULL | |
356 predictionTest.list <- NULL | |
357 r2Classif.list <- NULL | |
358 for(i in 1:length(folds)) { | |
359 # for the current fold, create training set and test set | |
360 trainIndex <- unlist(folds[-i]) | |
361 testIndex <- folds[[i]] | |
362 train <- genotype[trainIndex,] | |
363 phenoTrain <- phenotype[trainIndex] | |
364 test <- genotype[testIndex,] | |
365 phenoTest <- phenotype[testIndex] | |
366 # only to intialize data frame containing predictions | |
367 predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)), | |
368 ncol=length(classifNames)) | |
369 colnames(predictionTrain) <- classifNames | |
370 predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)), | |
371 ncol=length(classifNames)) | |
372 colnames(predictionTest) <- classifNames | |
373 r2Classif <- NULL | |
374 # for each classifiers, compute prediction on both sets | |
375 # and evaluate r2 to find the best classifier | |
376 for(j in 1:length(classifNames)) { | |
377 switch(classifNames[j], | |
378 # random forest | |
379 rf={ | |
380 # predict train and test | |
381 param <- extractParameter(rfModel, "rf") | |
382 model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry, | |
383 ntree = param$ntree); | |
384 predictionTrain[,j] <- prediction(train, model, classifier = "rf"); | |
385 predictionTest[,j] <- prediction(test, model, classifier = "rf"); | |
386 r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))}, | |
387 # svm | |
388 svm={ | |
389 # predict train and test | |
390 param <- extractParameter(svmModel, "svm"); | |
391 model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c, | |
392 gamma=param$g, degree = param$d, coef0 = param$coef, scale = F) | |
393 predictionTrain[,j] <- prediction(train, model, classifier = "svm"); | |
394 predictionTest[,j] <- prediction(test, model, classifier = "svm"); | |
395 r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))}, | |
396 # lasso | |
397 lasso={ | |
398 # predict train and test | |
399 param <- extractParameter(lassoModel, "lasso"); | |
400 model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda); | |
401 predictionTrain[,j] <- prediction(train, model, classifier = "lasso"); | |
402 predictionTest[,j] <- prediction(test, model, classifier = "lasso"); | |
403 r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))}, | |
404 # rrBLUP | |
405 rrBLUP={ | |
406 # predict train and test | |
407 model <- mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F); | |
408 predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP"); | |
409 predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP"); | |
410 r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))}, | |
411 {print(paste("unknown classifier, please choose among rf, svm, lasso, rrBLUP")); | |
412 stop()} | |
413 ) | |
414 } | |
415 predictionTrain.list <- c(predictionTrain.list, list(predictionTrain)) | |
416 predictionTest.list <- c(predictionTest.list, list(predictionTest)) | |
417 r2Classif.list <- c(r2Classif.list, list(r2Classif)) | |
418 } | |
419 # aggregate ! | |
420 switch(method, | |
421 geneticMean={ | |
422 aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list, | |
423 target = phenotype, folds=folds) | |
424 }, | |
425 dt={ | |
426 aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list, | |
427 target = phenotype, folds=folds) | |
428 }, | |
429 lasso={ | |
430 aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list, | |
431 target = phenotype, folds=folds) | |
432 }, | |
433 rf={ | |
434 aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list, | |
435 target = phenotype, folds=folds) | |
436 }, | |
437 # svm, by default | |
438 {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list, | |
439 target = phenotype, folds=folds, kernel=kernel)} | |
440 ) | |
441 # determine best classifier | |
442 # first, transform list into a matrix | |
443 r2Classif.list <- t(data.frame(r2Classif.list)) | |
444 # then, compute the mean r2 for each classifier | |
445 meanR2Classif <- apply(r2Classif.list, 2, mean) | |
446 # choose the best one | |
447 bestClassif <- which.max(meanR2Classif) | |
448 # compare aggregation and best classifiers | |
449 finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg, | |
450 diff=(aggreg-r2Classif.list[,bestClassif])) | |
451 print(apply(finalRes, 2, mean)) |