########################################################
#
# creation date : 10/10/16
# last modification : 29/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library(miscTools)
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
library(rrBLUP)
options(warn=-1)
############################ helper functions #######################

##### classifiers
# run the prediction of a given model on a genotype,
# dispatching on the classifier type
prediction <- function(genotype, model, classifier="unknown") {
  switch(classifier,
         rrBLUP={
           predictions <- as.matrix(genotype) %*% as.matrix(model$u)
           predictions <- predictions[,1] + model$beta
         },
         rf={
           predictions <- predict(model, genotype)
         },
         svm={
           predictions <- predict(model, genotype)
         },
         lasso={
           predictions <- predict(model, as.matrix(genotype), type = "response")
         },
         # stop instead of warn: predictions would be undefined otherwise
         {stop("unknown classifier, please choose among the following : rrBLUP, rf, svm, lasso")})
  return(predictions)
}

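# Minimal usage sketch (guarded so it never runs when the script is sourced):
# for rrBLUP, mixed.solve() returns marker effects in $u and an intercept in
# $beta, so prediction() reduces to genotype %*% u + beta. `geno` and `pheno`
# below are toy stand-ins, not real data.
if (FALSE) {
  geno <- matrix(sample(c(-1, 0, 1), 50*20, replace=TRUE), nrow=50)
  pheno <- rnorm(50)
  fit <- mixed.solve(pheno, Z=geno, K=NULL, SE=FALSE, return.Hinv=FALSE)
  head(prediction(geno, fit, classifier="rrBLUP"))
}
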
# extract the tuned parameters from a model; rrBLUP is excluded
# because it optimizes its own parameters
extractParameter <- function(model, classifierName) {
  param <- NULL
  switch(classifierName,
         # random forest
         rf={
           param <- list(ntree=model$ntree, mtry=model$mtry)
         },
         # svm
         svm={
           param <- list(c=as.numeric(model$cost), g=model$gamma,
                         coef=model$coef0, d=model$degree, kernel=model$kernel)
           # e1071 stores the kernel as an integer code (0 to 3)
           param$kernel <- switch(model$kernel+1,
                                  "linear", "polynomial", "radial", "sigmoid")
         },
         # lasso
         lasso={
           param <- list(lambda=model$lambda)
         },
         {stop("unknown classifier, please choose among rf, svm, lasso")}
  )
  return(param)
}

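# Quick illustration (guarded, toy data): fit a radial SVM, then pull its
# tuned parameters back out as the named list used later in the main loop.
if (FALSE) {
  toy <- svm(x=matrix(rnorm(100), ncol=2), y=rnorm(50), kernel="radial")
  extractParameter(toy, "svm")   # list with c, g, coef, d, kernel
}
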
##### Genetic algorithm

# compute r2 with the classic formula:
# compare the sum of squared differences between target and prediction
# to the sum of squared differences between target and the target mean
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}

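# Worked example (guarded, toy numbers): a perfect prediction gives r2 = 1;
# predicting the mean for every observation gives r2 = 0.
if (FALSE) {
  y <- c(1, 2, 3, 4)
  r2(y, y)                 # 1
  r2(y, rep(mean(y), 4))   # 0
}
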
# optimize the weights of a weighted mean so that it matches a single
# known value, using a genetic algorithm
optimizeOneIndividual <- function(values, trueValue) {
  # fitness: the closer the weighted mean gets to the true value, the better
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness,
             min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = TRUE)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

# optimize one weight per classifier over a whole prediction matrix,
# maximizing the r2 of the weighted predictions
optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll,
             min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = TRUE)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

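# Sketch of the intended call (guarded, toy data): `values` holds one row per
# observation and one column per classifier, `trueValue` the observed
# phenotype; the GA returns per-classifier weights normalized to sum to 1.
if (FALSE) {
  truth <- rnorm(30)
  preds <- cbind(rf=truth+rnorm(30, sd=0.3), svm=truth+rnorm(30, sd=0.5))
  opt <- optimizeWeight(values=preds, trueValue=truth, n=100)
  opt@solution
}
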
# weighted mean of the classifier predictions, one prediction per row
weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- apply(classifiers, 1, weightedPrediction.vec, w)
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}

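# Tiny check (guarded): weights c(3, 1) normalize to (0.75, 0.25), so each
# prediction is 0.75*col1 + 0.25*col2.
if (FALSE) {
  m <- cbind(c(1, 2), c(3, 6))
  weightedPrediction(m, c(3, 1))   # 1.5, 3
}
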
##### meta-decision tree

# grid search over minsplit and minbucket; for each pair, keep the
# cross-validated error and cp of the best subtree, then return the
# combination that minimizes the cross-validated error
tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # in case of ties, keep the first optimum
  index <- which(xerror==min(xerror), arr.ind = TRUE)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}

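# Example call (guarded, toy regression): tune the control parameters, then
# grow the final tree with them, mirroring what aggregateDT does per fold.
if (FALSE) {
  x <- matrix(rnorm(200), ncol=2)
  y <- x[,1] + rnorm(100, sd=0.2)
  op <- tuneTree(x, y)
  rpart(target ~ ., data=data.frame(x, target=y), control=op, method="anova")
}
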
###### meta-LASSO

# create n folds by assigning each row index to a fold at random
createFolds <- function(nbObs, n) {
  # pick fold membership for each observation
  index <- sample(1:n, size=nbObs, replace = TRUE)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}

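# What the folds look like (guarded): a list of n index vectors, disjoint and
# covering 1..nbObs; fold sizes vary because assignment is random.
if (FALSE) {
  set.seed(1)
  createFolds(nbObs=10, n=3)
}
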
# pick the elastic-net mixing parameter alpha by n-fold cross-validation,
# scoring each candidate by the mean r2 over the folds
searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      # predict.glmnet expects a matrix for newx
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}

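# Intended usage (guarded, toy data): select alpha, then refit with the
# cross-validated lambda, as aggregateLASSO does below.
if (FALSE) {
  geno <- matrix(rnorm(500), ncol=10)
  pheno <- geno[,1] - geno[,2] + rnorm(50, sd=0.3)
  a <- searchParamLASSO(geno, pheno, n=5)
  cv <- cv.glmnet(x=geno, y=pheno, alpha=a)
  glmnet(x=geno, y=pheno, alpha=a, lambda=cv$lambda.1se)
}
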
###### meta-random forest

# pick ntree on a random 1/3 hold-out, scoring each candidate by r2
searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}

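# Example call (guarded, toy data): search over a coarse ntree grid, as
# aggregateRF does below with seq(100, 1000, 100).
if (FALSE) {
  geno <- matrix(rnorm(300), ncol=3)
  pheno <- geno[,1] + rnorm(100, sd=0.5)
  searchParamRF(geno, pheno, rangeNtree=seq(100, 500, 100))
}
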
###### meta-SVM

# tune the kernel-specific parameters with tune.svm, then train the final
# model with the best combination
searchParamSVM <- function(train, target, kernel="radial") {
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           c <- tune$best.parameters[[1]]
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial")
           d <- tune$best.parameters[[1]]
           g <- tune$best.parameters[[2]]
           coef <- tune$best.parameters[[3]]
           c <- tune$best.parameters[[4]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # radial, by default
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}

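# Example (guarded, toy data): tune.svm cross-validates over the gamma/cost
# grid and the best pair is refit on the full training set.
if (FALSE) {
  geno <- matrix(rnorm(200), ncol=4)
  pheno <- geno[,1] + rnorm(50, sd=0.3)
  searchParamSVM(geno, pheno, kernel="radial")
}
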
#################### upper level functions #####################

# stack the per-fold classifier predictions with a decision tree
aggregateDT <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    treeParam <- tuneTree(train[[i]], target[trainIndex])
    data <- data.frame(train[[i]], target=target[trainIndex])
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    # treeParam is a list, so extract cp with [[ ]] rather than [ ]
    model <- prune(model, cp=treeParam[["cp"]])
    pred <- predict(model, data.frame(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

# stack the per-fold classifier predictions with GA-optimized weights
aggregateGeneticMean <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
    pred <- weightedPrediction(test[[i]], opt@solution)
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

# stack the per-fold classifier predictions with an elastic net
aggregateLASSO <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
    pred <- predict(model, as.matrix(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

# stack the per-fold classifier predictions with a random forest
aggregateRF <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
                          ntree = ntree, mtry = ncol(train[[i]]))
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

# stack the per-fold classifier predictions with an SVM
aggregateSVM <- function(train, test, target, folds, kernel="linear") {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

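# All aggregate* functions share one interface (guarded sketch, toy data):
# `train` and `test` are lists with one prediction matrix per fold
# (observations x classifiers), `target` is the full phenotype vector and
# `folds` the list of test indexes; each function returns one r2 per fold.
if (FALSE) {
  set.seed(2)
  pheno <- rnorm(30)
  folds <- createFolds(30, 3)
  trainPred <- list()
  testPred <- list()
  for (i in 1:3) {
    trainIdx <- unlist(folds[-i])
    testIdx <- folds[[i]]
    trainPred[[i]] <- cbind(a=pheno[trainIdx]+rnorm(length(trainIdx), sd=0.2),
                            b=pheno[trainIdx]+rnorm(length(trainIdx), sd=0.4))
    testPred[[i]] <- cbind(a=pheno[testIdx]+rnorm(length(testIdx), sd=0.2),
                           b=pheno[testIdx]+rnorm(length(testIdx), sd=0.4))
  }
  aggregateGeneticMean(trainPred, testPred, pheno, folds)   # one r2 per fold
}
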
################################### main #############################
# load arguments: cmd[1] is a config file defining the input variables
cmd <- commandArgs(T)
source(cmd[1])
# load folds: the folds file contains the path to an RDS file
con = file(folds)
folds <- readLines(con = con, n = 1, ok=T)
close(con)
folds <- readRDS(folds)
# phenotype is written as a table (in columns) but it must be passed
# as a vector to mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# load genotype: the genotype file also contains the path to the actual table
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
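# Shape of the sourced config file (guarded; the paths are illustrative, the
# variable names are the ones this script actually reads): it must define
# every variable used below, with unused models set to "None".
if (FALSE) {
  # contents of cmd[1], e.g. a config.R file:
  folds <- "folds.txt"          # one line: path to the folds RDS file
  phenotype <- "phenotype.tsv"
  genotype <- "genotype.txt"    # one line: path to the genotype table
  lassoModel <- "lasso.txt"     # or "None"
  rrBLUPModel <- "None"
  rfModel <- "rf.txt"
  svmModel <- "svm.txt"
  method <- "geneticMean"       # or dt, lasso, rf; anything else means svm
  kernel <- "linear"            # kernel used by the svm aggregation
}
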
# find which classifiers will be used for aggregation;
# each model file contains the path to the corresponding RDS file
classifNames <- NULL
if(lassoModel != "None"){
  classifNames <- c(classifNames, "lasso")
  con = file(lassoModel)
  lassoModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  lassoModel <- readRDS(lassoModel)
}
if(rrBLUPModel != "None"){
  classifNames <- c(classifNames, "rrBLUP")
  con = file(rrBLUPModel)
  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rrBLUPModel <- readRDS(rrBLUPModel)
}
if(rfModel != "None"){
  classifNames <- c(classifNames, "rf")
  con = file(rfModel)
  rfModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rfModel <- readRDS(rfModel)
}
if(svmModel != "None"){
  classifNames <- c(classifNames, "svm")
  con = file(svmModel)
  svmModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  svmModel <- readRDS(svmModel)
}

# compute predictions of the training set and test set for each fold and
# each classifier; train and test predictions are stored in separate lists
# where each element of the list represents a fold
predictionTrain.list <- NULL
predictionTest.list <- NULL
r2Classif.list <- NULL
for(i in 1:length(folds)) {
  # for the current fold, create training set and test set
  trainIndex <- unlist(folds[-i])
  testIndex <- folds[[i]]
  train <- genotype[trainIndex,]
  phenoTrain <- phenotype[trainIndex]
  test <- genotype[testIndex,]
  phenoTest <- phenotype[testIndex]
  # initialize the matrices containing the predictions
  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)),
                            ncol=length(classifNames))
  colnames(predictionTrain) <- classifNames
  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)),
                           ncol=length(classifNames))
  colnames(predictionTest) <- classifNames
  r2Classif <- NULL
  # for each classifier, compute predictions on both sets
  # and evaluate r2 to find the best classifier
  for(j in 1:length(classifNames)) {
    switch(classifNames[j],
           # random forest
           rf={
             # retrain with the tuned parameters, then predict train and test
             param <- extractParameter(rfModel, "rf")
             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry,
                                   ntree = param$ntree)
             predictionTrain[,j] <- prediction(train, model, classifier = "rf")
             predictionTest[,j] <- prediction(test, model, classifier = "rf")
             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
           # svm
           svm={
             param <- extractParameter(svmModel, "svm")
             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
                          gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "svm")
             predictionTest[,j] <- prediction(test, model, classifier = "svm")
             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
           # lasso
           lasso={
             param <- extractParameter(lassoModel, "lasso")
             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda)
             predictionTrain[,j] <- prediction(train, model, classifier = "lasso")
             predictionTest[,j] <- prediction(test, model, classifier = "lasso")
             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
           # rrBLUP
           rrBLUP={
             model <- mixed.solve(phenoTrain, Z=train, K=NULL, SE=F, return.Hinv = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP")
             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP")
             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
           {stop("unknown classifier, please choose among rf, svm, lasso, rrBLUP")}
    )
  }
  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
  predictionTest.list <- c(predictionTest.list, list(predictionTest))
  r2Classif.list <- c(r2Classif.list, list(r2Classif))
}
# aggregate!
switch(method,
       geneticMean={
         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
                                        target = phenotype, folds=folds)
       },
       dt={
         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       lasso={
         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
                                  target = phenotype, folds=folds)
       },
       rf={
         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       # svm, by default
       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds, kernel=kernel)}
)
# determine the best single classifier:
# first, transform the list into a matrix (one row per fold)
r2Classif.list <- t(data.frame(r2Classif.list))
# then, compute the mean r2 for each classifier
meanR2Classif <- apply(r2Classif.list, 2, mean)
# and choose the best one
bestClassif <- which.max(meanR2Classif)
# compare the aggregation to the best single classifier, fold by fold
finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
                  diff=(aggreg-r2Classif.list[,bestClassif]))
print(apply(finalRes, 2, mean))
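# Typical invocation (the script name is illustrative; the config file must
# define the variables listed in the sketch after the genotype loading above):
#   Rscript aggregate.R config.R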