comparison aggregation.R @ 46:ac0c3826cca4 draft

Uploaded
author nicolas
date Wed, 26 Oct 2016 02:32:35 -0400
parents
children cea4a54c52d0
comparison
equal deleted inserted replaced
45:96dc9d514099 46:ac0c3826cca4
1 ########################################################
2 #
3 # creation date : 25/10/16
4 # last modification : 25/10/16
5 # author : Dr Nicolas Beaume
6 #
7 ########################################################
8
9 suppressWarnings(suppressMessages(library(GA)))
10 library("miscTools")
11 library(rpart)
12 suppressWarnings(suppressMessages(library(randomForest)))
13 library(e1071)
14 suppressWarnings(suppressMessages(library(glmnet)))
15 ############################ helper functions #######################
16
17 ##### Genetic algorithm
18 optimizeOneIndividual <- function(values, trueValue) {
19 # change the value into a function
20 f <- function(w) {sum(values * w/sum(w))}
21 fitness <- function(x) {1/abs(trueValue-f(x))}
22 resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
23 maxiter = 1000, monitor = NULL, keepBest = T)
24 resp@solution <- resp@solution/sum(resp@solution)
25 return(resp)
26 }
27
28 optimizeWeight <- function(values, trueValue, n=1000) {
29 fitnessAll <- function(w) {
30 predicted <- apply(values, 1, weightedPrediction.vec, w)
31 return(mean(r2(trueValue, predicted)))
32 #return(mean(1/abs(trueValue-predicted)))
33 }
34 resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
35 maxiter = n, monitor = NULL, keepBest = T)
36 resp@solution <- resp@solution/sum(resp@solution)
37 return(resp)
38 }
39
40 weightedPrediction <- function(classifiers, w) {
41 if(length(w) > ncol(classifiers)) {
42 warning("more weights than classifiers, extra weigths are ignored")
43 w <- w[1:ncol(classifiers)]
44 } else if(length(w) < ncol(classifiers)) {
45 warning("less weights than classifiers, extra classifiers are ignored")
46 classifiers <- classifiers[,1:length(w)]
47 }
48 prediction <- NULL
49 prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
50 return(prediction)
51 }
52
53 weightedPrediction.vec <- function(values, w) {
54 return(sum(values * w/sum(w)))
55 }
56
57 ##### meta-decision tree
58
59 tuneTree <- function(data, target) {
60 data <- data.frame(data, target=target)
61 size <- nrow(data)
62 xerror <- NULL
63 split <- 1:ceiling(size/5)
64 leafSize <- 1:ceiling(size/10)
65 xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
66 cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
67 for(i in 1:length(split)) {
68 for(j in 1:length(leafSize)) {
69 op <- list(minsplit=split[i], minbucket=leafSize[j])
70 tree <- rpart(target ~., data=data, control=op, method="anova")
71 xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
72 cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
73 }
74 }
75 index <- which(xerror==min(xerror), arr.ind = T)
76 op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
77 return(op)
78 }
79
80 ###### meta-LASSO
81 # create fold by picking at random row indexes
82 createFolds <- function(nbObs, n) {
83 # pick indexes
84 index <- sample(1:n, size=nbObs, replace = T)
85 # populate folds
86 folds <- NULL
87 for(i in 1:n) {
88 folds <- c(folds, list(which(index==i)))
89 }
90 return(folds)
91 }
92
93 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
94 folds <- createFolds(nrow(genotype), n = n)
95 acc <- NULL
96 indexAlpha <- 1
97 for(a in alpha) {
98 curAcc <- NULL
99 for(i in 1:n) {
100 train <- genotype[-folds[[i]],]
101 test <- genotype[folds[[i]],]
102 phenoTrain <- phenotype[-folds[[i]]]
103 phenoTest <- phenotype[folds[[i]]]
104 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
105 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
106 pred <- predict(model, test, type = "response")
107 curAcc <- c(curAcc, r2(phenoTest, pred))
108 }
109 acc <- c(acc, mean(curAcc))
110 }
111 names(acc) <- alpha
112 return(as.numeric(names(acc)[which.max(acc)]))
113 }
114
115 ###### meta-random forest
116
117 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
118 n <- ceiling(nrow(genotype)/3)
119 indexTest <- sample(1:nrow(genotype), size=n)
120 train <- genotype[-indexTest,]
121 test <- genotype[indexTest,]
122 phenoTrain <- phenotype[-indexTest]
123 phenoTest <- phenotype[indexTest]
124 acc <- NULL
125 indexNtree <- 1
126 for(ntree in rangeNtree) {
127 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
128 pred <- predict(model, test)
129 acc <- c(acc, r2(phenoTest, pred))
130 }
131 names(acc) <- rangeNtree
132 best <- which.max(acc)
133 return(as.numeric(names(acc)[best]))
134 }
135
136 ###### meta-SVM
137 searchParamSVM <- function(train, target, kernel="radial") {
138 # tuning parameters then train
139 model <- NULL
140 switch(kernel,
141 sigmoid={
142 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
143 g <- tune$best.parameters[[1]];
144 c <- tune$best.parameters[[2]];
145 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
146 linear={
147 tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear");
148 c <- tune$best.parameters[[1]];
149 model <- svm(x=train, y=target, cost = c, kernel = "linear")},
150 polynomial={
151 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
152 d <- tune$best.parameters[[1]];
153 g <- tune$best.parameters[[2]];
154 coef <- tune$best.parameters[[3]];
155 c <- tune$best.parameters[[4]];
156 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
157 {
158 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
159 g <- tune$best.parameters[[1]];
160 c <- tune$best.parameters[[2]];
161 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
162 )
163 return(model)
164 }
165
166 #################### upper level functions #####################
167
168 aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
169 if(!prediction) {
170 treeParam <- tuneTree(classifiers, target)
171 data <- data.frame(classifiers, target)
172 model <- rpart(target ~., data=data, method = "anova", control = treeParam)
173 model <- prune(model, cp=treeParam["cp"])
174 saveRDS(model, out)
175 } else {
176 saveRDS(predict(model, data.frame(classifiers)), out)
177 }
178 }
179
180 aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
181 if(!prediction) {
182 opt <- optimizeWeight(values = classifiers, trueValue = target)
183 saveRDS(opt@solution, out)
184 # evaluation of the method
185 } else {
186 saveRDS(weightedPrediction.vec(classifiers, model), out)
187 }
188 }
189
190 aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
191 if(!prediction) {
192 alpha <- searchParamLASSO(classifiers, target)
193 cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
194 model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
195 saveRDS(model, out)
196 } else {
197 saveRDS(predict(model, classifiers), out)
198 }
199 }
200
201 aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
202 if(!prediction) {
203 ntree <- searchParamRF(genotype = classifiers, phenotype = target,
204 rangeNtree = seq(100, 1000, 100))
205 model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
206 saveRDS(model, out)
207 } else {
208 saveRDS(predict(model, classifiers), out)
209 }
210 }
211
212 aggregateSVM <- function(classifiers, target=NULL, prediction=F,
213 model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
214 if(!prediction) {
215 model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
216 saveRDS(model, out)
217 } else {
218 saveRDS(predict(model, classifiers), out)
219 }
220 }
221
222 ################################### main #############################
223 # # load argument
224 cmd <- commandArgs(T)
225 source(cmd[1])
226 # check if evaluation is required
227 evaluation <- F
228 if(as.integer(doEvaluation) == 1) {
229 evaluation <- T
230 con = file(folds)
231 folds <- readLines(con = con, n = 1, ok=T)
232 close(con)
233 folds <- readRDS(folds)
234 }
235 # check for model
236 if(model == "None") {
237 model <- NULL
238 prediction <- F
239 } else {
240 prediction <- T
241 con = file(model)
242 model <- readLines(con = con, n = 1, ok=T)
243 close(con)
244 model <- readRDS(model)
245 }
246 # load classifiers and phenotype
247 classifiers <- NULL
248 classifNames <- NULL
249 if(lassoPred !="None"){
250 classifiers <- c(classifiers, lassoPred)
251 classifNames <- c(classifNames, "lasso")
252 }
253 if(rrBLUPPred !="None"){
254 classifiers <- c(classifiers, rrBLUPPred)
255 classifNames <- c(classifNames, "rrBLUP")
256 }
257 if(rfPred !="None"){
258 classifiers <- c(classifiers, rfPred)
259 classifNames <- c(classifNames, "rf")
260 }
261 if(svmPred !="None"){
262 classifiers <- c(classifiers, svmPred)
263 classifNames <- c(classifNames, "svm")
264 }
265 classifPrediction <- NULL
266 for(classif in classifiers) {
267 classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
268 }
269 classifPrediction <- data.frame(classifPrediction)
270 colnames(classifPrediction) <- classifNames
271 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
272 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
273 out <- paste(out, ".rds", sep = "")
274 # aggregate !
275 switch(method,
276 geneticMean={
277 aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
278 out = out, prediction = prediction, model=model)
279 },
280 dt={
281 aggregateDT(classifiers = classifPrediction, target = phenotype,
282 out = out, prediction = prediction, model=model)
283 },
284 lasso={
285 aggregateLASSO(classifiers = classifPrediction, target = phenotype,
286 out = out, prediction = prediction, model=model)
287 },
288 rf={
289 aggregateRF(classifiers = classifPrediction, target = phenotype,
290 out = out, prediction = prediction, model=model)
291 },
292 # svm
293 {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
294 out = out, prediction = prediction, model = model)}
295 )
296 # return path of the result file to galaxy
297 cat(paste(out, "\n", sep=""))