Mercurial > repos > nicolas > oghma
comparison aggregation.R @ 65:9a6bade6e77a draft
Uploaded
author | nicolas |
---|---|
date | Wed, 26 Oct 2016 18:42:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
64:333fffc740d7 | 65:9a6bade6e77a |
---|---|
1 ######################################################## | |
2 # | |
3 # creation date : 25/10/16 | |
4 # last modification : 25/10/16 | |
5 # author : Dr Nicolas Beaume | |
6 # | |
7 ######################################################## | |
8 | |
9 suppressWarnings(suppressMessages(library(GA))) | |
10 library("miscTools") | |
11 library(rpart) | |
12 suppressWarnings(suppressMessages(library(randomForest))) | |
13 library(e1071) | |
14 suppressWarnings(suppressMessages(library(glmnet))) | |
15 options(warn=-1) | |
16 ############################ helper functions ####################### | |
17 | |
18 ##### Genetic algorithm | |
19 | |
20 # compute r2 by computing the classic formula | |
21 # compare the sum of square difference from target to prediciton | |
22 # to the sum of square difference from target to the mean of the target | |
23 r2 <- function(target, prediction) { | |
24 sst <- sum((target-mean(target))^2) | |
25 ssr <- sum((target-prediction)^2) | |
26 return(1-ssr/sst) | |
27 } | |
28 | |
29 optimizeOneIndividual <- function(values, trueValue) { | |
30 # change the value into a function | |
31 f <- function(w) {sum(values * w/sum(w))} | |
32 fitness <- function(x) {1/abs(trueValue-f(x))} | |
33 resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), | |
34 maxiter = 1000, monitor = NULL, keepBest = T) | |
35 resp@solution <- resp@solution/sum(resp@solution) | |
36 return(resp) | |
37 } | |
38 | |
39 optimizeWeight <- function(values, trueValue, n=1000) { | |
40 fitnessAll <- function(w) { | |
41 predicted <- apply(values, 1, weightedPrediction.vec, w) | |
42 return(mean(r2(trueValue, predicted))) | |
43 #return(mean(1/abs(trueValue-predicted))) | |
44 } | |
45 resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), | |
46 maxiter = n, monitor = NULL, keepBest = T) | |
47 resp@solution <- resp@solution/sum(resp@solution) | |
48 return(resp) | |
49 } | |
50 | |
51 weightedPrediction <- function(classifiers, w) { | |
52 if(length(w) > ncol(classifiers)) { | |
53 warning("more weights than classifiers, extra weigths are ignored") | |
54 w <- w[1:ncol(classifiers)] | |
55 } else if(length(w) < ncol(classifiers)) { | |
56 warning("less weights than classifiers, extra classifiers are ignored") | |
57 classifiers <- classifiers[,1:length(w)] | |
58 } | |
59 prediction <- NULL | |
60 prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) | |
61 return(prediction) | |
62 } | |
63 | |
64 weightedPrediction.vec <- function(values, w) { | |
65 return(sum(values * w/sum(w))) | |
66 } | |
67 | |
68 ##### meta-decision tree | |
69 | |
70 tuneTree <- function(data, target) { | |
71 data <- data.frame(data, target=target) | |
72 size <- nrow(data) | |
73 xerror <- NULL | |
74 split <- 1:ceiling(size/5) | |
75 leafSize <- 1:ceiling(size/10) | |
76 xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | |
77 cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) | |
78 for(i in 1:length(split)) { | |
79 for(j in 1:length(leafSize)) { | |
80 op <- list(minsplit=split[i], minbucket=leafSize[j]) | |
81 tree <- rpart(target ~., data=data, control=op, method="anova") | |
82 xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] | |
83 cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] | |
84 } | |
85 } | |
86 index <- which(xerror==min(xerror), arr.ind = T) | |
87 op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) | |
88 return(op) | |
89 } | |
90 | |
91 ###### meta-LASSO | |
92 # create fold by picking at random row indexes | |
93 createFolds <- function(nbObs, n) { | |
94 # pick indexes | |
95 index <- sample(1:n, size=nbObs, replace = T) | |
96 # populate folds | |
97 folds <- NULL | |
98 for(i in 1:n) { | |
99 folds <- c(folds, list(which(index==i))) | |
100 } | |
101 return(folds) | |
102 } | |
103 | |
104 searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { | |
105 folds <- createFolds(nrow(genotype), n = n) | |
106 acc <- NULL | |
107 indexAlpha <- 1 | |
108 for(a in alpha) { | |
109 curAcc <- NULL | |
110 for(i in 1:n) { | |
111 train <- genotype[-folds[[i]],] | |
112 test <- genotype[folds[[i]],] | |
113 phenoTrain <- phenotype[-folds[[i]]] | |
114 phenoTest <- phenotype[folds[[i]]] | |
115 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | |
116 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | |
117 pred <- predict(model, test, type = "response") | |
118 curAcc <- c(curAcc, r2(phenoTest, pred)) | |
119 } | |
120 acc <- c(acc, mean(curAcc)) | |
121 } | |
122 names(acc) <- alpha | |
123 return(as.numeric(names(acc)[which.max(acc)])) | |
124 } | |
125 | |
126 ###### meta-random forest | |
127 | |
128 searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { | |
129 n <- ceiling(nrow(genotype)/3) | |
130 indexTest <- sample(1:nrow(genotype), size=n) | |
131 train <- genotype[-indexTest,] | |
132 test <- genotype[indexTest,] | |
133 phenoTrain <- phenotype[-indexTest] | |
134 phenoTest <- phenotype[indexTest] | |
135 acc <- NULL | |
136 indexNtree <- 1 | |
137 for(ntree in rangeNtree) { | |
138 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) | |
139 pred <- predict(model, test) | |
140 acc <- c(acc, r2(phenoTest, pred)) | |
141 } | |
142 names(acc) <- rangeNtree | |
143 best <- which.max(acc) | |
144 return(as.numeric(names(acc)[best])) | |
145 } | |
146 | |
147 ###### meta-SVM | |
148 searchParamSVM <- function(train, target, kernel="radial") { | |
149 # tuning parameters then train | |
150 model <- NULL | |
151 switch(kernel, | |
152 sigmoid={ | |
153 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); | |
154 g <- tune$best.parameters[[1]]; | |
155 c <- tune$best.parameters[[2]]; | |
156 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, | |
157 linear={ | |
158 tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); | |
159 c <- tune$best.parameters[[1]]; | |
160 model <- svm(x=train, y=target, cost = c, kernel = "linear")}, | |
161 polynomial={ | |
162 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); | |
163 d <- tune$best.parameters[[1]]; | |
164 g <- tune$best.parameters[[2]]; | |
165 coef <- tune$best.parameters[[3]]; | |
166 c <- tune$best.parameters[[4]]; | |
167 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, | |
168 { | |
169 tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); | |
170 g <- tune$best.parameters[[1]]; | |
171 c <- tune$best.parameters[[2]]; | |
172 model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} | |
173 ) | |
174 return(model) | |
175 } | |
176 | |
177 #################### upper level functions ##################### | |
178 | |
179 aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) { | |
180 if(!prediction) { | |
181 treeParam <- tuneTree(classifiers, target) | |
182 data <- data.frame(classifiers, target) | |
183 model <- rpart(target ~., data=data, method = "anova", control = treeParam) | |
184 model <- prune(model, cp=treeParam["cp"]) | |
185 out <- paste(out, ".rds", sep = "") | |
186 saveRDS(model, out) | |
187 return(out) | |
188 } else { | |
189 return(predict(model, classifiers)) | |
190 } | |
191 } | |
192 | |
193 aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){ | |
194 if(!prediction) { | |
195 opt <- optimizeWeight(values = classifiers, trueValue = target) | |
196 out <- paste(out, ".rds", sep = "") | |
197 saveRDS(opt@solution, out) | |
198 return(out) | |
199 } else { | |
200 return(weightedPrediction.vec(classifiers, model)) | |
201 } | |
202 } | |
203 | |
204 aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) { | |
205 if(!prediction) { | |
206 alpha <- searchParamLASSO(classifiers, target) | |
207 cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha) | |
208 model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se) | |
209 out <- paste(out, ".rds", sep = "") | |
210 saveRDS(model, out) | |
211 return(out) | |
212 } else { | |
213 return(predict(model, classifiers)) | |
214 } | |
215 } | |
216 | |
217 aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) { | |
218 if(!prediction) { | |
219 ntree <- searchParamRF(genotype = classifiers, phenotype = target, | |
220 rangeNtree = seq(100, 1000, 100)) | |
221 model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers)) | |
222 out <- paste(out, ".rds", sep = "") | |
223 saveRDS(model, out) | |
224 return(out) | |
225 } else { | |
226 return(predict(model, classifiers)) | |
227 } | |
228 } | |
229 | |
230 aggregateSVM <- function(classifiers, target=NULL, prediction=F, | |
231 model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) { | |
232 if(!prediction) { | |
233 model <- searchParamSVM(train = classifiers, target = target, kernel = kernel) | |
234 out <- paste(out, ".rds", sep = "") | |
235 saveRDS(model, out) | |
236 return(out) | |
237 } else { | |
238 return(predict(model, classifiers)) | |
239 } | |
240 } | |
241 | |
242 ################################### main ############################# | |
243 # # load argument | |
244 cmd <- commandArgs(T) | |
245 source(cmd[1]) | |
246 # check if evaluation is required | |
247 evaluation <- F | |
248 if(as.integer(doEvaluation) == 1) { | |
249 evaluation <- T | |
250 con = file(folds) | |
251 folds <- readLines(con = con, n = 1, ok=T) | |
252 close(con) | |
253 folds <- readRDS(folds) | |
254 } | |
255 # check for model | |
256 if(model == "None") { | |
257 model <- NULL | |
258 prediction <- F | |
259 } else { | |
260 prediction <- T | |
261 con = file(model) | |
262 model <- readLines(con = con, n = 1, ok=T) | |
263 close(con) | |
264 model <- readRDS(model) | |
265 } | |
266 # load classifiers and phenotype | |
267 classifiers <- NULL | |
268 classifNames <- NULL | |
269 if(lassoPred !="None"){ | |
270 classifiers <- c(classifiers, lassoPred) | |
271 classifNames <- c(classifNames, "lasso") | |
272 } | |
273 if(rrBLUPPred !="None"){ | |
274 classifiers <- c(classifiers, rrBLUPPred) | |
275 classifNames <- c(classifNames, "rrBLUP") | |
276 } | |
277 if(rfPred !="None"){ | |
278 classifiers <- c(classifiers, rfPred) | |
279 classifNames <- c(classifNames, "rf") | |
280 } | |
281 if(svmPred !="None"){ | |
282 classifiers <- c(classifiers, svmPred) | |
283 classifNames <- c(classifNames, "svm") | |
284 } | |
285 classifPrediction <- NULL | |
286 for(classif in classifiers) { | |
287 classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T))) | |
288 } | |
289 classifPrediction <- data.frame(classifPrediction) | |
290 colnames(classifPrediction) <- classifNames | |
291 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | |
292 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | |
293 # aggregate ! | |
294 switch(method, | |
295 geneticMean={ | |
296 res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype, | |
297 out = out, prediction = prediction, model=model) | |
298 }, | |
299 dt={ | |
300 res <- aggregateDT(classifiers = classifPrediction, target = phenotype, | |
301 out = out, prediction = prediction, model=model) | |
302 }, | |
303 lasso={ | |
304 res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype, | |
305 out = out, prediction = prediction, model=model) | |
306 }, | |
307 rf={ | |
308 res <- aggregateRF(classifiers = classifPrediction, target = phenotype, | |
309 out = out, prediction = prediction, model=model) | |
310 }, | |
311 # svm | |
312 {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel, | |
313 out = out, prediction = prediction, model = model)} | |
314 ) | |
315 if(prediction) { | |
316 write.table(data.frame(lines=rownames(classifPrediction), res), out, | |
317 sep="\t", row.names = F) | |
318 } else { | |
319 cat(out, "\n", sep="") | |
320 } |