########################################################
#
# creation date : 25/10/16
# last modification : 25/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
options(warn=-1)

############################ helper functions #######################

##### Genetic algorithm

# compute r2 using the classic formula:
# compare the sum of squared differences from target to prediction
# to the sum of squared differences from target to the mean of the target
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}

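# illustrative sanity check (not executed) : a perfect prediction gives r2 = 1,
# predicting the constant mean gives r2 = 0
if(FALSE) {
  ex <- c(1, 2, 3, 4)
  stopifnot(r2(ex, ex) == 1, r2(ex, rep(mean(ex), length(ex))) == 0)
}
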
optimizeOneIndividual <- function(values, trueValue) {
  # turn the candidate values into a weighted-mean function of the weights
  f <- function(w) {sum(values * w/sum(w))}
  # fitness grows as the weighted mean approaches the true value
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = TRUE)
  # normalize the best weights so they sum to 1
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

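# illustrative sketch (not executed) : find weights whose weighted mean of the
# values c(9, 11, 14) best matches a single true value of 12; the normalized
# weights end up in res@solution
if(FALSE) {
  res <- optimizeOneIndividual(values = c(9, 11, 14), trueValue = 12)
  res@solution
}
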
optimizeWeight <- function(values, trueValue, n=1000) {
  # fitness of a weight vector : r2 between the observed values and the
  # weighted predictions it produces over all observations
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = TRUE)
  # normalize the best weights so they sum to 1
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- apply(classifiers, 1, weightedPrediction.vec, w)
  return(prediction)
}

# weighted mean of one observation's predictions; weights are normalized to sum to 1
weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}

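# illustrative sketch (not executed; 'preds' and 'obs' are hypothetical) :
# learn ensemble weights on known phenotypes, then combine predictions with them
if(FALSE) {
  preds <- matrix(runif(20), ncol=2)        # 10 observations, 2 classifiers
  obs <- rowMeans(preds) + rnorm(10, sd=0.1)
  w <- optimizeWeight(values = preds, trueValue = obs, n = 100)@solution
  combined <- weightedPrediction(preds, as.numeric(w))
}
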
##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  # grid of candidate minsplit and minbucket values
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      # keep the cross-validation error and cp of the best pruning of this tree
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # take the first grid cell reaching the minimum cross-validation error
  index <- which(xerror==min(xerror), arr.ind = TRUE)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}

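# illustrative sketch (not executed; 'x' and 'y' are hypothetical) : grid-search
# rpart's minsplit/minbucket, then fit a tree with the selected parameters
if(FALSE) {
  x <- data.frame(a=runif(50), b=runif(50))
  y <- x$a + rnorm(50, sd=0.1)
  param <- tuneTree(x, y)
  fit <- rpart(target ~ ., data=data.frame(x, target=y), control=param, method="anova")
}
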
###### meta-LASSO

# create folds by randomly assigning each row index to one of n folds
createFolds <- function(nbObs, n) {
  # pick a fold label for each observation
  index <- sample(1:n, size=nbObs, replace = TRUE)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}

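# illustrative note (not executed) : fold labels are sampled with replacement,
# so fold sizes are only approximately equal and a fold can occasionally be
# empty on small data sets
if(FALSE) {
  f <- createFolds(nbObs = 10, n = 3)
  str(f)  # list of 3 integer vectors of row indexes
}
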
searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      # choose lambda by inner cross-validation, then refit at lambda.1se
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  # return the alpha with the best mean r2 across folds
  return(as.numeric(names(acc)[which.max(acc)]))
}

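# illustrative sketch (not executed; 'geno' and 'pheno' are hypothetical) :
# pick the elastic-net mixing parameter alpha by cross-validated r2
if(FALSE) {
  geno <- matrix(rnorm(700), ncol=7)
  pheno <- as.numeric(geno %*% rnorm(7)) + rnorm(100, sd=0.5)
  bestAlpha <- searchParamLASSO(geno, pheno)
}
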
###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  # hold out a third of the data for validation
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  # return the number of trees with the best hold-out r2
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}

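# illustrative sketch (not executed; 'geno' and 'pheno' are hypothetical) :
# choose the number of trees by hold-out r2; mtry defaults to all columns
if(FALSE) {
  geno <- data.frame(matrix(rnorm(300), ncol=3))
  pheno <- rnorm(100)
  bestNtree <- searchParamRF(geno, pheno, rangeNtree = seq(100, 500, 100))
}
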
###### meta-SVM

searchParamSVM <- function(train, target, kernel="radial") {
  # tune the kernel hyper-parameters by grid search, then train on the best set
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           g <- tune$best.parameters$gamma
           c <- tune$best.parameters$cost
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           c <- tune$best.parameters$cost
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial")
           d <- tune$best.parameters$degree
           g <- tune$best.parameters$gamma
           coef <- tune$best.parameters$coef0
           c <- tune$best.parameters$cost
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # default : radial kernel
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           g <- tune$best.parameters$gamma
           c <- tune$best.parameters$cost
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}

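# illustrative sketch (not executed; 'x' and 'y' are hypothetical) : tune and
# train an SVM with the default radial kernel; the fitted e1071 model is returned
if(FALSE) {
  x <- data.frame(matrix(rnorm(200), ncol=2))
  y <- rnorm(100)
  svmModel <- searchParamSVM(x, y, kernel="radial")
}
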
#################### upper level functions #####################

aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
  if(!prediction) {
    # training mode : tune, fit and prune a regression tree, then save it
    treeParam <- tuneTree(classifiers, target)
    data <- data.frame(classifiers, target)
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam$cp)
    saveRDS(model, out)
  } else {
    # prediction mode : apply the provided model and save the predictions
    saveRDS(predict(model, data.frame(classifiers)), out)
  }
}

aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
  if(!prediction) {
    # training mode : learn the weights and save them
    opt <- optimizeWeight(values = classifiers, trueValue = target)
    saveRDS(opt@solution, out)
  } else {
    # prediction mode : model holds the saved weight vector;
    # combine the classifier predictions row by row
    saveRDS(weightedPrediction(classifiers, model), out)
  }
}

aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
  if(!prediction) {
    # training mode : tune alpha, choose lambda by cross-validation, fit and save
    alpha <- searchParamLASSO(classifiers, target)
    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
    saveRDS(model, out)
  } else {
    # prediction mode : classifiers is already a matrix here (see main)
    saveRDS(predict(model, classifiers), out)
  }
}

aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
  if(!prediction) {
    # training mode : tune the number of trees, fit and save the forest
    ntree <- searchParamRF(genotype = classifiers, phenotype = target,
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}

aggregateSVM <- function(classifiers, target=NULL, prediction=F,
                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
  if(!prediction) {
    # training mode : tuning is delegated to searchParamSVM
    model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}

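# illustrative sketch (not executed; file names are hypothetical) : every
# aggregate* function has two modes. With prediction=F it tunes and fits a
# meta-model on the classifier predictions and saves it as RDS; with
# prediction=T it applies the model passed in and saves the aggregated predictions.
if(FALSE) {
  # training phase
  aggregateRF(classifiers = classifPrediction, target = phenotype,
              prediction = F, out = "metaRF.rds")
  # prediction phase, reusing the saved model
  aggregateRF(classifiers = classifPrediction, model = readRDS("metaRF.rds"),
              prediction = T, out = "metaPredictions.rds")
}
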
################################### main #############################
# load arguments from the configuration file passed on the command line
cmd <- commandArgs(T)
source(cmd[1])
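# illustrative sketch of the sourced configuration file (hypothetical values;
# the real file is presumably generated by the Galaxy wrapper) : it must define
# the variables used below
if(FALSE) {
  doEvaluation <- 0
  folds <- "None"
  model <- "None"
  lassoPred <- "lassoPredictions.tsv"
  rrBLUPPred <- "rrBLUPPredictions.tsv"
  rfPred <- "None"
  svmPred <- "svmPredictions.tsv"
  phenotype <- "phenotype.tsv"
  method <- "geneticMean"
  kernel <- "radial"
  out <- "aggregated"
}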
# check if evaluation is required
evaluation <- F
if(as.integer(doEvaluation) == 1) {
  evaluation <- T
  # the folds file contains the path of the RDS file holding the folds
  con <- file(folds)
  folds <- readLines(con = con, n = 1, ok=T)
  close(con)
  folds <- readRDS(folds)
}
# check for a previously trained model; its file contains the path of the RDS model
if(model == "None") {
  model <- NULL
  prediction <- F
} else {
  prediction <- T
  con <- file(model)
  model <- readLines(con = con, n = 1, ok=T)
  close(con)
  model <- readRDS(model)
}
# load classifier predictions and phenotype
classifiers <- NULL
classifNames <- NULL
if(lassoPred != "None"){
  classifiers <- c(classifiers, lassoPred)
  classifNames <- c(classifNames, "lasso")
}
if(rrBLUPPred != "None"){
  classifiers <- c(classifiers, rrBLUPPred)
  classifNames <- c(classifNames, "rrBLUP")
}
if(rfPred != "None"){
  classifiers <- c(classifiers, rfPred)
  classifNames <- c(classifNames, "rf")
}
if(svmPred != "None"){
  classifiers <- c(classifiers, svmPred)
  classifNames <- c(classifNames, "svm")
}
# one column of predictions per selected classifier
classifPrediction <- NULL
for(classif in classifiers) {
  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", header=TRUE)))
}
classifPrediction <- data.frame(classifPrediction)
colnames(classifPrediction) <- classifNames
# phenotype is written as a table (in columns) but must be passed as a vector downstream
phenotype <- read.table(phenotype, sep="\t", header=TRUE)[,1]
out <- paste(out, ".rds", sep = "")
# aggregate !
switch(method,
       geneticMean={
         aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
                              out = out, prediction = prediction, model=model)
       },
       dt={
         aggregateDT(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       lasso={
         aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
                        out = out, prediction = prediction, model=model)
       },
       rf={
         aggregateRF(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       # default : svm
       {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
                     out = out, prediction = prediction, model = model)}
)
# return the path of the result file to galaxy
cat(paste(out, "\n", sep=""))