oghma: aggregation.R @ 62:613c81a4f7c0 (draft)
author: nicolas
date: Wed, 26 Oct 2016 18:25:19 -0400
description: Uploaded
########################################################
#
# creation date : 25/10/16
# last modification : 25/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
options(warn=-1)
############################ helper functions #######################

##### Genetic algorithm

# compute r2 using the classic formula: compare the sum of squared
# differences from target to prediction with the sum of squared
# differences from target to the mean of the target
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
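
# Toy check of r2 (hypothetical numbers, not run as part of the script):
#   target <- c(1.0, 2.0, 3.0, 4.0)
#   prediction <- c(1.1, 1.9, 3.2, 3.8)
#   r2(target, prediction)   # close to 1 for a good fit; can be negative
#                            # when predicting worse than mean(target)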

optimizeOneIndividual <- function(values, trueValue) {
  # turn a weight vector into a weighted-mean prediction
  f <- function(w) {sum(values * w/sum(w))}
  # fitness: inverse of the absolute error against the true value
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = T)
  # normalize the best weights so they sum to 1
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
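
# Note: the min/max arguments above match the GA package API of 2016; in
# GA >= 3.0 they were renamed, so on a recent install the call would
# presumably need to become:
#   ga(type = "real-valued", fitness = fitness,
#      lower = rep(0, length(values)), upper = rep(1, length(values)),
#      maxiter = 1000, monitor = NULL, keepBest = TRUE)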

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
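
# Sketch of a call to optimizeWeight (hypothetical data): values holds one
# column per classifier and one row per observation:
#   preds <- matrix(runif(40), ncol = 4)
#   truth <- runif(10)
#   opt <- optimizeWeight(values = preds, trueValue = truth, n = 100)
#   opt@solution   # weights normalized to sum to 1, one per classifier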

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- apply(classifiers, 1, weightedPrediction.vec, w)
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}
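
# Example of the two prediction helpers (hypothetical values):
#   w <- c(0.5, 0.3, 0.2)
#   weightedPrediction.vec(c(1, 2, 3), w)   # one observation -> one value
#   preds <- matrix(1:6, ncol = 3)
#   weightedPrediction(preds, w)            # one value per row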

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  # grid-search minsplit and minbucket; for each pair, keep the
  # cross-validated error (xerror) and the cp of the best subtree
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  index <- which(xerror==min(xerror), arr.ind = T)
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
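
# Hypothetical use of tuneTree; the returned list plugs directly into rpart:
#   preds <- matrix(runif(40), ncol = 4)
#   truth <- runif(10)
#   op <- tuneTree(preds, truth)
#   tree <- rpart(target ~ ., data = data.frame(preds, target = truth),
#                 control = op, method = "anova")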

###### meta-LASSO
# create n folds by randomly assigning each row index to one fold
createFolds <- function(nbObs, n) {
  # draw a fold number for each observation
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
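
# Example: assign 10 observations to 3 folds. Fold sizes vary (and a fold
# can even be empty for small data sets) because assignment is random:
#   folds <- createFolds(10, 3)
#   folds[[1]]   # row indexes belonging to the first fold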

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      # predict.glmnet expects a matrix for newx
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  # return the alpha with the best mean r2 across folds
  return(as.numeric(names(acc)[which.max(acc)]))
}
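
# Hypothetical call; returns the elastic-net mixing parameter alpha with the
# best mean r2 across folds (alpha = 0 is ridge, alpha = 1 is the LASSO):
#   geno <- matrix(runif(200), ncol = 4)
#   pheno <- runif(50)
#   bestAlpha <- searchParamLASSO(geno, pheno, alpha = seq(0, 1, 0.2), n = 5)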

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  # hold out a third of the data to evaluate each forest size
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}
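
# Hypothetical call; returns the forest size with the best hold-out r2:
#   geno <- matrix(runif(200), ncol = 4); pheno <- runif(50)
#   bestNtree <- searchParamRF(geno, pheno, rangeNtree = seq(100, 500, 100))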

###### meta-SVM
searchParamSVM <- function(train, target, kernel="radial") {
  # tune the parameters, then train a model with the best ones
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           c <- tune$best.parameters[[1]]
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial")
           d <- tune$best.parameters[[1]]
           g <- tune$best.parameters[[2]]
           coef <- tune$best.parameters[[3]]
           c <- tune$best.parameters[[4]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # default: radial kernel
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           g <- tune$best.parameters[[1]]
           c <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}
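
# Unlike the other search helpers, searchParamSVM returns a fitted model
# rather than a parameter value (hypothetical data):
#   geno <- matrix(runif(200), ncol = 4); pheno <- runif(50)
#   model <- searchParamSVM(geno, pheno, kernel = "linear")
#   predict(model, geno)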

#################### upper level functions #####################

aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
  if(!prediction) {
    treeParam <- tuneTree(classifiers, target)
    data <- data.frame(classifiers, target)
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    # prune with the tuned cp value (treeParam$cp is the numeric value;
    # treeParam["cp"] would be a one-element list)
    model <- prune(model, cp=treeParam$cp)
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
  if(!prediction) {
    opt <- optimizeWeight(values = classifiers, trueValue = target)
    out <- paste(out, ".rds", sep = "")
    saveRDS(opt@solution, out)
    return(out)
  } else {
    # apply the saved weights row by row (one prediction per observation);
    # weightedPrediction.vec would collapse the whole table to a single number
    return(weightedPrediction(classifiers, model))
  }
}

aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
  if(!prediction) {
    alpha <- searchParamLASSO(classifiers, target)
    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
  if(!prediction) {
    ntree <- searchParamRF(genotype = classifiers, phenotype = target,
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateSVM <- function(classifiers, target=NULL, prediction=F,
                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
  if(!prediction) {
    model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}
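
# All aggregate* functions follow the same two-phase protocol: a training call
# fits a meta-model, saves it to <out>.rds and returns that path; a prediction
# call applies an already loaded model. A hypothetical round trip:
#   preds <- data.frame(lasso = runif(50), rf = runif(50))
#   pheno <- runif(50)
#   path <- aggregateRF(classifiers = preds, target = pheno, out = "meta")
#   model <- readRDS(path)
#   aggregateRF(classifiers = preds, model = model, prediction = TRUE, out = "meta")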

################################### main #############################
# load arguments
cmd <- commandArgs(T)
source(cmd[1])
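
# The sourced file is expected to define the variables used below; a minimal
# sketch of such a config (all file names hypothetical):
#   doEvaluation <- 0
#   folds        <- "None"
#   model        <- "None"                   # or a file whose first line is the path to an .rds model
#   lassoPred    <- "lasso_predictions.tsv"  # "None" to skip a classifier
#   rrBLUPPred   <- "None"
#   rfPred       <- "rf_predictions.tsv"
#   svmPred      <- "None"
#   phenotype    <- "phenotype.tsv"
#   method       <- "geneticMean"            # dt, lasso, rf, or default svm
#   kernel       <- "radial"
#   out          <- "aggregated"
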
# check if evaluation is required
evaluation <- F
if(as.integer(doEvaluation) == 1) {
  evaluation <- T
  con <- file(folds)
  folds <- readLines(con = con, n = 1, ok=T)
  close(con)
  folds <- readRDS(folds)
}
# check for model
if(model == "None") {
  model <- NULL
  prediction <- F
} else {
  prediction <- T
  con <- file(model)
  model <- readLines(con = con, n = 1, ok=T)
  close(con)
  model <- readRDS(model)
}
# load classifiers and phenotype
classifiers <- NULL
classifNames <- NULL
if(lassoPred !="None"){
  classifiers <- c(classifiers, lassoPred)
  classifNames <- c(classifNames, "lasso")
}
if(rrBLUPPred !="None"){
  classifiers <- c(classifiers, rrBLUPPred)
  classifNames <- c(classifNames, "rrBLUP")
}
if(rfPred !="None"){
  classifiers <- c(classifiers, rfPred)
  classifNames <- c(classifNames, "rf")
}
if(svmPred !="None"){
  classifiers <- c(classifiers, svmPred)
  classifNames <- c(classifNames, "svm")
}
classifPrediction <- NULL
for(classif in classifiers) {
  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
}
classifPrediction <- data.frame(classifPrediction)
colnames(classifPrediction) <- classifNames
# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# aggregate !
switch(method,
       geneticMean={
         res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
                                     out = out, prediction = prediction, model=model)
       },
       dt={
         res <- aggregateDT(classifiers = classifPrediction, target = phenotype,
                            out = out, prediction = prediction, model=model)
       },
       lasso={
         res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
                               out = out, prediction = prediction, model=model)
       },
       rf={
         res <- aggregateRF(classifiers = classifPrediction, target = phenotype,
                            out = out, prediction = prediction, model=model)
       },
       # default: svm
       {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
                            out = out, prediction = prediction, model = model)}
)
if(prediction) {
  write.table(data.frame(lines=rownames(classifPrediction), res), paste(out,".csv", sep=""),
              sep="\t", row.names = F)
} else {
  cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
}
