comparison aggregation.R @ 46:ac0c3826cca4 draft
Uploaded
| author | nicolas | 
|---|---|
| date | Wed, 26 Oct 2016 02:32:35 -0400 | 
| parents | |
| children | cea4a54c52d0 | 
########################################################
#
# creation date : 25/10/16
# last modification : 25/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
############################ helper functions #######################

##### Genetic algorithm

# fit weights so that the weighted mean of one vector of classifier outputs
# matches a single true value
optimizeOneIndividual <- function(values, trueValue) {
  # turn the weighted mean into a function of the weights only
  f <- function(w) { sum(values * w / sum(w)) }
  # fitness increases as the weighted mean gets closer to the true value
  fitness <- function(x) { 1 / abs(trueValue - f(x)) }
  resp <- ga(type = "real-valued", fitness = fitness,
             min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = TRUE)
  # normalize the weights so they sum to 1
  resp@solution <- resp@solution / sum(resp@solution)
  return(resp)
}
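
# Illustrative sketch (not part of the original pipeline; hypothetical numbers):
# with predictions 4.2, 5.0 and 4.8 from three classifiers and a true value of
# 4.6, the GA searches for weights in [0,1] whose normalized weighted mean is
# closest to 4.6.
# opt <- optimizeOneIndividual(values = c(4.2, 5.0, 4.8), trueValue = 4.6)
# opt@solution  # normalized weights, one row per equivalent optimum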

# optimize one set of weights over all observations, maximizing the mean R2
# between the true values and the weighted predictions
optimizeWeight <- function(values, trueValue, n=1000) {
  # note: r2() is not defined in this file; it is assumed to be provided by the
  # configuration script sourced in the main section below
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll,
             min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = TRUE)
  # normalize the weights so they sum to 1
  resp@solution <- resp@solution / sum(resp@solution)
  return(resp)
}

# weighted mean prediction for a whole table of classifier outputs
# (one row per observation, one column per classifier)
weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[, 1:length(w)]
  }
  prediction <- apply(classifiers, 1, weightedPrediction.vec, w)
  return(prediction)
}

# weighted mean of one vector of classifier outputs; weights are normalized to sum to 1
weightedPrediction.vec <- function(values, w) {
  return(sum(values * w / sum(w)))
}
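
# Illustrative sketch (hypothetical numbers): with values c(2, 4) and raw
# weights c(1, 3), the normalized weights are 0.25 and 0.75, so the weighted
# prediction is 2*0.25 + 4*0.75 = 3.5.
# weightedPrediction.vec(c(2, 4), c(1, 3))  # 3.5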

##### meta-decision tree

# grid-search over minsplit/minbucket; for each combination a tree is grown and
# the cross-validated error (xerror) and associated cp are recorded
tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~ ., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]), "xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]), "CP"]
    }
  }
  # keep the first combination reaching the minimal cross-validated error
  # ([1, ] guards against several cells tying for the minimum)
  index <- which(xerror == min(xerror), arr.ind = TRUE)[1, ]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
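
# Illustrative sketch (hypothetical data): tune a meta-tree on the outputs of
# three base classifiers for 50 observations.
# preds <- data.frame(lasso = rnorm(50), rf = rnorm(50), svm = rnorm(50))
# pheno <- rnorm(50)
# tuneTree(preds, pheno)  # list(minsplit=..., minbucket=..., cp=...)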

###### meta-LASSO
# create folds by assigning each row index to one of n folds at random
createFolds <- function(nbObs, n) {
  # draw a fold number for each observation
  index <- sample(1:n, size=nbObs, replace = TRUE)
  # populate each fold with the row indexes assigned to it
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
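
# Illustrative sketch: split 10 observations into 3 random folds. Because the
# assignment is random, fold sizes are only balanced on average.
# set.seed(1)
# createFolds(10, 3)  # list of 3 integer vectors partitioning 1:10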

# pick the elastic-net mixing parameter alpha by n-fold cross-validation,
# keeping the value that maximizes the mean R2 on held-out folds
searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      # glmnet's predict() expects a matrix, not a data frame
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}
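
# Illustrative sketch (hypothetical data, assuming r2() is available):
# preds <- data.frame(lasso = rnorm(50), rf = rnorm(50))
# pheno <- rnorm(50)
# searchParamLASSO(preds, pheno, n = 5)  # best alpha in seq(0, 1, 0.1)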

###### meta-random forest

# pick the number of trees on a single random train/test split (one third of
# the data held out), keeping the value that maximizes R2 on the test set
searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}
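
# Illustrative sketch (hypothetical data, assuming r2() is available):
# preds <- data.frame(lasso = rnorm(60), rf = rnorm(60), svm = rnorm(60))
# pheno <- rnorm(60)
# searchParamRF(preds, pheno, rangeNtree = seq(100, 500, 100))  # best ntree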

###### meta-SVM
# tune the kernel hyper-parameters with tune.svm(), then train the final model;
# the columns of tune$best.parameters follow the order of the ranges given to tune.svm()
searchParamSVM <- function(train, target, kernel="radial") {
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           g <- tune$best.parameters[[1]]
           cost <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = cost, kernel = "sigmoid")
         },
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           cost <- tune$best.parameters[[1]]
           model <- svm(x=train, y=target, cost = cost, kernel = "linear")
         },
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial")
           d <- tune$best.parameters[[1]]
           g <- tune$best.parameters[[2]]
           coef <- tune$best.parameters[[3]]
           cost <- tune$best.parameters[[4]]
           model <- svm(x=train, y=target, gamma = g, cost = cost, kernel = "polynomial", degree = d, coef0 = coef)
         },
         # default: radial kernel
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           g <- tune$best.parameters[[1]]
           cost <- tune$best.parameters[[2]]
           model <- svm(x=train, y=target, gamma = g, cost = cost, kernel = "radial")
         }
  )
  return(model)
}
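
# Illustrative sketch (hypothetical data): tune and train a radial-kernel
# meta-SVM on the base classifiers' outputs; tune.svm() runs an internal
# cross-validation, so this step can be slow.
# preds <- data.frame(lasso = rnorm(50), rf = rnorm(50))
# pheno <- rnorm(50)
# model <- searchParamSVM(preds, pheno, kernel = "radial")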

#################### upper level functions #####################

# each aggregate* function has two modes: training (prediction=FALSE), which
# tunes and saves a meta-model, and prediction (prediction=TRUE), which applies
# a previously saved model; in both cases the result is written to `out` as RDS

aggregateDT <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, out) {
  if(!prediction) {
    treeParam <- tuneTree(classifiers, target)
    data <- data.frame(classifiers, target)
    model <- rpart(target ~ ., data=data, method = "anova", control = treeParam)
    # prune() expects a numeric cp, hence [["cp"]] rather than ["cp"]
    model <- prune(model, cp=treeParam[["cp"]])
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, data.frame(classifiers)), out)
  }
}

aggregateGeneticMean <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, out){
  if(!prediction) {
    opt <- optimizeWeight(values = classifiers, trueValue = target)
    saveRDS(opt@solution, out)
  } else {
    # apply the saved weights row by row over all observations
    saveRDS(weightedPrediction(classifiers, model), out)
  }
}

aggregateLASSO <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, alpha=NULL, out) {
  if(!prediction) {
    alpha <- searchParamLASSO(classifiers, target)
    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
    saveRDS(model, out)
  } else {
    # glmnet's predict() expects a matrix, not a data frame
    saveRDS(predict(model, as.matrix(classifiers)), out)
  }
}

aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=FALSE, out) {
  if(!prediction) {
    ntree <- searchParamRF(genotype = classifiers, phenotype = target,
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}

aggregateSVM <- function(classifiers, target=NULL, prediction=FALSE,
                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
  if(!prediction) {
    model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}
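
# Illustrative sketch of the two modes (hypothetical paths and data):
# training writes a meta-model to disk, prediction applies it.
# preds <- data.frame(lasso = rnorm(50), rf = rnorm(50))
# pheno <- rnorm(50)
# aggregateRF(preds, target = pheno, prediction = FALSE, out = "meta_rf.rds")
# model <- readRDS("meta_rf.rds")
# aggregateRF(preds, prediction = TRUE, model = model, out = "meta_rf_pred.rds")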

################################### main #############################
# load arguments; the sourced configuration script is expected to define
# doEvaluation, folds, model, lassoPred, rrBLUPPred, rfPred, svmPred,
# phenotype, out, method and kernel
cmd <- commandArgs(TRUE)
source(cmd[1])
# check if evaluation is required (the flag and folds are loaded here but not
# used further in this revision)
evaluation <- FALSE
if(as.integer(doEvaluation) == 1) {
  evaluation <- TRUE
  con <- file(folds)
  folds <- readLines(con = con, n = 1, ok=TRUE)
  close(con)
  folds <- readRDS(folds)
}
# check for a model: "None" means training mode, otherwise load the saved
# meta-model and switch to prediction mode
if(model == "None") {
  model <- NULL
  prediction <- FALSE
} else {
  prediction <- TRUE
  con <- file(model)
  model <- readLines(con = con, n = 1, ok=TRUE)
  close(con)
  model <- readRDS(model)
}
# load classifiers and phenotype
classifiers <- NULL
classifNames <- NULL
if(lassoPred != "None"){
  classifiers <- c(classifiers, lassoPred)
  classifNames <- c(classifNames, "lasso")
}
if(rrBLUPPred != "None"){
  classifiers <- c(classifiers, rrBLUPPred)
  classifNames <- c(classifNames, "rrBLUP")
}
if(rfPred != "None"){
  classifiers <- c(classifiers, rfPred)
  classifNames <- c(classifNames, "rf")
}
if(svmPred != "None"){
  classifiers <- c(classifiers, svmPred)
  classifNames <- c(classifNames, "svm")
}
classifPrediction <- NULL
for(classif in classifiers) {
  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", header=TRUE)))
}
classifPrediction <- data.frame(classifPrediction)
colnames(classifPrediction) <- classifNames
# phenotype is written as a table (in columns) but it must be passed on as a vector
phenotype <- read.table(phenotype, sep="\t", header=TRUE)[,1]
out <- paste(out, ".rds", sep = "")
# aggregate !
switch(method,
       geneticMean={
         aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
                              out = out, prediction = prediction, model=model)
       },
       dt={
         aggregateDT(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       lasso={
         aggregateLASSO(classifiers = classifPrediction, target = phenotype,
                        out = out, prediction = prediction, model=model)
       },
       rf={
         aggregateRF(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       # default: svm
       {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
                     out = out, prediction = prediction, model = model)}
)
# return the path of the result file to Galaxy
cat(paste(out, "\n", sep=""))
