Mercurial > repos > nicolas > oghma
view aggregation.R @ 103:e7115e44d8d8 draft default tip
Uploaded
author | nicolas |
---|---|
date | Mon, 31 Oct 2016 07:20:49 -0400 |
parents | 4af0489af821 |
children |
line wrap: on
line source
######################################################## # # creation date : 25/10/16 # last modification : 25/10/16 # author : Dr Nicolas Beaume # ######################################################## suppressWarnings(suppressMessages(library(GA))) library("miscTools") library(rpart) suppressWarnings(suppressMessages(library(randomForest))) library(e1071) suppressWarnings(suppressMessages(library(glmnet))) options(warn=-1) ############################ helper functions ####################### ##### Genetic algorithm # compute r2 by computing the classic formula # compare the sum of square difference from target to prediciton # to the sum of square difference from target to the mean of the target r2 <- function(target, prediction) { sst <- sum((target-mean(target))^2) ssr <- sum((target-prediction)^2) return(1-ssr/sst) } optimizeOneIndividual <- function(values, trueValue) { # change the value into a function f <- function(w) {sum(values * w/sum(w))} fitness <- function(x) {1/abs(trueValue-f(x))} resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), maxiter = 1000, monitor = NULL, keepBest = T) resp@solution <- resp@solution/sum(resp@solution) return(resp) } optimizeWeight <- function(values, trueValue, n=1000) { fitnessAll <- function(w) { predicted <- apply(values, 1, weightedPrediction.vec, w) return(mean(r2(trueValue, predicted))) #return(mean(1/abs(trueValue-predicted))) } resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), maxiter = n, monitor = NULL, keepBest = T) resp@solution <- resp@solution/sum(resp@solution) return(resp) } weightedPrediction <- function(classifiers, w) { if(length(w) > ncol(classifiers)) { warning("more weights than classifiers, extra weigths are ignored") w <- w[1:ncol(classifiers)] } else if(length(w) < ncol(classifiers)) { warning("less weights than classifiers, extra classifiers are ignored") classifiers <- classifiers[,1:length(w)] } prediction <- NULL prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) return(prediction) } weightedPrediction.vec <- function(values, w) { return(sum(values * w/sum(w))) } ##### meta-decision tree tuneTree <- function(data, target) { data <- data.frame(data, target=target) size <- nrow(data) xerror <- NULL split <- 1:ceiling(size/5) leafSize <- 1:ceiling(size/10) xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) for(i in 1:length(split)) { for(j in 1:length(leafSize)) { op <- list(minsplit=split[i], minbucket=leafSize[j]) tree <- rpart(target ~., data=data, control=op, method="anova") xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] } } index <- which(xerror==min(xerror), arr.ind = T) op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) return(op) } ###### meta-LASSO # create fold by picking at random row indexes createFolds <- function(nbObs, n) { # pick indexes index <- sample(1:n, size=nbObs, replace = T) # populate folds folds <- NULL for(i in 1:n) { folds <- c(folds, list(which(index==i))) } return(folds) } searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { folds <- createFolds(nrow(genotype), n = n) acc <- NULL indexAlpha <- 1 for(a in alpha) { curAcc <- NULL for(i in 1:n) { train <- genotype[-folds[[i]],] test <- genotype[folds[[i]],] phenoTrain <- phenotype[-folds[[i]]] phenoTest <- phenotype[folds[[i]]] cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) pred <- predict(model, test, type = "response") curAcc <- c(curAcc, r2(phenoTest, pred)) } acc <- c(acc, mean(curAcc)) } names(acc) <- alpha return(as.numeric(names(acc)[which.max(acc)])) } ###### meta-random forest searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { n <- ceiling(nrow(genotype)/3) indexTest <- sample(1:nrow(genotype), size=n) train <- genotype[-indexTest,] test <- genotype[indexTest,] phenoTrain <- phenotype[-indexTest] phenoTest <- phenotype[indexTest] acc <- NULL indexNtree <- 1 for(ntree in rangeNtree) { model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) pred <- predict(model, test) acc <- c(acc, r2(phenoTest, pred)) } names(acc) <- rangeNtree best <- which.max(acc) return(as.numeric(names(acc)[best])) } ###### meta-SVM searchParamSVM <- function(train, target, kernel="radial") { # tuning parameters then train model <- NULL switch(kernel, sigmoid={ tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); g <- tune$best.parameters[[1]]; c <- tune$best.parameters[[2]]; model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, linear={ tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); c <- tune$best.parameters[[1]]; model <- svm(x=train, y=target, cost = c, kernel = "linear")}, polynomial={ tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); d <- tune$best.parameters[[1]]; g <- tune$best.parameters[[2]]; coef <- tune$best.parameters[[3]]; c <- tune$best.parameters[[4]]; model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, { tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); g <- tune$best.parameters[[1]]; c <- tune$best.parameters[[2]]; model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} ) return(model) } #################### upper level functions ##################### aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) { if(!prediction) { treeParam <- tuneTree(classifiers, target) data <- data.frame(classifiers, target) model <- rpart(target ~., data=data, method = "anova", control = treeParam) model <- prune(model, cp=treeParam["cp"]) out <- paste(out, ".rds", sep = "") saveRDS(model, out) return(out) } else { return(predict(model, classifiers)) } } aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){ if(!prediction) { opt <- optimizeWeight(values = classifiers, trueValue = target) out <- paste(out, ".rds", sep = "") saveRDS(opt@solution, out) return(out) } else { return(weightedPrediction.vec(classifiers, model)) } } aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) { if(!prediction) { alpha <- searchParamLASSO(classifiers, target) cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha) model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se) out <- paste(out, ".rds", sep = "") saveRDS(model, out) return(out) } else { return(predict(model, classifiers)) } } aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) { if(!prediction) { ntree <- searchParamRF(genotype = classifiers, phenotype = target, rangeNtree = seq(100, 1000, 100)) model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers)) out <- paste(out, ".rds", sep = "") saveRDS(model, out) return(out) } else { return(predict(model, classifiers)) } } aggregateSVM <- function(classifiers, target=NULL, prediction=F, model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) { if(!prediction) { model <- searchParamSVM(train = classifiers, target = target, kernel = kernel) out <- paste(out, ".rds", sep="") saveRDS(model, out) return(out) } else { return(predict(model, classifiers)) } } ################################### main ############################# # # load argument cmd <- commandArgs(T) source(cmd[1]) # check for model if(model == "None") { model <- NULL prediction <- F } else { prediction <- T con = file(model) model <- readLines(con = con, n = 1, ok=T) close(con) model <- readRDS(model) } # load classifiers and phenotype classifiers <- NULL classifNames <- NULL if(lassoPred !="None"){ classifiers <- c(classifiers, lassoPred) classifNames <- c(classifNames, "lasso") } if(rrBLUPPred !="None"){ classifiers <- c(classifiers, rrBLUPPred) classifNames <- c(classifNames, "rrBLUP") } if(rfPred !="None"){ classifiers <- c(classifiers, rfPred) classifNames <- c(classifNames, "rf") } if(svmPred !="None"){ classifiers <- c(classifiers, svmPred) classifNames <- c(classifNames, "svm") } classifPrediction <- NULL for(classif in classifiers) { classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T))) } classifPrediction <- data.frame(classifPrediction) colnames(classifPrediction) <- classifNames # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve phenotype <- read.table(phenotype, sep="\t", h=T)[,1] # aggregate ! switch(method, geneticMean={ res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype, out = out, prediction = prediction, model=model) }, dt={ res <- aggregateDT(classifiers = classifPrediction, target = phenotype, out = out, prediction = prediction, model=model) }, lasso={ res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype, out = out, prediction = prediction, model=model) }, rf={ res <- aggregateRF(classifiers = classifPrediction, target = phenotype, out = out, prediction = prediction, model=model) }, # svm {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel, out = out, prediction = prediction, model = model)} ) if(prediction) { write.table(data.frame(lines=rownames(classifPrediction), res), out, sep="\t", row.names = F) } else { cat(paste(out, ".rds", sep=""), "\n", sep="") }