Mercurial > repos > nicolas > oghma

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/aggregation.R	Wed Oct 26 18:08:42 2016 -0400
@@ -0,0 +1,315 @@
+########################################################
+#
+# creation date : 25/10/16
+# last modification : 25/10/16
+# author : Dr Nicolas Beaume
+#
+########################################################
+
+suppressWarnings(suppressMessages(library(GA)))
+library("miscTools")
+library(rpart)
+suppressWarnings(suppressMessages(library(randomForest)))
+library(e1071)
+suppressWarnings(suppressMessages(library(glmnet)))
+options(warn=-1)
+############################ helper functions #######################
+
+##### Genetic algorithm
+
+# compute r2 by computing the classic formula
+# compare the sum of square difference from target to prediciton
+# to the sum of square difference from target to the mean of the target
+r2 <- function(target, prediction) {
+  sst <- sum((target-mean(target))^2)
+  ssr <- sum((target-prediction)^2)
+  return(1-ssr/sst)
+}
+
+optimizeOneIndividual <- function(values, trueValue) {
+  # change the value into a function
+  f <- function(w) {sum(values * w/sum(w))}
+  fitness <- function(x) {1/abs(trueValue-f(x))}
+  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
+             maxiter = 1000, monitor = NULL, keepBest = T)
+  resp@solution <- resp@solution/sum(resp@solution)
+  return(resp)
+}
+
+optimizeWeight <- function(values, trueValue, n=1000) {
+  fitnessAll <- function(w) {
+    predicted <- apply(values, 1, weightedPrediction.vec, w)
+    return(mean(r2(trueValue, predicted)))
+    #return(mean(1/abs(trueValue-predicted)))
+  }
+  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
+             maxiter = n, monitor = NULL, keepBest = T)
+  resp@solution <- resp@solution/sum(resp@solution)
+  return(resp)
+}
+
+weightedPrediction <- function(classifiers, w) {
+  if(length(w) > ncol(classifiers)) {
+    warning("more weights than classifiers, extra weigths are ignored")
+    w <- w[1:ncol(classifiers)]
+  } else if(length(w) < ncol(classifiers)) {
+    warning("less weights than classifiers, extra classifiers are ignored")
+    classifiers <- classifiers[,1:length(w)]
+  }
+  prediction <- NULL
+  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
+  return(prediction)
+}
+
+weightedPrediction.vec <- function(values, w) {
+  return(sum(values * w/sum(w)))
+}
+
+##### meta-decision tree
+
+tuneTree <- function(data, target) {
+  data <- data.frame(data, target=target)
+  size <-  nrow(data)
+  xerror <-  NULL
+  split <-  1:ceiling(size/5)
+  leafSize <-  1:ceiling(size/10)
+  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
+  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
+  for(i in 1:length(split)) {
+    for(j in 1:length(leafSize)) {
+      op <- list(minsplit=split[i], minbucket=leafSize[j])
+      tree <- rpart(target ~., data=data, control=op, method="anova")
+      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
+      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
+    }
+  }
+  index <- which(xerror==min(xerror), arr.ind = T)
+  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
+  return(op)
+}
+
+###### meta-LASSO
+# create fold by picking at random row indexes
+createFolds <- function(nbObs, n) {
+  # pick indexes
+  index <- sample(1:n, size=nbObs, replace = T)
+  # populate folds
+  folds <- NULL
+  for(i in 1:n) {
+    folds <- c(folds, list(which(index==i)))
+  }
+  return(folds)
+}
+
+searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
+  folds <- createFolds(nrow(genotype), n = n)
+  acc <- NULL
+  indexAlpha <- 1
+  for(a in alpha) {
+    curAcc <- NULL
+    for(i in 1:n) {
+      train <- genotype[-folds[[i]],]
+      test <- genotype[folds[[i]],]
+      phenoTrain <- phenotype[-folds[[i]]]
+      phenoTest <- phenotype[folds[[i]]]
+      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
+      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
+      pred <- predict(model, test, type = "response")
+      curAcc <- c(curAcc, r2(phenoTest, pred))
+    }
+    acc <- c(acc, mean(curAcc))
+  }
+  names(acc) <- alpha
+  return(as.numeric(names(acc)[which.max(acc)]))
+}
+
+###### meta-random forest
+
+searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
+  n <- ceiling(nrow(genotype)/3)
+  indexTest <- sample(1:nrow(genotype), size=n)
+  train <- genotype[-indexTest,]
+  test <- genotype[indexTest,]
+  phenoTrain <- phenotype[-indexTest]
+  phenoTest <- phenotype[indexTest]
+  acc <- NULL
+  indexNtree <- 1
+  for(ntree in rangeNtree) {
+    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
+    pred <- predict(model, test)
+    acc <- c(acc, r2(phenoTest, pred))
+  }
+  names(acc) <- rangeNtree
+  best <- which.max(acc)
+  return(as.numeric(names(acc)[best]))
+}
+
+###### meta-SVM
+searchParamSVM <- function(train, target, kernel="radial") {
+  # tuning parameters then train
+  model <- NULL
+  switch(kernel,
+         sigmoid={
+           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
+           g <- tune$best.parameters[[1]];
+           c <- tune$best.parameters[[2]];
+           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
+         linear={
+           tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
+           c <- tune$best.parameters[[1]];
+           model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
+         polynomial={
+           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
+           d <- tune$best.parameters[[1]];
+           g <- tune$best.parameters[[2]];
+           coef <- tune$best.parameters[[3]];
+           c <- tune$best.parameters[[4]];
+           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
+         {
+           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
+           g <- tune$best.parameters[[1]];
+           c <- tune$best.parameters[[2]];
+           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
+  )
+  return(model)
+}
+
+#################### upper level functions #####################
+
+aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
+  if(!prediction) {
+    treeParam <- tuneTree(classifiers, target)
+    data <- data.frame(classifiers, target)
+    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
+    model <- prune(model, cp=treeParam["cp"])
+    saveRDS(model, out)
+    # return path of the result file to galaxy
+    cat(paste(out, "\n", sep=""))
+  } else {
+    write.table(predict(model, classifiers), out, sep="\t")
+  }
+}
+
+aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
+  if(!prediction) {
+    opt <- optimizeWeight(values = classifiers, trueValue = target)
+    saveRDS(opt@solution, out)
+    # return path of the result file to galaxy
+    cat(paste(out, "\n", sep=""))
+  } else {
+    write.table(weightedPrediction.vec(classifiers, model), out, sep="\t")
+  }
+}
+
+aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
+  if(!prediction) {
+    alpha <- searchParamLASSO(classifiers, target)
+    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
+    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
+    saveRDS(model, out)
+    # return path of the result file to galaxy
+    cat(paste(out, "\n", sep=""))
+  } else {
+    write.table(predict(model, classifiers), out, sep="\t")
+  }
+}
+
+aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
+  if(!prediction) {
+   ntree <- searchParamRF(genotype = classifiers, phenotype = target,
+                                              rangeNtree = seq(100, 1000, 100))
+    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
+    saveRDS(model, out)
+    # return path of the result file to galaxy
+    cat(paste(out, "\n", sep=""))
+  } else {
+    write.table(predict(model, classifiers), out, sep="\t")
+  }
+}
+
+aggregateSVM <- function(classifiers, target=NULL, prediction=F,
+                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
+  if(!prediction) {
+      model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
+      saveRDS(model, out)
+      # return path of the result file to galaxy
+      cat(paste(out, "\n", sep=""))
+  } else {
+    write.table(predict(model, classifiers), out, sep="\t")
+  }
+}
+
+################################### main #############################
+# # load argument
+cmd <- commandArgs(T)
+source(cmd[1])
+# check if evaluation is required
+evaluation <- F
+if(as.integer(doEvaluation) == 1) {
+  evaluation <- T
+  con = file(folds)
+  folds <- readLines(con = con, n = 1, ok=T)
+  close(con)
+  folds <- readRDS(folds)
+}
+# check for model
+if(model == "None") {
+  model <- NULL
+  prediction <- F
+} else {
+  prediction <- T
+  con = file(model)
+  model <- readLines(con = con, n = 1, ok=T)
+  close(con)
+  model <- readRDS(model)
+}
+# load classifiers and phenotype
+classifiers <- NULL
+classifNames <- NULL
+if(lassoPred !="None"){
+  classifiers <- c(classifiers, lassoPred)
+  classifNames <- c(classifNames, "lasso")
+}
+if(rrBLUPPred !="None"){
+  classifiers <- c(classifiers, rrBLUPPred)
+  classifNames <- c(classifNames, "rrBLUP")
+}
+if(rfPred !="None"){
+  classifiers <- c(classifiers, rfPred)
+  classifNames <- c(classifNames, "rf")
+}
+if(svmPred !="None"){
+  classifiers <- c(classifiers, svmPred)
+  classifNames <- c(classifNames, "svm")
+}
+classifPrediction <- NULL
+for(classif in classifiers) {
+  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
+}
+classifPrediction <- data.frame(classifPrediction)
+colnames(classifPrediction) <- classifNames
+# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
+phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
+out <- paste(out, ".rds", sep = "")
+# aggregate !
+switch(method,
+       geneticMean={
+         aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
+                              out = out, prediction = prediction, model=model)
+       },
+       dt={
+         aggregateDT(classifiers = classifPrediction, target = phenotype,
+                     out = out, prediction = prediction, model=model)
+       },
+       lasso={
+         aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
+                     out = out, prediction = prediction, model=model)
+       },
+       rf={
+         aggregateRF(classifiers = classifPrediction, target = phenotype,
+                     out = out, prediction = prediction, model=model)
+       },
+       # svm
+       {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
+                     out = out, prediction = prediction, model = model)}
+       )
\ No newline at end of file