view aggregation.R @ 103:e7115e44d8d8 draft default tip

Uploaded
author nicolas
date Mon, 31 Oct 2016 07:20:49 -0400
parents 4af0489af821
children
line wrap: on
line source

########################################################
#
# creation date : 25/10/16
# last modification : 25/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
options(warn=-1)
############################ helper functions #######################

##### Genetic algorithm

# compute r2 by computing the classic formula
# compare the sum of square difference from target to prediciton
# to the sum of square difference from target to the mean of the target
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}

optimizeOneIndividual <- function(values, trueValue) {
  # change the value into a function
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), 
             maxiter = 1000, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), 
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weigths are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("less weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- NULL
  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <-  nrow(data)
  xerror <-  NULL
  split <-  1:ceiling(size/5)
  leafSize <-  1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  index <- which(xerror==min(xerror), arr.ind = T)
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}

###### meta-LASSO
# create fold by picking at random row indexes
createFolds <- function(nbObs, n) {
  # pick indexes
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  indexAlpha <- 1
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, test, type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  indexNtree <- 1
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}

###### meta-SVM
searchParamSVM <- function(train, target, kernel="radial") {
  # tuning parameters then train
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
           c <- tune$best.parameters[[1]];
           model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
           d <- tune$best.parameters[[1]];
           g <- tune$best.parameters[[2]];
           coef <- tune$best.parameters[[3]];
           c <- tune$best.parameters[[4]];
           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         {
           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}

#################### upper level functions #####################

aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
  if(!prediction) {
    treeParam <- tuneTree(classifiers, target)
    data <- data.frame(classifiers, target)
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam["cp"])
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
  if(!prediction) {
    opt <- optimizeWeight(values = classifiers, trueValue = target)
    out <- paste(out, ".rds", sep = "")
    saveRDS(opt@solution, out)
    return(out)
  } else {
    return(weightedPrediction.vec(classifiers, model))
  }
}

aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
  if(!prediction) {
    alpha <- searchParamLASSO(classifiers, target)
    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
  if(!prediction) {
   ntree <- searchParamRF(genotype = classifiers, phenotype = target,
                                              rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
    out <- paste(out, ".rds", sep = "")
    saveRDS(model, out)
    return(out)
  } else {
    return(predict(model, classifiers))
  }
}

aggregateSVM <- function(classifiers, target=NULL, prediction=F, 
                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
  if(!prediction) {
      model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
      out <- paste(out, ".rds", sep="")
      saveRDS(model, out)
      return(out)
  } else {
    return(predict(model, classifiers))
  }
}

################################### main #############################
# # load argument
cmd <- commandArgs(T)
source(cmd[1])
# check for model
if(model == "None") {
  model <- NULL
  prediction <- F
} else {
  prediction <- T
  con = file(model)
  model <- readLines(con = con, n = 1, ok=T)
  close(con)
  model <- readRDS(model)
}
# load classifiers and phenotype
classifiers <- NULL
classifNames <- NULL
if(lassoPred !="None"){
  classifiers <- c(classifiers, lassoPred)
  classifNames <- c(classifNames, "lasso")
}
if(rrBLUPPred !="None"){
  classifiers <- c(classifiers, rrBLUPPred)
  classifNames <- c(classifNames, "rrBLUP")
}
if(rfPred !="None"){
  classifiers <- c(classifiers, rfPred)
  classifNames <- c(classifNames, "rf")
}
if(svmPred !="None"){
  classifiers <- c(classifiers, svmPred)
  classifNames <- c(classifNames, "svm")
}
classifPrediction <- NULL
for(classif in classifiers) {
  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
}
classifPrediction <- data.frame(classifPrediction)
colnames(classifPrediction) <- classifNames
# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# aggregate !
switch(method,
       geneticMean={
         res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
                              out = out, prediction = prediction, model=model)
       },
       dt={
         res <- aggregateDT(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       lasso={
         res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       rf={
         res <- aggregateRF(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       # svm
       {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
                     out = out, prediction = prediction, model = model)}
       )
if(prediction) {
  write.table(data.frame(lines=rownames(classifPrediction), res), out,
                         sep="\t", row.names = F)
} else {
  cat(paste(out, ".rds", sep=""), "\n", sep="")
}