view randomForest.R @ 18:27fb6c2a98a3 draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 06:30:52 -0400
parents f311dc86809f
children 8541f9a21aec
line wrap: on
line source

########################################################
#
# creation date : 07/01/16
# last modification : 02/09/16
# author : Dr Nicolas Beaume
#
########################################################

log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt")
sink(file = log, type="message")

library(randomForest)
#library(pRF)

############################ helper functions #######################
optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), 
                     rangeMtry=seq(ceiling(ncol(genotype)/10),
                                   ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)),
                     repet=3) {
  acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry))
  indexNtree <- 1
  for(ntree in rangeNtree) {
    indexMtry <- 1
    for(mtry in rangeMtry) {
      tempAcc <- NULL
      for(i in 1:repet) {
        n <- ceiling(nrow(genotype)/3)
        indexTest <- sample(1:nrow(genotype), size=n)
        train <- genotype[-indexTest,]
        test <- genotype[indexTest,]
        phenoTrain <- phenotype[-indexTest]
        phenoTest <- phenotype[indexTest]
        model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
        pred <- predict(model, test)
        tempAcc <- c(tempAcc, r2(phenoTest, pred))
      }
      acc[indexNtree,indexMtry] <- mean(tempAcc)
      indexMtry <- indexMtry+1
    }
    indexNtree <- indexNtree+1
  }
  colnames(acc) <- rangeMtry
  rownames(acc) <- rangeNtree
  bestParam <- which(acc==max(acc), arr.ind = T)
  return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]]))
}

r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
################################## main function ###########################
rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) {
  
  # go for optimization
  if(is.null(mtry) | is.null(ntree)) {
    if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))}
    if(is.null(ntree)) {ntree <- seq(100,1000,100)}
    opt <- optimize(genotype=genotype, phenotype=phenotype, 
                    rangeNtree = ntree, rangeMtry = mtry)
    mtry <- opt$mtry
    ntree <- opt$ntree
  }
  # evaluation
  if(evaluation) {
    prediction <- NULL
    for(i in 1:length(folds)) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
      prediction <- c(prediction, list(predict(rf, test)))
    }
    saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
  } else {
    model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
    saveRDS(model, file = paste(outFile, ".rds", sep = ""))
  }
}


############################ main #############################
cmd <- commandArgs(T)
source(cmd[1])
# load classifier parameters
mtry <- as.numeric(mtry)
ntree <- as.numeric(ntree)
if(mtry == -1) {mtry <- NULL}
# check if evaluation is required
evaluation <- F
if(as.integer(doEvaluation) == 1) {
  evaluation <- T
  con = file(folds)
  folds <- readLines(con = con, n = 1, ok=T)
  close(con)
  folds <- readRDS(folds)
}
# load genotype and phenotype
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 
# run !
rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
            evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))