Mercurial > repos > nicolas > oghma
view randomForest.R @ 18:27fb6c2a98a3 draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:30:52 -0400 |
parents | f311dc86809f |
children | 8541f9a21aec |
line wrap: on
line source
######################################################## # # creation date : 07/01/16 # last modification : 02/09/16 # author : Dr Nicolas Beaume # ######################################################## log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt") sink(file = log, type="message") library(randomForest) #library(pRF) ############################ helper functions ####################### optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), rangeMtry=seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)), repet=3) { acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry)) indexNtree <- 1 for(ntree in rangeNtree) { indexMtry <- 1 for(mtry in rangeMtry) { tempAcc <- NULL for(i in 1:repet) { n <- ceiling(nrow(genotype)/3) indexTest <- sample(1:nrow(genotype), size=n) train <- genotype[-indexTest,] test <- genotype[indexTest,] phenoTrain <- phenotype[-indexTest] phenoTest <- phenotype[indexTest] model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) pred <- predict(model, test) tempAcc <- c(tempAcc, r2(phenoTest, pred)) } acc[indexNtree,indexMtry] <- mean(tempAcc) indexMtry <- indexMtry+1 } indexNtree <- indexNtree+1 } colnames(acc) <- rangeMtry rownames(acc) <- rangeNtree bestParam <- which(acc==max(acc), arr.ind = T) return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]])) } r2 <- function(target, prediction) { sst <- sum((target-mean(target))^2) ssr <- sum((target-prediction)^2) return(1-ssr/sst) } ################################## main function ########################### rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) { # go for optimization if(is.null(mtry) | is.null(ntree)) { if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))} if(is.null(ntree)) {ntree <- seq(100,1000,100)} opt <- optimize(genotype=genotype, phenotype=phenotype, rangeNtree = ntree, rangeMtry = mtry) mtry <- opt$mtry ntree <- opt$ntree } # evaluation if(evaluation) { prediction <- NULL for(i in 1:length(folds)) { train <- genotype[-folds[[i]],] test <- genotype[folds[[i]],] phenoTrain <- phenotype[-folds[[i]]] rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) prediction <- c(prediction, list(predict(rf, test))) } saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) } else { model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) saveRDS(model, file = paste(outFile, ".rds", sep = "")) } } ############################ main ############################# cmd <- commandArgs(T) source(cmd[1]) # load classifier parameters mtry <- as.numeric(mtry) ntree <- as.numeric(ntree) if(mtry == -1) {mtry <- NULL} # check if evaluation is required evaluation <- F if(as.integer(doEvaluation) == 1) { evaluation <- T con = file(folds) folds <- readLines(con = con, n = 1, ok=T) close(con) folds <- readRDS(folds) } # load genotype and phenotype con = file(genotype) genotype <- readLines(con = con, n = 1, ok=T) close(con) genotype <- read.table(genotype, sep="\t", h=T) # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve phenotype <- read.table(phenotype, sep="\t", h=T)[,1] # run ! rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))