Mercurial > repos > nicolas > oghma
diff randomForest.R @ 40:8541f9a21aec draft
Uploaded
author | nicolas |
---|---|
date | Tue, 25 Oct 2016 14:42:32 -0400 |
parents | f311dc86809f |
children |
line wrap: on
line diff
--- a/randomForest.R Tue Oct 25 14:41:59 2016 -0400 +++ b/randomForest.R Tue Oct 25 14:42:32 2016 -0400 @@ -1,79 +1,82 @@ ######################################################## # # creation date : 07/01/16 -# last modification : 02/09/16 +# last modification : 25/10/16 # author : Dr Nicolas Beaume # ######################################################## -log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt") -sink(file = log, type="message") - -library(randomForest) -#library(pRF) - +suppressWarnings(suppressMessages(library(randomForest))) ############################ helper functions ####################### -optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), - rangeMtry=seq(ceiling(ncol(genotype)/10), - ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)), +# optimize +optimize <- function(genotype, phenotype, ntree=1000, + rangeMtry=seq(ceiling(ncol(genotype)/5), + ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)), repet=3) { - acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry)) - indexNtree <- 1 - for(ntree in rangeNtree) { - indexMtry <- 1 - for(mtry in rangeMtry) { - tempAcc <- NULL - for(i in 1:repet) { - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) - pred <- predict(model, test) - tempAcc <- c(tempAcc, r2(phenoTest, pred)) - } - acc[indexNtree,indexMtry] <- mean(tempAcc) - indexMtry <- indexMtry+1 + # accuracy over all mtry values + acc <- NULL + for(mtry in rangeMtry) { + # to compute the mean accuracy over repetiotion for the current mtry value + tempAcc <- NULL + for(i in 1:repet) { + # 1/3 of the dataset is used as test set + n <- ceiling(nrow(genotype)/3) + indexTest <- sample(1:nrow(genotype), size=n) + # create training and test set + train <- genotype[-indexTest,] + test <- genotype[indexTest,] + phenoTrain <- phenotype[-indexTest] + phenoTest <- phenotype[indexTest] + # compute model + model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) + # predict on test set and compute accuracy + pred <- predict(model, test) + tempAcc <- c(tempAcc, r2(phenoTest, pred)) } - indexNtree <- indexNtree+1 + # find mean accuracy for the current mtry value + acc <- c(acc, mean(tempAcc)) } - colnames(acc) <- rangeMtry - rownames(acc) <- rangeNtree - bestParam <- which(acc==max(acc), arr.ind = T) - return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]])) + # return mtry for the best accuracy + names(acc) <- rangeMtry + bestParam <- which.max(acc) + return(rangeMtry[bestParam]) } +# compute r2 by computing the classic formula +# compare the sum of square difference from target to prediciton +# to the sum of square difference from target to the mean of the target r2 <- function(target, prediction) { sst <- sum((target-mean(target))^2) ssr <- sum((target-prediction)^2) return(1-ssr/sst) } ################################## main function ########################### -rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) { +rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) { # go for optimization - if(is.null(mtry) | is.null(ntree)) { - if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))} - if(is.null(ntree)) {ntree <- seq(100,1000,100)} - opt <- optimize(genotype=genotype, phenotype=phenotype, - rangeNtree = ntree, rangeMtry = mtry) - mtry <- opt$mtry - ntree <- opt$ntree + if(is.null(mtry)) { + # find best mtry + mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)) + mtry <- optimize(genotype=genotype, phenotype=phenotype, + ntree = ntree, rangeMtry = mtry) } # evaluation if(evaluation) { prediction <- NULL for(i in 1:length(folds)) { + # create training and testing set for the current fold train <- genotype[-folds[[i]],] test <- genotype[folds[[i]],] phenoTrain <- phenotype[-folds[[i]]] + # compute model rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) + # predict and save prediction for the current fold prediction <- c(prediction, list(predict(rf, test))) } + # save preductions for all folds to be used for evaluation saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) } else { + # just compute the model and save it model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) saveRDS(model, file = paste(outFile, ".rds", sep = "")) } @@ -81,6 +84,7 @@ ############################ main ############################# +# load parameters cmd <- commandArgs(T) source(cmd[1]) # load classifier parameters @@ -101,9 +105,9 @@ genotype <- readLines(con = con, n = 1, ok=T) close(con) genotype <- read.table(genotype, sep="\t", h=T) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve phenotype <- read.table(phenotype, sep="\t", h=T)[,1] # run ! rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) +# send the path containing results to galaxy cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))