Mercurial > repos > nicolas > oghma
changeset 10:f311dc86809f draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:28:10 -0400 |
parents | dcc10adbe46b |
children | 502fbb316d2d |
files | randomForest.R |
diffstat | 1 files changed, 109 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/randomForest.R Fri Oct 21 06:28:10 2016 -0400 @@ -0,0 +1,109 @@ +######################################################## +# +# creation date : 07/01/16 +# last modification : 02/09/16 +# author : Dr Nicolas Beaume +# +######################################################## + +log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt") +sink(file = log, type="message") + +library(randomForest) +#library(pRF) + +############################ helper functions ####################### +optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), + rangeMtry=seq(ceiling(ncol(genotype)/10), + ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)), + repet=3) { + acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry)) + indexNtree <- 1 + for(ntree in rangeNtree) { + indexMtry <- 1 + for(mtry in rangeMtry) { + tempAcc <- NULL + for(i in 1:repet) { + n <- ceiling(nrow(genotype)/3) + indexTest <- sample(1:nrow(genotype), size=n) + train <- genotype[-indexTest,] + test <- genotype[indexTest,] + phenoTrain <- phenotype[-indexTest] + phenoTest <- phenotype[indexTest] + model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) + pred <- predict(model, test) + tempAcc <- c(tempAcc, r2(phenoTest, pred)) + } + acc[indexNtree,indexMtry] <- mean(tempAcc) + indexMtry <- indexMtry+1 + } + indexNtree <- indexNtree+1 + } + colnames(acc) <- rangeMtry + rownames(acc) <- rangeNtree + bestParam <- which(acc==max(acc), arr.ind = T) + return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]])) +} + +r2 <- function(target, prediction) { + sst <- sum((target-mean(target))^2) + ssr <- sum((target-prediction)^2) + return(1-ssr/sst) +} +################################## main function ########################### +rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) { + + # go for optimization + if(is.null(mtry) | is.null(ntree)) { + if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))} + if(is.null(ntree)) {ntree <- seq(100,1000,100)} + opt <- optimize(genotype=genotype, phenotype=phenotype, + rangeNtree = ntree, rangeMtry = mtry) + mtry <- opt$mtry + ntree <- opt$ntree + } + # evaluation + if(evaluation) { + prediction <- NULL + for(i in 1:length(folds)) { + train <- genotype[-folds[[i]],] + test <- genotype[folds[[i]],] + phenoTrain <- phenotype[-folds[[i]]] + rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) + prediction <- c(prediction, list(predict(rf, test))) + } + saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) + } else { + model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) + saveRDS(model, file = paste(outFile, ".rds", sep = "")) + } +} + + +############################ main ############################# +cmd <- commandArgs(T) +source(cmd[1]) +# load classifier parameters +mtry <- as.numeric(mtry) +ntree <- as.numeric(ntree) +if(mtry == -1) {mtry <- NULL} +# check if evaluation is required +evaluation <- F +if(as.integer(doEvaluation) == 1) { + evaluation <- T + con = file(folds) + folds <- readLines(con = con, n = 1, ok=T) + close(con) + folds <- readRDS(folds) +} +# load genotype and phenotype +con = file(genotype) +genotype <- readLines(con = con, n = 1, ok=T) +close(con) +genotype <- read.table(genotype, sep="\t", h=T) +# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve +phenotype <- read.table(phenotype, sep="\t", h=T)[,1] +# run ! +rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, + evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) +cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))