Mercurial > repos > nicolas > oghma
comparison randomForest.R @ 10:f311dc86809f draft
Uploaded
| author | nicolas |
|---|---|
| date | Fri, 21 Oct 2016 06:28:10 -0400 |
| parents | |
| children | 8541f9a21aec |
comparison
equal
deleted
inserted
replaced
| 9:dcc10adbe46b | 10:f311dc86809f |
|---|---|
| 1 ######################################################## | |
| 2 # | |
| 3 # creation date : 07/01/16 | |
| 4 # last modification : 02/09/16 | |
| 5 # author : Dr Nicolas Beaume | |
| 6 # | |
| 7 ######################################################## | |
| 8 | |
| 9 log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt") | |
| 10 sink(file = log, type="message") | |
| 11 | |
| 12 library(randomForest) | |
| 13 #library(pRF) | |
| 14 | |
| 15 ############################ helper functions ####################### | |
| 16 optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), | |
| 17 rangeMtry=seq(ceiling(ncol(genotype)/10), | |
| 18 ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)), | |
| 19 repet=3) { | |
| 20 acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry)) | |
| 21 indexNtree <- 1 | |
| 22 for(ntree in rangeNtree) { | |
| 23 indexMtry <- 1 | |
| 24 for(mtry in rangeMtry) { | |
| 25 tempAcc <- NULL | |
| 26 for(i in 1:repet) { | |
| 27 n <- ceiling(nrow(genotype)/3) | |
| 28 indexTest <- sample(1:nrow(genotype), size=n) | |
| 29 train <- genotype[-indexTest,] | |
| 30 test <- genotype[indexTest,] | |
| 31 phenoTrain <- phenotype[-indexTest] | |
| 32 phenoTest <- phenotype[indexTest] | |
| 33 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) | |
| 34 pred <- predict(model, test) | |
| 35 tempAcc <- c(tempAcc, r2(phenoTest, pred)) | |
| 36 } | |
| 37 acc[indexNtree,indexMtry] <- mean(tempAcc) | |
| 38 indexMtry <- indexMtry+1 | |
| 39 } | |
| 40 indexNtree <- indexNtree+1 | |
| 41 } | |
| 42 colnames(acc) <- rangeMtry | |
| 43 rownames(acc) <- rangeNtree | |
| 44 bestParam <- which(acc==max(acc), arr.ind = T) | |
| 45 return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]])) | |
| 46 } | |
| 47 | |
| 48 r2 <- function(target, prediction) { | |
| 49 sst <- sum((target-mean(target))^2) | |
| 50 ssr <- sum((target-prediction)^2) | |
| 51 return(1-ssr/sst) | |
| 52 } | |
| 53 ################################## main function ########################### | |
| 54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) { | |
| 55 | |
| 56 # go for optimization | |
| 57 if(is.null(mtry) | is.null(ntree)) { | |
| 58 if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))} | |
| 59 if(is.null(ntree)) {ntree <- seq(100,1000,100)} | |
| 60 opt <- optimize(genotype=genotype, phenotype=phenotype, | |
| 61 rangeNtree = ntree, rangeMtry = mtry) | |
| 62 mtry <- opt$mtry | |
| 63 ntree <- opt$ntree | |
| 64 } | |
| 65 # evaluation | |
| 66 if(evaluation) { | |
| 67 prediction <- NULL | |
| 68 for(i in 1:length(folds)) { | |
| 69 train <- genotype[-folds[[i]],] | |
| 70 test <- genotype[folds[[i]],] | |
| 71 phenoTrain <- phenotype[-folds[[i]]] | |
| 72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) | |
| 73 prediction <- c(prediction, list(predict(rf, test))) | |
| 74 } | |
| 75 saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) | |
| 76 } else { | |
| 77 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) | |
| 78 saveRDS(model, file = paste(outFile, ".rds", sep = "")) | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 | |
| 83 ############################ main ############################# | |
| 84 cmd <- commandArgs(T) | |
| 85 source(cmd[1]) | |
| 86 # load classifier parameters | |
| 87 mtry <- as.numeric(mtry) | |
| 88 ntree <- as.numeric(ntree) | |
| 89 if(mtry == -1) {mtry <- NULL} | |
| 90 # check if evaluation is required | |
| 91 evaluation <- F | |
| 92 if(as.integer(doEvaluation) == 1) { | |
| 93 evaluation <- T | |
| 94 con = file(folds) | |
| 95 folds <- readLines(con = con, n = 1, ok=T) | |
| 96 close(con) | |
| 97 folds <- readRDS(folds) | |
| 98 } | |
| 99 # load genotype and phenotype | |
| 100 con = file(genotype) | |
| 101 genotype <- readLines(con = con, n = 1, ok=T) | |
| 102 close(con) | |
| 103 genotype <- read.table(genotype, sep="\t", h=T) | |
| 104 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | |
| 105 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | |
| 106 # run ! | |
| 107 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, | |
| 108 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) | |
| 109 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |
