comparison randomForest.R @ 10:f311dc86809f draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 06:28:10 -0400
parents
children 8541f9a21aec
comparison
equal deleted inserted replaced
9:dcc10adbe46b 10:f311dc86809f
1 ########################################################
2 #
3 # creation date : 07/01/16
4 # last modification : 02/09/16
5 # author : Dr Nicolas Beaume
6 #
7 ########################################################
8
9 log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt")
10 sink(file = log, type="message")
11
12 library(randomForest)
13 #library(pRF)
14
15 ############################ helper functions #######################
16 optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100),
17 rangeMtry=seq(ceiling(ncol(genotype)/10),
18 ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)),
19 repet=3) {
20 acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry))
21 indexNtree <- 1
22 for(ntree in rangeNtree) {
23 indexMtry <- 1
24 for(mtry in rangeMtry) {
25 tempAcc <- NULL
26 for(i in 1:repet) {
27 n <- ceiling(nrow(genotype)/3)
28 indexTest <- sample(1:nrow(genotype), size=n)
29 train <- genotype[-indexTest,]
30 test <- genotype[indexTest,]
31 phenoTrain <- phenotype[-indexTest]
32 phenoTest <- phenotype[indexTest]
33 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
34 pred <- predict(model, test)
35 tempAcc <- c(tempAcc, r2(phenoTest, pred))
36 }
37 acc[indexNtree,indexMtry] <- mean(tempAcc)
38 indexMtry <- indexMtry+1
39 }
40 indexNtree <- indexNtree+1
41 }
42 colnames(acc) <- rangeMtry
43 rownames(acc) <- rangeNtree
44 bestParam <- which(acc==max(acc), arr.ind = T)
45 return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]]))
46 }
47
48 r2 <- function(target, prediction) {
49 sst <- sum((target-mean(target))^2)
50 ssr <- sum((target-prediction)^2)
51 return(1-ssr/sst)
52 }
53 ################################## main function ###########################
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) {
55
56 # go for optimization
57 if(is.null(mtry) | is.null(ntree)) {
58 if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))}
59 if(is.null(ntree)) {ntree <- seq(100,1000,100)}
60 opt <- optimize(genotype=genotype, phenotype=phenotype,
61 rangeNtree = ntree, rangeMtry = mtry)
62 mtry <- opt$mtry
63 ntree <- opt$ntree
64 }
65 # evaluation
66 if(evaluation) {
67 prediction <- NULL
68 for(i in 1:length(folds)) {
69 train <- genotype[-folds[[i]],]
70 test <- genotype[folds[[i]],]
71 phenoTrain <- phenotype[-folds[[i]]]
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
73 prediction <- c(prediction, list(predict(rf, test)))
74 }
75 saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
76 } else {
77 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
78 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
79 }
80 }
81
82
83 ############################ main #############################
84 cmd <- commandArgs(T)
85 source(cmd[1])
86 # load classifier parameters
87 mtry <- as.numeric(mtry)
88 ntree <- as.numeric(ntree)
89 if(mtry == -1) {mtry <- NULL}
90 # check if evaluation is required
91 evaluation <- F
92 if(as.integer(doEvaluation) == 1) {
93 evaluation <- T
94 con = file(folds)
95 folds <- readLines(con = con, n = 1, ok=T)
96 close(con)
97 folds <- readRDS(folds)
98 }
99 # load genotype and phenotype
100 con = file(genotype)
101 genotype <- readLines(con = con, n = 1, ok=T)
102 close(con)
103 genotype <- read.table(genotype, sep="\t", h=T)
104 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
105 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
106 # run !
107 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
108 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
109 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))