annotate randomForest.R @ 18:27fb6c2a98a3 draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 06:30:52 -0400
parents f311dc86809f
children 8541f9a21aec
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
f311dc86809f Uploaded
nicolas
parents:
diff changeset
1 ########################################################
f311dc86809f Uploaded
nicolas
parents:
diff changeset
2 #
f311dc86809f Uploaded
nicolas
parents:
diff changeset
3 # creation date : 07/01/16
f311dc86809f Uploaded
nicolas
parents:
diff changeset
4 # last modification : 02/09/16
f311dc86809f Uploaded
nicolas
parents:
diff changeset
5 # author : Dr Nicolas Beaume
f311dc86809f Uploaded
nicolas
parents:
diff changeset
6 #
f311dc86809f Uploaded
nicolas
parents:
diff changeset
7 ########################################################
f311dc86809f Uploaded
nicolas
parents:
diff changeset
8
f311dc86809f Uploaded
nicolas
parents:
diff changeset
9 log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt")
f311dc86809f Uploaded
nicolas
parents:
diff changeset
10 sink(file = log, type="message")
f311dc86809f Uploaded
nicolas
parents:
diff changeset
11
f311dc86809f Uploaded
nicolas
parents:
diff changeset
12 library(randomForest)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
13 #library(pRF)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
14
f311dc86809f Uploaded
nicolas
parents:
diff changeset
15 ############################ helper functions #######################
f311dc86809f Uploaded
nicolas
parents:
diff changeset
16 optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100),
f311dc86809f Uploaded
nicolas
parents:
diff changeset
17 rangeMtry=seq(ceiling(ncol(genotype)/10),
f311dc86809f Uploaded
nicolas
parents:
diff changeset
18 ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)),
f311dc86809f Uploaded
nicolas
parents:
diff changeset
19 repet=3) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
20 acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
21 indexNtree <- 1
f311dc86809f Uploaded
nicolas
parents:
diff changeset
22 for(ntree in rangeNtree) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
23 indexMtry <- 1
f311dc86809f Uploaded
nicolas
parents:
diff changeset
24 for(mtry in rangeMtry) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
25 tempAcc <- NULL
f311dc86809f Uploaded
nicolas
parents:
diff changeset
26 for(i in 1:repet) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
27 n <- ceiling(nrow(genotype)/3)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
28 indexTest <- sample(1:nrow(genotype), size=n)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
29 train <- genotype[-indexTest,]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
30 test <- genotype[indexTest,]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
31 phenoTrain <- phenotype[-indexTest]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
32 phenoTest <- phenotype[indexTest]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
33 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
34 pred <- predict(model, test)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
35 tempAcc <- c(tempAcc, r2(phenoTest, pred))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
36 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
37 acc[indexNtree,indexMtry] <- mean(tempAcc)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
38 indexMtry <- indexMtry+1
f311dc86809f Uploaded
nicolas
parents:
diff changeset
39 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
40 indexNtree <- indexNtree+1
f311dc86809f Uploaded
nicolas
parents:
diff changeset
41 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
42 colnames(acc) <- rangeMtry
f311dc86809f Uploaded
nicolas
parents:
diff changeset
43 rownames(acc) <- rangeNtree
f311dc86809f Uploaded
nicolas
parents:
diff changeset
44 bestParam <- which(acc==max(acc), arr.ind = T)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
45 return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]]))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
46 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
47
f311dc86809f Uploaded
nicolas
parents:
diff changeset
48 r2 <- function(target, prediction) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
49 sst <- sum((target-mean(target))^2)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
50 ssr <- sum((target-prediction)^2)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
51 return(1-ssr/sst)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
52 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
53 ################################## main function ###########################
f311dc86809f Uploaded
nicolas
parents:
diff changeset
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
55
f311dc86809f Uploaded
nicolas
parents:
diff changeset
56 # go for optimization
f311dc86809f Uploaded
nicolas
parents:
diff changeset
57 if(is.null(mtry) | is.null(ntree)) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
58 if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))}
f311dc86809f Uploaded
nicolas
parents:
diff changeset
59 if(is.null(ntree)) {ntree <- seq(100,1000,100)}
f311dc86809f Uploaded
nicolas
parents:
diff changeset
60 opt <- optimize(genotype=genotype, phenotype=phenotype,
f311dc86809f Uploaded
nicolas
parents:
diff changeset
61 rangeNtree = ntree, rangeMtry = mtry)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
62 mtry <- opt$mtry
f311dc86809f Uploaded
nicolas
parents:
diff changeset
63 ntree <- opt$ntree
f311dc86809f Uploaded
nicolas
parents:
diff changeset
64 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
65 # evaluation
f311dc86809f Uploaded
nicolas
parents:
diff changeset
66 if(evaluation) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
67 prediction <- NULL
f311dc86809f Uploaded
nicolas
parents:
diff changeset
68 for(i in 1:length(folds)) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
69 train <- genotype[-folds[[i]],]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
70 test <- genotype[folds[[i]],]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
71 phenoTrain <- phenotype[-folds[[i]]]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
73 prediction <- c(prediction, list(predict(rf, test)))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
74 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
75 saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
76 } else {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
77 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
78 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
f311dc86809f Uploaded
nicolas
parents:
diff changeset
79 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
80 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
81
f311dc86809f Uploaded
nicolas
parents:
diff changeset
82
f311dc86809f Uploaded
nicolas
parents:
diff changeset
83 ############################ main #############################
f311dc86809f Uploaded
nicolas
parents:
diff changeset
84 cmd <- commandArgs(T)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
85 source(cmd[1])
f311dc86809f Uploaded
nicolas
parents:
diff changeset
86 # load classifier parameters
f311dc86809f Uploaded
nicolas
parents:
diff changeset
87 mtry <- as.numeric(mtry)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
88 ntree <- as.numeric(ntree)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
89 if(mtry == -1) {mtry <- NULL}
f311dc86809f Uploaded
nicolas
parents:
diff changeset
90 # check if evaluation is required
f311dc86809f Uploaded
nicolas
parents:
diff changeset
91 evaluation <- F
f311dc86809f Uploaded
nicolas
parents:
diff changeset
92 if(as.integer(doEvaluation) == 1) {
f311dc86809f Uploaded
nicolas
parents:
diff changeset
93 evaluation <- T
f311dc86809f Uploaded
nicolas
parents:
diff changeset
94 con = file(folds)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
95 folds <- readLines(con = con, n = 1, ok=T)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
96 close(con)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
97 folds <- readRDS(folds)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
98 }
f311dc86809f Uploaded
nicolas
parents:
diff changeset
99 # load genotype and phenotype
f311dc86809f Uploaded
nicolas
parents:
diff changeset
100 con = file(genotype)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
101 genotype <- readLines(con = con, n = 1, ok=T)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
102 close(con)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
103 genotype <- read.table(genotype, sep="\t", h=T)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
104 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
f311dc86809f Uploaded
nicolas
parents:
diff changeset
105 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
f311dc86809f Uploaded
nicolas
parents:
diff changeset
106 # run !
f311dc86809f Uploaded
nicolas
parents:
diff changeset
107 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
f311dc86809f Uploaded
nicolas
parents:
diff changeset
108 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
f311dc86809f Uploaded
nicolas
parents:
diff changeset
109 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))