10
|
1 ########################################################
|
|
2 #
|
|
3 # creation date : 07/01/16
|
|
4 # last modification : 02/09/16
|
|
5 # author : Dr Nicolas Beaume
|
|
6 #
|
|
7 ########################################################
|
|
8
|
|
9 log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt")
|
|
10 sink(file = log, type="message")
|
|
11
|
|
12 library(randomForest)
|
|
13 #library(pRF)
|
|
14
|
|
15 ############################ helper functions #######################
|
|
16 optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100),
|
|
17 rangeMtry=seq(ceiling(ncol(genotype)/10),
|
|
18 ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)),
|
|
19 repet=3) {
|
|
20 acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry))
|
|
21 indexNtree <- 1
|
|
22 for(ntree in rangeNtree) {
|
|
23 indexMtry <- 1
|
|
24 for(mtry in rangeMtry) {
|
|
25 tempAcc <- NULL
|
|
26 for(i in 1:repet) {
|
|
27 n <- ceiling(nrow(genotype)/3)
|
|
28 indexTest <- sample(1:nrow(genotype), size=n)
|
|
29 train <- genotype[-indexTest,]
|
|
30 test <- genotype[indexTest,]
|
|
31 phenoTrain <- phenotype[-indexTest]
|
|
32 phenoTest <- phenotype[indexTest]
|
|
33 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
|
|
34 pred <- predict(model, test)
|
|
35 tempAcc <- c(tempAcc, r2(phenoTest, pred))
|
|
36 }
|
|
37 acc[indexNtree,indexMtry] <- mean(tempAcc)
|
|
38 indexMtry <- indexMtry+1
|
|
39 }
|
|
40 indexNtree <- indexNtree+1
|
|
41 }
|
|
42 colnames(acc) <- rangeMtry
|
|
43 rownames(acc) <- rangeNtree
|
|
44 bestParam <- which(acc==max(acc), arr.ind = T)
|
|
45 return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]]))
|
|
46 }
|
|
47
|
|
48 r2 <- function(target, prediction) {
|
|
49 sst <- sum((target-mean(target))^2)
|
|
50 ssr <- sum((target-prediction)^2)
|
|
51 return(1-ssr/sst)
|
|
52 }
|
|
53 ################################## main function ###########################
|
|
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) {
|
|
55
|
|
56 # go for optimization
|
|
57 if(is.null(mtry) | is.null(ntree)) {
|
|
58 if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))}
|
|
59 if(is.null(ntree)) {ntree <- seq(100,1000,100)}
|
|
60 opt <- optimize(genotype=genotype, phenotype=phenotype,
|
|
61 rangeNtree = ntree, rangeMtry = mtry)
|
|
62 mtry <- opt$mtry
|
|
63 ntree <- opt$ntree
|
|
64 }
|
|
65 # evaluation
|
|
66 if(evaluation) {
|
|
67 prediction <- NULL
|
|
68 for(i in 1:length(folds)) {
|
|
69 train <- genotype[-folds[[i]],]
|
|
70 test <- genotype[folds[[i]],]
|
|
71 phenoTrain <- phenotype[-folds[[i]]]
|
|
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
|
|
73 prediction <- c(prediction, list(predict(rf, test)))
|
|
74 }
|
|
75 saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
|
|
76 } else {
|
|
77 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
|
|
78 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
|
|
79 }
|
|
80 }
|
|
81
|
|
82
|
|
83 ############################ main #############################
|
|
84 cmd <- commandArgs(T)
|
|
85 source(cmd[1])
|
|
86 # load classifier parameters
|
|
87 mtry <- as.numeric(mtry)
|
|
88 ntree <- as.numeric(ntree)
|
|
89 if(mtry == -1) {mtry <- NULL}
|
|
90 # check if evaluation is required
|
|
91 evaluation <- F
|
|
92 if(as.integer(doEvaluation) == 1) {
|
|
93 evaluation <- T
|
|
94 con = file(folds)
|
|
95 folds <- readLines(con = con, n = 1, ok=T)
|
|
96 close(con)
|
|
97 folds <- readRDS(folds)
|
|
98 }
|
|
99 # load genotype and phenotype
|
|
100 con = file(genotype)
|
|
101 genotype <- readLines(con = con, n = 1, ok=T)
|
|
102 close(con)
|
|
103 genotype <- read.table(genotype, sep="\t", h=T)
|
|
104 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
|
|
105 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
|
|
106 # run !
|
|
107 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
|
|
108 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
|
|
109 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
|