85
|
1 ########################################################
|
|
2 #
|
|
3 # creation date : 07/01/16
|
|
4 # last modification : 25/10/16
|
|
5 # author : Dr Nicolas Beaume
|
|
6 #
|
|
7 ########################################################
|
|
8
|
|
9 suppressWarnings(suppressMessages(library(randomForest)))
|
|
10 ############################ helper functions #######################
|
|
11 # optimize
|
|
12 optimize <- function(genotype, phenotype, ntree=1000,
|
|
13 rangeMtry=seq(ceiling(ncol(genotype)/5),
|
|
14 ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)),
|
|
15 repet=3) {
|
|
16 # accuracy over all mtry values
|
|
17 acc <- NULL
|
|
18 for(mtry in rangeMtry) {
|
|
19 # to compute the mean accuracy over repetiotion for the current mtry value
|
|
20 tempAcc <- NULL
|
|
21 for(i in 1:repet) {
|
|
22 # 1/3 of the dataset is used as test set
|
|
23 n <- ceiling(nrow(genotype)/3)
|
|
24 indexTest <- sample(1:nrow(genotype), size=n)
|
|
25 # create training and test set
|
|
26 train <- genotype[-indexTest,]
|
|
27 test <- genotype[indexTest,]
|
|
28 phenoTrain <- phenotype[-indexTest]
|
|
29 phenoTest <- phenotype[indexTest]
|
|
30 # compute model
|
|
31 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
|
|
32 # predict on test set and compute accuracy
|
|
33 pred <- predict(model, test)
|
|
34 tempAcc <- c(tempAcc, r2(phenoTest, pred))
|
|
35 }
|
|
36 # find mean accuracy for the current mtry value
|
|
37 acc <- c(acc, mean(tempAcc))
|
|
38 }
|
|
39 # return mtry for the best accuracy
|
|
40 names(acc) <- rangeMtry
|
|
41 bestParam <- which.max(acc)
|
|
42 return(rangeMtry[bestParam])
|
|
43 }
|
|
44
|
|
45 # compute r2 by computing the classic formula
|
|
46 # compare the sum of square difference from target to prediciton
|
|
47 # to the sum of square difference from target to the mean of the target
|
|
48 r2 <- function(target, prediction) {
|
|
49 sst <- sum((target-mean(target))^2)
|
|
50 ssr <- sum((target-prediction)^2)
|
|
51 return(1-ssr/sst)
|
|
52 }
|
|
53 ################################## main function ###########################
|
|
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) {
|
|
55
|
|
56 # go for optimization
|
|
57 if(is.null(mtry)) {
|
|
58 # find best mtry
|
|
59 mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))
|
|
60 mtry <- optimize(genotype=genotype, phenotype=phenotype,
|
|
61 ntree = ntree, rangeMtry = mtry)
|
|
62 }
|
|
63 # evaluation
|
|
64 if(evaluation) {
|
|
65 prediction <- NULL
|
|
66 for(i in 1:length(folds)) {
|
|
67 # create training and testing set for the current fold
|
|
68 train <- genotype[-folds[[i]],]
|
|
69 test <- genotype[folds[[i]],]
|
|
70 phenoTrain <- phenotype[-folds[[i]]]
|
|
71 # compute model
|
|
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
|
|
73 # predict and save prediction for the current fold
|
|
74 prediction <- c(prediction, list(predict(rf, test)))
|
|
75 }
|
|
76 # save preductions for all folds to be used for evaluation
|
|
77 saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
|
|
78 } else {
|
|
79 # just compute the model and save it
|
|
80 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
|
|
81 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
|
|
82 }
|
|
83 }
|
|
84
|
|
85
|
|
86 ############################ main #############################
|
|
87 # load parameters
|
|
88 cmd <- commandArgs(T)
|
|
89 source(cmd[1])
|
|
90 # load classifier parameters
|
|
91 mtry <- as.numeric(mtry)
|
|
92 ntree <- as.numeric(ntree)
|
|
93 if(mtry == -1) {mtry <- NULL}
|
|
94 # check if evaluation is required
|
|
95 evaluation <- F
|
|
96 if(as.integer(doEvaluation) == 1) {
|
|
97 evaluation <- T
|
|
98 con = file(folds)
|
|
99 folds <- readLines(con = con, n = 1, ok=T)
|
|
100 close(con)
|
|
101 folds <- readRDS(folds)
|
|
102 }
|
|
103 # load genotype and phenotype
|
|
104 con = file(genotype)
|
|
105 genotype <- readLines(con = con, n = 1, ok=T)
|
|
106 close(con)
|
|
107 genotype <- read.table(genotype, sep="\t", h=T)
|
|
108 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
|
|
109 # run !
|
|
110 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
|
|
111 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
|
|
112 # send the path containing results to galaxy
|
|
113 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
|