annotate randomForest.R @ 101:edced3c17e3b draft

Deleted selected files
author nicolas
date Mon, 31 Oct 2016 07:20:20 -0400
parents 94aa63659613
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
85
94aa63659613 Uploaded
nicolas
parents:
diff changeset
1 ########################################################
94aa63659613 Uploaded
nicolas
parents:
diff changeset
2 #
94aa63659613 Uploaded
nicolas
parents:
diff changeset
3 # creation date : 07/01/16
94aa63659613 Uploaded
nicolas
parents:
diff changeset
4 # last modification : 25/10/16
94aa63659613 Uploaded
nicolas
parents:
diff changeset
5 # author : Dr Nicolas Beaume
94aa63659613 Uploaded
nicolas
parents:
diff changeset
6 #
94aa63659613 Uploaded
nicolas
parents:
diff changeset
7 ########################################################
94aa63659613 Uploaded
nicolas
parents:
diff changeset
8
94aa63659613 Uploaded
nicolas
parents:
diff changeset
9 suppressWarnings(suppressMessages(library(randomForest)))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
10 ############################ helper functions #######################
94aa63659613 Uploaded
nicolas
parents:
diff changeset
11 # optimize
94aa63659613 Uploaded
nicolas
parents:
diff changeset
12 optimize <- function(genotype, phenotype, ntree=1000,
94aa63659613 Uploaded
nicolas
parents:
diff changeset
13 rangeMtry=seq(ceiling(ncol(genotype)/5),
94aa63659613 Uploaded
nicolas
parents:
diff changeset
14 ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)),
94aa63659613 Uploaded
nicolas
parents:
diff changeset
15 repet=3) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
16 # accuracy over all mtry values
94aa63659613 Uploaded
nicolas
parents:
diff changeset
17 acc <- NULL
94aa63659613 Uploaded
nicolas
parents:
diff changeset
18 for(mtry in rangeMtry) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
19 # to compute the mean accuracy over repetiotion for the current mtry value
94aa63659613 Uploaded
nicolas
parents:
diff changeset
20 tempAcc <- NULL
94aa63659613 Uploaded
nicolas
parents:
diff changeset
21 for(i in 1:repet) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
22 # 1/3 of the dataset is used as test set
94aa63659613 Uploaded
nicolas
parents:
diff changeset
23 n <- ceiling(nrow(genotype)/3)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
24 indexTest <- sample(1:nrow(genotype), size=n)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
25 # create training and test set
94aa63659613 Uploaded
nicolas
parents:
diff changeset
26 train <- genotype[-indexTest,]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
27 test <- genotype[indexTest,]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
28 phenoTrain <- phenotype[-indexTest]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
29 phenoTest <- phenotype[indexTest]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
30 # compute model
94aa63659613 Uploaded
nicolas
parents:
diff changeset
31 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
32 # predict on test set and compute accuracy
94aa63659613 Uploaded
nicolas
parents:
diff changeset
33 pred <- predict(model, test)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
34 tempAcc <- c(tempAcc, r2(phenoTest, pred))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
35 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
36 # find mean accuracy for the current mtry value
94aa63659613 Uploaded
nicolas
parents:
diff changeset
37 acc <- c(acc, mean(tempAcc))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
38 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
39 # return mtry for the best accuracy
94aa63659613 Uploaded
nicolas
parents:
diff changeset
40 names(acc) <- rangeMtry
94aa63659613 Uploaded
nicolas
parents:
diff changeset
41 bestParam <- which.max(acc)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
42 return(rangeMtry[bestParam])
94aa63659613 Uploaded
nicolas
parents:
diff changeset
43 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
44
94aa63659613 Uploaded
nicolas
parents:
diff changeset
45 # compute r2 by computing the classic formula
94aa63659613 Uploaded
nicolas
parents:
diff changeset
46 # compare the sum of square difference from target to prediciton
94aa63659613 Uploaded
nicolas
parents:
diff changeset
47 # to the sum of square difference from target to the mean of the target
94aa63659613 Uploaded
nicolas
parents:
diff changeset
48 r2 <- function(target, prediction) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
49 sst <- sum((target-mean(target))^2)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
50 ssr <- sum((target-prediction)^2)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
51 return(1-ssr/sst)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
52 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
53 ################################## main function ###########################
94aa63659613 Uploaded
nicolas
parents:
diff changeset
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
55
94aa63659613 Uploaded
nicolas
parents:
diff changeset
56 # go for optimization
94aa63659613 Uploaded
nicolas
parents:
diff changeset
57 if(is.null(mtry)) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
58 # find best mtry
94aa63659613 Uploaded
nicolas
parents:
diff changeset
59 mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
60 mtry <- optimize(genotype=genotype, phenotype=phenotype,
94aa63659613 Uploaded
nicolas
parents:
diff changeset
61 ntree = ntree, rangeMtry = mtry)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
62 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
63 # evaluation
94aa63659613 Uploaded
nicolas
parents:
diff changeset
64 if(evaluation) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
65 prediction <- NULL
94aa63659613 Uploaded
nicolas
parents:
diff changeset
66 for(i in 1:length(folds)) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
67 # create training and testing set for the current fold
94aa63659613 Uploaded
nicolas
parents:
diff changeset
68 train <- genotype[-folds[[i]],]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
69 test <- genotype[folds[[i]],]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
70 phenoTrain <- phenotype[-folds[[i]]]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
71 # compute model
94aa63659613 Uploaded
nicolas
parents:
diff changeset
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
73 # predict and save prediction for the current fold
94aa63659613 Uploaded
nicolas
parents:
diff changeset
74 prediction <- c(prediction, list(predict(rf, test)))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
75 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
76 # save preductions for all folds to be used for evaluation
94aa63659613 Uploaded
nicolas
parents:
diff changeset
77 saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
78 } else {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
79 # just compute the model and save it
94aa63659613 Uploaded
nicolas
parents:
diff changeset
80 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
81 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
94aa63659613 Uploaded
nicolas
parents:
diff changeset
82 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
83 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
84
94aa63659613 Uploaded
nicolas
parents:
diff changeset
85
94aa63659613 Uploaded
nicolas
parents:
diff changeset
86 ############################ main #############################
94aa63659613 Uploaded
nicolas
parents:
diff changeset
87 # load parameters
94aa63659613 Uploaded
nicolas
parents:
diff changeset
88 cmd <- commandArgs(T)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
89 source(cmd[1])
94aa63659613 Uploaded
nicolas
parents:
diff changeset
90 # load classifier parameters
94aa63659613 Uploaded
nicolas
parents:
diff changeset
91 mtry <- as.numeric(mtry)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
92 ntree <- as.numeric(ntree)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
93 if(mtry == -1) {mtry <- NULL}
94aa63659613 Uploaded
nicolas
parents:
diff changeset
94 # check if evaluation is required
94aa63659613 Uploaded
nicolas
parents:
diff changeset
95 evaluation <- F
94aa63659613 Uploaded
nicolas
parents:
diff changeset
96 if(as.integer(doEvaluation) == 1) {
94aa63659613 Uploaded
nicolas
parents:
diff changeset
97 evaluation <- T
94aa63659613 Uploaded
nicolas
parents:
diff changeset
98 con = file(folds)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
99 folds <- readLines(con = con, n = 1, ok=T)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
100 close(con)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
101 folds <- readRDS(folds)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
102 }
94aa63659613 Uploaded
nicolas
parents:
diff changeset
103 # load genotype and phenotype
94aa63659613 Uploaded
nicolas
parents:
diff changeset
104 con = file(genotype)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
105 genotype <- readLines(con = con, n = 1, ok=T)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
106 close(con)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
107 genotype <- read.table(genotype, sep="\t", h=T)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
108 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
94aa63659613 Uploaded
nicolas
parents:
diff changeset
109 # run !
94aa63659613 Uploaded
nicolas
parents:
diff changeset
110 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
94aa63659613 Uploaded
nicolas
parents:
diff changeset
111 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
94aa63659613 Uploaded
nicolas
parents:
diff changeset
112 # send the path containing results to galaxy
94aa63659613 Uploaded
nicolas
parents:
diff changeset
113 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))