diff randomForest.R @ 40:8541f9a21aec draft

Uploaded
author nicolas
date Tue, 25 Oct 2016 14:42:32 -0400
parents f311dc86809f
children
line wrap: on
line diff
--- a/randomForest.R	Tue Oct 25 14:41:59 2016 -0400
+++ b/randomForest.R	Tue Oct 25 14:42:32 2016 -0400
@@ -1,79 +1,82 @@
 ########################################################
 #
 # creation date : 07/01/16
-# last modification : 02/09/16
+# last modification : 25/10/16
 # author : Dr Nicolas Beaume
 #
 ########################################################
 
-log <- file(paste(getwd(), "log_randomForest.txt", sep="/"), open = "wt")
-sink(file = log, type="message")
-
-library(randomForest)
-#library(pRF)
-
+suppressWarnings(suppressMessages(library(randomForest)))
 ############################ helper functions #######################
-optimize <- function(genotype, phenotype, rangeNtree=seq(100,1000,100), 
-                     rangeMtry=seq(ceiling(ncol(genotype)/10),
-                                   ceiling(ncol(genotype)/5), ceiling(ncol(genotype)/100)),
+# optimize
+optimize <- function(genotype, phenotype, ntree=1000, 
+                     rangeMtry=seq(ceiling(ncol(genotype)/5),
+                                   ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)),
                      repet=3) {
-  acc <- matrix(rep(-1, length(rangeNtree)*length(rangeMtry)), ncol=length(rangeMtry))
-  indexNtree <- 1
-  for(ntree in rangeNtree) {
-    indexMtry <- 1
-    for(mtry in rangeMtry) {
-      tempAcc <- NULL
-      for(i in 1:repet) {
-        n <- ceiling(nrow(genotype)/3)
-        indexTest <- sample(1:nrow(genotype), size=n)
-        train <- genotype[-indexTest,]
-        test <- genotype[indexTest,]
-        phenoTrain <- phenotype[-indexTest]
-        phenoTest <- phenotype[indexTest]
-        model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
-        pred <- predict(model, test)
-        tempAcc <- c(tempAcc, r2(phenoTest, pred))
-      }
-      acc[indexNtree,indexMtry] <- mean(tempAcc)
-      indexMtry <- indexMtry+1
+  # accuracy over all mtry values
+  acc <- NULL
+  for(mtry in rangeMtry) {
+    # to compute the mean accuracy over repetiotion for the current mtry value
+    tempAcc <- NULL
+    for(i in 1:repet) {
+      # 1/3 of the dataset is used as test set
+      n <- ceiling(nrow(genotype)/3)
+      indexTest <- sample(1:nrow(genotype), size=n)
+      # create training and test set
+      train <- genotype[-indexTest,]
+      test <- genotype[indexTest,]
+      phenoTrain <- phenotype[-indexTest]
+      phenoTest <- phenotype[indexTest]
+      # compute model
+      model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
+      # predict on test set and compute accuracy
+      pred <- predict(model, test)
+      tempAcc <- c(tempAcc, r2(phenoTest, pred))
     }
-    indexNtree <- indexNtree+1
+    # find mean accuracy for the current mtry value
+    acc <- c(acc, mean(tempAcc))
   }
-  colnames(acc) <- rangeMtry
-  rownames(acc) <- rangeNtree
-  bestParam <- which(acc==max(acc), arr.ind = T)
-  return(list(ntree=rangeNtree[bestParam[1,1]], mtry=rangeMtry[bestParam[1,2]]))
+  # return mtry for the best accuracy
+  names(acc) <- rangeMtry
+  bestParam <- which.max(acc)
+  return(rangeMtry[bestParam])
 }
 
+# compute r2 by computing the classic formula
+# compare the sum of square difference from target to prediciton
+# to the sum of square difference from target to the mean of the target
 r2 <- function(target, prediction) {
   sst <- sum((target-mean(target))^2)
   ssr <- sum((target-prediction)^2)
   return(1-ssr/sst)
 }
 ################################## main function ###########################
-rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=NULL) {
+rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) {
   
   # go for optimization
-  if(is.null(mtry) | is.null(ntree)) {
-    if(is.null(mtry)) {mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))}
-    if(is.null(ntree)) {ntree <- seq(100,1000,100)}
-    opt <- optimize(genotype=genotype, phenotype=phenotype, 
-                    rangeNtree = ntree, rangeMtry = mtry)
-    mtry <- opt$mtry
-    ntree <- opt$ntree
+  if(is.null(mtry)) {
+    # find best mtry
+    mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))
+    mtry <- optimize(genotype=genotype, phenotype=phenotype, 
+                    ntree = ntree, rangeMtry = mtry)
   }
   # evaluation
   if(evaluation) {
     prediction <- NULL
     for(i in 1:length(folds)) {
+      # create training and testing set for the current fold
       train <- genotype[-folds[[i]],]
       test <- genotype[folds[[i]],]
       phenoTrain <- phenotype[-folds[[i]]]
+      # compute model
       rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
+      # predict and save prediction for the current fold
       prediction <- c(prediction, list(predict(rf, test)))
     }
+    # save preductions for all folds to be used for evaluation
     saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
   } else {
+    # just compute the model and save it
     model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
     saveRDS(model, file = paste(outFile, ".rds", sep = ""))
   }
@@ -81,6 +84,7 @@
 
 
 ############################ main #############################
+# load parameters
 cmd <- commandArgs(T)
 source(cmd[1])
 # load classifier parameters
@@ -101,9 +105,9 @@
 genotype <- readLines(con = con, n = 1, ok=T)
 close(con)
 genotype <- read.table(genotype, sep="\t", h=T)
-# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 
 # run !
 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
             evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
+# send the path containing results to galaxy
 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))