| 
85
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 07/01/16
 | 
| 
 | 
     4 # last modification : 25/10/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 #
 | 
| 
 | 
     7 ########################################################
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 suppressWarnings(suppressMessages(library(randomForest)))
 | 
| 
 | 
    10 ############################ helper functions #######################
 | 
| 
 | 
    11 # optimize
 | 
| 
 | 
    12 optimize <- function(genotype, phenotype, ntree=1000, 
 | 
| 
 | 
    13                      rangeMtry=seq(ceiling(ncol(genotype)/5),
 | 
| 
 | 
    14                                    ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)),
 | 
| 
 | 
    15                      repet=3) {
 | 
| 
 | 
    16   # accuracy over all mtry values
 | 
| 
 | 
    17   acc <- NULL
 | 
| 
 | 
    18   for(mtry in rangeMtry) {
 | 
| 
 | 
    19     # to compute the mean accuracy over repetiotion for the current mtry value
 | 
| 
 | 
    20     tempAcc <- NULL
 | 
| 
 | 
    21     for(i in 1:repet) {
 | 
| 
 | 
    22       # 1/3 of the dataset is used as test set
 | 
| 
 | 
    23       n <- ceiling(nrow(genotype)/3)
 | 
| 
 | 
    24       indexTest <- sample(1:nrow(genotype), size=n)
 | 
| 
 | 
    25       # create training and test set
 | 
| 
 | 
    26       train <- genotype[-indexTest,]
 | 
| 
 | 
    27       test <- genotype[indexTest,]
 | 
| 
 | 
    28       phenoTrain <- phenotype[-indexTest]
 | 
| 
 | 
    29       phenoTest <- phenotype[indexTest]
 | 
| 
 | 
    30       # compute model
 | 
| 
 | 
    31       model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
 | 
| 
 | 
    32       # predict on test set and compute accuracy
 | 
| 
 | 
    33       pred <- predict(model, test)
 | 
| 
 | 
    34       tempAcc <- c(tempAcc, r2(phenoTest, pred))
 | 
| 
 | 
    35     }
 | 
| 
 | 
    36     # find mean accuracy for the current mtry value
 | 
| 
 | 
    37     acc <- c(acc, mean(tempAcc))
 | 
| 
 | 
    38   }
 | 
| 
 | 
    39   # return mtry for the best accuracy
 | 
| 
 | 
    40   names(acc) <- rangeMtry
 | 
| 
 | 
    41   bestParam <- which.max(acc)
 | 
| 
 | 
    42   return(rangeMtry[bestParam])
 | 
| 
 | 
    43 }
 | 
| 
 | 
    44 
 | 
| 
 | 
    45 # compute r2 by computing the classic formula
 | 
| 
 | 
    46 # compare the sum of square difference from target to prediciton
 | 
| 
 | 
    47 # to the sum of square difference from target to the mean of the target
 | 
| 
 | 
    48 r2 <- function(target, prediction) {
 | 
| 
 | 
    49   sst <- sum((target-mean(target))^2)
 | 
| 
 | 
    50   ssr <- sum((target-prediction)^2)
 | 
| 
 | 
    51   return(1-ssr/sst)
 | 
| 
 | 
    52 }
 | 
| 
 | 
    53 ################################## main function ###########################
 | 
| 
 | 
    54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) {
 | 
| 
 | 
    55   
 | 
| 
 | 
    56   # go for optimization
 | 
| 
 | 
    57   if(is.null(mtry)) {
 | 
| 
 | 
    58     # find best mtry
 | 
| 
 | 
    59     mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))
 | 
| 
 | 
    60     mtry <- optimize(genotype=genotype, phenotype=phenotype, 
 | 
| 
 | 
    61                     ntree = ntree, rangeMtry = mtry)
 | 
| 
 | 
    62   }
 | 
| 
 | 
    63   # evaluation
 | 
| 
 | 
    64   if(evaluation) {
 | 
| 
 | 
    65     prediction <- NULL
 | 
| 
 | 
    66     for(i in 1:length(folds)) {
 | 
| 
 | 
    67       # create training and testing set for the current fold
 | 
| 
 | 
    68       train <- genotype[-folds[[i]],]
 | 
| 
 | 
    69       test <- genotype[folds[[i]],]
 | 
| 
 | 
    70       phenoTrain <- phenotype[-folds[[i]]]
 | 
| 
 | 
    71       # compute model
 | 
| 
 | 
    72       rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
 | 
| 
 | 
    73       # predict and save prediction for the current fold
 | 
| 
 | 
    74       prediction <- c(prediction, list(predict(rf, test)))
 | 
| 
 | 
    75     }
 | 
| 
 | 
    76     # save preductions for all folds to be used for evaluation
 | 
| 
 | 
    77     saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
 | 
| 
 | 
    78   } else {
 | 
| 
 | 
    79     # just compute the model and save it
 | 
| 
 | 
    80     model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
 | 
| 
 | 
    81     saveRDS(model, file = paste(outFile, ".rds", sep = ""))
 | 
| 
 | 
    82   }
 | 
| 
 | 
    83 }
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 
 | 
| 
 | 
    86 ############################ main #############################
 | 
| 
 | 
    87 # load parameters
 | 
| 
 | 
    88 cmd <- commandArgs(T)
 | 
| 
 | 
    89 source(cmd[1])
 | 
| 
 | 
    90 # load classifier parameters
 | 
| 
 | 
    91 mtry <- as.numeric(mtry)
 | 
| 
 | 
    92 ntree <- as.numeric(ntree)
 | 
| 
 | 
    93 if(mtry == -1) {mtry <- NULL}
 | 
| 
 | 
    94 # check if evaluation is required
 | 
| 
 | 
    95 evaluation <- F
 | 
| 
 | 
    96 if(as.integer(doEvaluation) == 1) {
 | 
| 
 | 
    97   evaluation <- T
 | 
| 
 | 
    98   con = file(folds)
 | 
| 
 | 
    99   folds <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   100   close(con)
 | 
| 
 | 
   101   folds <- readRDS(folds)
 | 
| 
 | 
   102 }
 | 
| 
 | 
   103 # load genotype and phenotype
 | 
| 
 | 
   104 con = file(genotype)
 | 
| 
 | 
   105 genotype <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   106 close(con)
 | 
| 
 | 
   107 genotype <- read.table(genotype, sep="\t", h=T)
 | 
| 
 | 
   108 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 
 | 
| 
 | 
   109 # run !
 | 
| 
 | 
   110 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
 | 
| 
 | 
   111             evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
 | 
| 
 | 
   112 # send the path containing results to galaxy
 | 
| 
 | 
   113 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
 |