| 
9
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 08/01/16
 | 
| 
 | 
     4 # last modification : 01/09/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 # owner : IRRI
 | 
| 
 | 
     7 #
 | 
| 
 | 
     8 ########################################################
 | 
| 
 | 
     9 log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt")
 | 
| 
 | 
    10 sink(file = log, type="message")
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 library(glmnet)
 | 
| 
 | 
    13 library(methods)
 | 
| 
 | 
    14 ############################ helper functions #######################
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 createFolds <- function(nbObs, n) {
 | 
| 
 | 
    17   index <- sample(1:n, size=nbObs, replace = T)
 | 
| 
 | 
    18   folds <- NULL
 | 
| 
 | 
    19   for(i in 1:n) {
 | 
| 
 | 
    20     folds <- c(folds, list(which(index==i)))
 | 
| 
 | 
    21   }
 | 
| 
 | 
    22   return(folds)
 | 
| 
 | 
    23 }
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) {
 | 
| 
 | 
    26   acc <- NULL
 | 
| 
 | 
    27   indexAlpha <- 1
 | 
| 
 | 
    28   for(a in alpha) {
 | 
| 
 | 
    29     curAcc <- NULL
 | 
| 
 | 
    30     for(i in 1:nfolds) {
 | 
| 
 | 
    31       n <- ceiling(nrow(genotype)/3)
 | 
| 
 | 
    32       indexTest <- sample(1:nrow(genotype), size=n)
 | 
| 
 | 
    33       train <- genotype[-indexTest,]
 | 
| 
 | 
    34       test <- genotype[indexTest,]
 | 
| 
 | 
    35       phenoTrain <- phenotype[-indexTest]
 | 
| 
 | 
    36       phenoTest <- phenotype[indexTest]
 | 
| 
 | 
    37       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
 | 
| 
 | 
    38       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
 | 
| 
 | 
    39       pred <- predict(model, test, type = "response")
 | 
| 
 | 
    40       curAcc <- c(curAcc, r2(phenoTest, pred))
 | 
| 
 | 
    41     }
 | 
| 
 | 
    42     acc <- c(acc, mean(curAcc))
 | 
| 
 | 
    43   }
 | 
| 
 | 
    44   names(acc) <- alpha
 | 
| 
 | 
    45   return(as.numeric(names(acc)[which.max(acc)]))
 | 
| 
 | 
    46 }
 | 
| 
 | 
    47 
 | 
| 
 | 
    48 r2 <- function(target, prediction) {
 | 
| 
 | 
    49   sst <- sum((target-mean(target))^2)
 | 
| 
 | 
    50   ssr <- sum((target-prediction)^2)
 | 
| 
 | 
    51   return(1-ssr/sst)
 | 
| 
 | 
    52 }
 | 
| 
 | 
    53 ################################## main function ###########################
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) {
 | 
| 
 | 
    56   # go for optimization
 | 
| 
 | 
    57   if(is.null(alpha)) {
 | 
| 
 | 
    58     alpha <- seq(0,1,0.1)
 | 
| 
 | 
    59     alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha)
 | 
| 
 | 
    60   }
 | 
| 
 | 
    61   # evaluation
 | 
| 
 | 
    62   if(evaluation) {
 | 
| 
 | 
    63     prediction <- NULL
 | 
| 
 | 
    64     for(i in 1:length(folds)) {
 | 
| 
 | 
    65       train <- genotype[-folds[[i]],]
 | 
| 
 | 
    66       test <- genotype[folds[[i]],]
 | 
| 
 | 
    67       phenoTrain <- phenotype[-folds[[i]]]
 | 
| 
 | 
    68       phenoTest <- phenotype[folds[[i]]]
 | 
| 
 | 
    69       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha)
 | 
| 
 | 
    70       lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
 | 
| 
 | 
    71       prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1]))
 | 
| 
 | 
    72     }
 | 
| 
 | 
    73     saveRDS(prediction, file=paste(outFile,".rds", sep=""))
 | 
| 
 | 
    74     # just create a model
 | 
| 
 | 
    75   } else {
 | 
| 
 | 
    76     cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha)
 | 
| 
 | 
    77     model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
 | 
| 
 | 
    78     saveRDS(model, file = paste(outFile, ".rds", sep = ""))
 | 
| 
 | 
    79   }
 | 
| 
 | 
    80 }
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 ############################ main #############################
 | 
| 
 | 
    83 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
 | 
| 
 | 
    84 # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file 
 | 
| 
 | 
    85 ## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype).
 | 
| 
 | 
    86 # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used)
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 ## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R).
 | 
| 
 | 
    89 # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 ## -e : do evaluation instead of producing a model
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 ## -f : index of the folds to which belong each individual
 | 
| 
 | 
    94 # please note that the list must be called "folds" (automatic if folds.R was used)
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 ## -o : path to the output folder and generic name of the analysis. The extension related to each type of
 | 
| 
 | 
    97 # files are automatically added
 | 
| 
 | 
    98 
 | 
| 
 | 
    99 cmd <- commandArgs(T)
 | 
| 
 | 
   100 source(cmd[1])
 | 
| 
 | 
   101 # check if evaluation is required
 | 
| 
 | 
   102 evaluation <- F
 | 
| 
 | 
   103 if(as.integer(doEvaluation) == 1) {
 | 
| 
 | 
   104   evaluation <- T
 | 
| 
 | 
   105   con = file(folds)
 | 
| 
 | 
   106   folds <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   107   close(con)
 | 
| 
 | 
   108   folds <- readRDS(folds)
 | 
| 
 | 
   109 }
 | 
| 
 | 
   110 # load classifier parameters
 | 
| 
 | 
   111 if(as.numeric(alpha) == -1) {alpha <- NULL}
 | 
| 
 | 
   112 # load genotype and phenotype
 | 
| 
 | 
   113 con = file(genotype)
 | 
| 
 | 
   114 genotype <- readLines(con = con, n = 1, ok=T)
 | 
| 
 | 
   115 close(con)
 | 
| 
 | 
   116 genotype <- read.table(genotype, sep="\t", h=T)
 | 
| 
 | 
   117 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
 | 
| 
 | 
   118 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 
 | 
| 
 | 
   119 # run !
 | 
| 
 | 
   120 lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype,
 | 
| 
 | 
   121                evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
 | 
| 
 | 
   122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
 |