view lasso.R @ 22:f0d89ff35ad2 draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 10:35:13 -0400
parents dcc10adbe46b
children 2e66da6efc41
line wrap: on
line source

########################################################
#
# creation date : 08/01/16
# last modification : 01/09/16
# author : Dr Nicolas Beaume
# owner : IRRI
#
########################################################
log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt")
sink(file = log, type="message")

library(glmnet)
library(methods)
############################ helper functions #######################

createFolds <- function(nbObs, n) {
  index <- sample(1:n, size=nbObs, replace = T)
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}

optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) {
  acc <- NULL
  indexAlpha <- 1
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:nfolds) {
      n <- ceiling(nrow(genotype)/3)
      indexTest <- sample(1:nrow(genotype), size=n)
      train <- genotype[-indexTest,]
      test <- genotype[indexTest,]
      phenoTrain <- phenotype[-indexTest]
      phenoTest <- phenotype[indexTest]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, test, type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}

r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
################################## main function ###########################

lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) {
  # go for optimization
  if(is.null(alpha)) {
    alpha <- seq(0,1,0.1)
    alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha)
  }
  # evaluation
  if(evaluation) {
    prediction <- NULL
    for(i in 1:length(folds)) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha)
      lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
      prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1]))
    }
    saveRDS(prediction, file=paste(outFile,".rds", sep=""))
    # just create a model
  } else {
    cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha)
    model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
    saveRDS(model, file = paste(outFile, ".rds", sep = ""))
  }
}

############################ main #############################
# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
# lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file 
## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype).
# please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used)

## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R).
# please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)

## -e : do evaluation instead of producing a model

## -f : index of the folds to which belong each individual
# please note that the list must be called "folds" (automatic if folds.R was used)

## -o : path to the output folder and generic name of the analysis. The extension related to each type of
# files are automatically added

cmd <- commandArgs(T)
source(cmd[1])
# check if evaluation is required
evaluation <- F
if(as.integer(doEvaluation) == 1) {
  evaluation <- T
  con = file(folds)
  folds <- readLines(con = con, n = 1, ok=T)
  close(con)
  folds <- readRDS(folds)
}
# load classifier parameters
if(as.numeric(alpha) == -1) {alpha <- NULL}
# load genotype and phenotype
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 
# run !
lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype,
               evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))