Mercurial > repos > nicolas > oghma
view lasso.R @ 22:f0d89ff35ad2 draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 10:35:13 -0400 |
parents | dcc10adbe46b |
children | 2e66da6efc41 |
line wrap: on
line source
######################################################## # # creation date : 08/01/16 # last modification : 01/09/16 # author : Dr Nicolas Beaume # owner : IRRI # ######################################################## log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt") sink(file = log, type="message") library(glmnet) library(methods) ############################ helper functions ####################### createFolds <- function(nbObs, n) { index <- sample(1:n, size=nbObs, replace = T) folds <- NULL for(i in 1:n) { folds <- c(folds, list(which(index==i))) } return(folds) } optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { acc <- NULL indexAlpha <- 1 for(a in alpha) { curAcc <- NULL for(i in 1:nfolds) { n <- ceiling(nrow(genotype)/3) indexTest <- sample(1:nrow(genotype), size=n) train <- genotype[-indexTest,] test <- genotype[indexTest,] phenoTrain <- phenotype[-indexTest] phenoTest <- phenotype[indexTest] cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) pred <- predict(model, test, type = "response") curAcc <- c(curAcc, r2(phenoTest, pred)) } acc <- c(acc, mean(curAcc)) } names(acc) <- alpha return(as.numeric(names(acc)[which.max(acc)])) } r2 <- function(target, prediction) { sst <- sum((target-mean(target))^2) ssr <- sum((target-prediction)^2) return(1-ssr/sst) } ################################## main function ########################### lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { # go for optimization if(is.null(alpha)) { alpha <- seq(0,1,0.1) alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) } # evaluation if(evaluation) { prediction <- NULL for(i in 1:length(folds)) { train <- genotype[-folds[[i]],] test <- genotype[folds[[i]],] phenoTrain <- phenotype[-folds[[i]]] phenoTest <- phenotype[folds[[i]]] cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) } saveRDS(prediction, file=paste(outFile,".rds", sep="")) # just create a model } else { cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) saveRDS(model, file = paste(outFile, ".rds", sep = "")) } } ############################ main ############################# # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file ## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype). # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used) ## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R). # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) ## -e : do evaluation instead of producing a model ## -f : index of the folds to which belong each individual # please note that the list must be called "folds" (automatic if folds.R was used) ## -o : path to the output folder and generic name of the analysis. The extension related to each type of # files are automatically added cmd <- commandArgs(T) source(cmd[1]) # check if evaluation is required evaluation <- F if(as.integer(doEvaluation) == 1) { evaluation <- T con = file(folds) folds <- readLines(con = con, n = 1, ok=T) close(con) folds <- readRDS(folds) } # load classifier parameters if(as.numeric(alpha) == -1) {alpha <- NULL} # load genotype and phenotype con = file(genotype) genotype <- readLines(con = con, n = 1, ok=T) close(con) genotype <- read.table(genotype, sep="\t", h=T) # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve phenotype <- read.table(phenotype, sep="\t", h=T)[,1] # run ! lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))