Mercurial > repos > nicolas > oghma
changeset 9:dcc10adbe46b draft
Uploaded
author | nicolas |
---|---|
date | Fri, 21 Oct 2016 06:27:31 -0400 |
parents | 2a613e397874 |
children | f311dc86809f |
files | lasso.R |
diffstat | 1 files changed, 122 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lasso.R Fri Oct 21 06:27:31 2016 -0400 @@ -0,0 +1,122 @@ +######################################################## +# +# creation date : 08/01/16 +# last modification : 01/09/16 +# author : Dr Nicolas Beaume +# owner : IRRI +# +######################################################## +log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt") +sink(file = log, type="message") + +library(glmnet) +library(methods) +############################ helper functions ####################### + +createFolds <- function(nbObs, n) { + index <- sample(1:n, size=nbObs, replace = T) + folds <- NULL + for(i in 1:n) { + folds <- c(folds, list(which(index==i))) + } + return(folds) +} + +optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { + acc <- NULL + indexAlpha <- 1 + for(a in alpha) { + curAcc <- NULL + for(i in 1:nfolds) { + n <- ceiling(nrow(genotype)/3) + indexTest <- sample(1:nrow(genotype), size=n) + train <- genotype[-indexTest,] + test <- genotype[indexTest,] + phenoTrain <- phenotype[-indexTest] + phenoTest <- phenotype[indexTest] + cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) + model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) + pred <- predict(model, test, type = "response") + curAcc <- c(curAcc, r2(phenoTest, pred)) + } + acc <- c(acc, mean(curAcc)) + } + names(acc) <- alpha + return(as.numeric(names(acc)[which.max(acc)])) +} + +r2 <- function(target, prediction) { + sst <- sum((target-mean(target))^2) + ssr <- sum((target-prediction)^2) + return(1-ssr/sst) +} +################################## main function ########################### + +lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { + # go for optimization + if(is.null(alpha)) { + alpha <- seq(0,1,0.1) + alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) + } + # evaluation + if(evaluation) { + prediction <- NULL + for(i in 1:length(folds)) { + train <- genotype[-folds[[i]],] + test <- genotype[folds[[i]],] + phenoTrain <- phenotype[-folds[[i]]] + phenoTest <- phenotype[folds[[i]]] + cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) + lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) + prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) + } + saveRDS(prediction, file=paste(outFile,".rds", sep="")) + # just create a model + } else { + cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) + model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) + saveRDS(model, file = paste(outFile, ".rds", sep = "")) + } +} + +############################ main ############################# +# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : +# lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file +## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype). +# please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used) + +## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R). +# please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) + +## -e : do evaluation instead of producing a model + +## -f : index of the folds to which belong each individual +# please note that the list must be called "folds" (automatic if folds.R was used) + +## -o : path to the output folder and generic name of the analysis. The extension related to each type of +# files are automatically added + +cmd <- commandArgs(T) +source(cmd[1]) +# check if evaluation is required +evaluation <- F +if(as.integer(doEvaluation) == 1) { + evaluation <- T + con = file(folds) + folds <- readLines(con = con, n = 1, ok=T) + close(con) + folds <- readRDS(folds) +} +# load classifier parameters +if(as.numeric(alpha) == -1) {alpha <- NULL} +# load genotype and phenotype +con = file(genotype) +genotype <- readLines(con = con, n = 1, ok=T) +close(con) +genotype <- read.table(genotype, sep="\t", h=T) +# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve +phenotype <- read.table(phenotype, sep="\t", h=T)[,1] +# run ! +lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, + evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) +cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))