annotate lasso.R @ 18:27fb6c2a98a3 draft

Uploaded
author nicolas
date Fri, 21 Oct 2016 06:30:52 -0400
parents dcc10adbe46b
children 2e66da6efc41
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
1 ########################################################
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
2 #
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
3 # creation date : 08/01/16
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
4 # last modification : 01/09/16
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
5 # author : Dr Nicolas Beaume
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
6 # owner : IRRI
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
7 #
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
8 ########################################################
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# Send R messages (warnings, errors, message() output) to a log file created
# in the current working directory.
log <- file(file.path(getwd(), "log_LASSO.txt"), open = "wt")
sink(log, type = "message")
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
11
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
12 library(glmnet)
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
13 library(methods)
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
14 ############################ helper functions #######################
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
15
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# Randomly partition observation indices into n cross-validation folds.
#
# Args:
#   nbObs: number of observations to distribute (single non-negative number).
#   n: number of folds to create (single number, at least 1).
#
# Returns:
#   A list of length n; element i holds the indices (in increasing order) of
#   the observations assigned to fold i. Every index in seq_len(nbObs)
#   appears in exactly one fold.
createFolds <- function(nbObs, n) {
  stopifnot(length(nbObs) == 1, nbObs >= 0, length(n) == 1, n >= 1)
  # Balanced assignment: fold labels are recycled over the observations and
  # then shuffled, so fold sizes differ by at most one and no fold is empty
  # when nbObs >= n. Independent draws with replacement could leave a fold
  # empty, which makes any model fitted on that fold fail.
  labels <- rep(seq_len(n), length.out = nbObs)
  index <- labels[sample.int(nbObs)]
  lapply(seq_len(n), function(i) which(index == i))
}
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
24
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# Choose the elastic-net mixing parameter (alpha) by repeated random hold-out.
#
# For every candidate alpha, nfolds random splits are drawn; each split holds
# out one third of the individuals (rounded up), fits glmnet on the rest with
# the lambda.1se value chosen by cv.glmnet, and scores the held-out
# predictions with r2().
#
# Args:
#   genotype: predictor table, one row per individual (coerced with as.matrix).
#   phenotype: response vector, one value per row of genotype.
#   alpha: candidate alpha values to compare.
#   nfolds: number of random train/test splits evaluated per candidate.
#
# Returns:
#   The candidate alpha with the highest mean r2 over the splits.
optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) {
  nbObs <- nrow(genotype)
  # size of the held-out set, identical for every split
  n <- ceiling(nbObs/3)
  acc <- numeric(length(alpha))
  for(k in seq_along(alpha)) {
    a <- alpha[k]
    curAcc <- numeric(nfolds)
    for(i in seq_len(nfolds)) {
      indexTest <- sample.int(nbObs, size=n)
      # drop = FALSE keeps a matrix even when a single row is selected
      train <- as.matrix(genotype[-indexTest, , drop = FALSE])
      test <- as.matrix(genotype[indexTest, , drop = FALSE])
      phenoTrain <- phenotype[-indexTest]
      phenoTest <- phenotype[indexTest]
      cv <- cv.glmnet(x=train, y=phenoTrain, alpha=a)
      model <- glmnet(x=train, y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, test, type = "response")
      curAcc[i] <- r2(phenoTest, pred)
    }
    acc[k] <- mean(curAcc)
  }
  # return the candidate itself rather than a value parsed back from a name
  alpha[which.max(acc)]
}
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
47
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# Coefficient of determination between observed and predicted values.
#
# Args:
#   target: observed values.
#   prediction: predicted values, aligned with target.
#
# Returns:
#   1 minus the ratio of the residual sum of squares to the total sum of
#   squares of target around its mean.
r2 <- function(target, prediction) {
  residual <- target - prediction
  deviation <- target - mean(target)
  1 - sum(residual^2) / sum(deviation^2)
}
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
53 ################################## main function ###########################
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
54
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# Fit (or cross-validate) a glmnet model of phenotype on genotype.
#
# Args:
#   genotype: predictor table, one row per individual (coerced with as.matrix).
#   phenotype: response vector, one value per row of genotype.
#   evaluation: if TRUE, predict every fold from a model trained on the other
#     folds and save the predictions; if FALSE, fit one model on all the data
#     and save it.
#   outFile: output path without extension; ".rds" is appended.
#   folds: list of integer vectors, each giving the row indices of one test
#     fold. Only used when evaluation is TRUE.
#   alpha: glmnet mixing parameter; when NULL it is chosen by optimize() among
#     seq(0, 1, 0.1).
#
# Side effects:
#   Writes "<outFile>.rds": a list with one vector of predictions per fold
#   (evaluation), or the fitted glmnet model (no evaluation).
lassoSelection <- function(genotype, phenotype, evaluation = TRUE, outFile, folds, alpha=NULL) {
  # go for optimization
  if(is.null(alpha)) {
    alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = seq(0,1,0.1))
  }
  # evaluation
  if(evaluation) {
    prediction <- vector("list", length(folds))
    for(i in seq_along(folds)) {
      indexTest <- folds[[i]]
      # drop = FALSE keeps a matrix even for a fold holding a single individual
      train <- as.matrix(genotype[-indexTest, , drop = FALSE])
      test <- as.matrix(genotype[indexTest, , drop = FALSE])
      phenoTrain <- phenotype[-indexTest]
      cv <- cv.glmnet(x=train, y=phenoTrain, alpha=alpha)
      lasso.fit <- glmnet(x=train, y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
      prediction[[i]] <- predict(lasso.fit, test, type = "response")[,1]
    }
    saveRDS(prediction, file=paste0(outFile, ".rds"))
  # just create a model
  } else {
    x <- as.matrix(genotype)
    cv <- cv.glmnet(x=x, y=phenotype, alpha=alpha)
    model <- glmnet(x=x, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
    saveRDS(model, file = paste0(outFile, ".rds"))
  }
}
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
81
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
82 ############################ main #############################
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
83 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
84 # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
85 ## -i : path to the file that contains the genotypes, must be a .rda file (as output by loadGenotype).
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
86 # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used)
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
87
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
88 ## -p : file that contains the phenotype, must be a .rda file (as output by loadGenotype.R).
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
89 # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
90
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
91 ## -e : do evaluation instead of producing a model
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
92
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
93 ## -f : index of the folds to which belong each individual
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
94 # please note that the list must be called "folds" (automatic if folds.R was used)
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
95
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
96 ## -o : path to the output folder and generic name of the analysis. The extension related to each type of
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
97 # files are automatically added
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
98
dcc10adbe46b Uploaded
nicolas
parents:
diff changeset
# The first command-line argument is an R file that is sourced. The variables
# read below (doEvaluation, folds, alpha, genotype, phenotype, out) are not
# defined anywhere else in this script, so they are presumably set by that
# file -- confirm against the wrapper that generates it.
cmd <- commandArgs(TRUE)
if(length(cmd) < 1) {
  stop("missing argument: path to the R configuration file", call. = FALSE)
}
source(cmd[1])
# check if evaluation is required
evaluation <- FALSE
if(as.integer(doEvaluation) == 1) {
  evaluation <- TRUE
  # the first line of the file named by 'folds' is the path of the .rds file
  # that holds the folds
  folds <- readLines(con = folds, n = 1, ok = TRUE)
  folds <- readRDS(folds)
}
# load classifier parameters (-1 means "optimize alpha")
if(as.numeric(alpha) == -1) {alpha <- NULL}
# load genotype and phenotype
# the first line of the file named by 'genotype' is the path of the
# tab-separated genotype table
genotype <- readLines(con = genotype, n = 1, ok = TRUE)
genotype <- read.table(genotype, sep="\t", header = TRUE)
# phenotype is written as a table (in columns) but glmnet needs a vector
phenotype <- read.table(phenotype, sep="\t", header = TRUE)[,1]
# run !
lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype,
               evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
# print the path of the produced file
cat(paste0(out, ".rds", "\n"))