9
|
1 ########################################################
|
|
2 #
|
|
3 # creation date : 08/01/16
|
|
4 # last modification : 01/09/16
|
|
5 # author : Dr Nicolas Beaume
|
|
6 # owner : IRRI
|
|
7 #
|
|
8 ########################################################
|
|
9
|
35
|
10 suppressWarnings(suppressMessages(library(glmnet)))
|
9
|
11 library(methods)
|
|
12 ############################ helper functions #######################
|
|
13
|
|
14
|
35
|
15 # optimize alpha parameter
|
|
16 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) {
|
9
|
17 acc <- NULL
|
|
18 indexAlpha <- 1
|
|
19 for(a in alpha) {
|
|
20 curAcc <- NULL
|
35
|
21 # repeat nfolds time each analysis
|
|
22 for(i in 1:repet) {
|
|
23 # draw at random 1/3 of the training set for testing and thus choose alpha
|
|
24 # note it is not a cross-validation
|
9
|
25 n <- ceiling(nrow(genotype)/3)
|
|
26 indexTest <- sample(1:nrow(genotype), size=n)
|
35
|
27 # create training set and test set
|
9
|
28 train <- genotype[-indexTest,]
|
|
29 test <- genotype[indexTest,]
|
|
30 phenoTrain <- phenotype[-indexTest]
|
|
31 phenoTest <- phenotype[indexTest]
|
35
|
32 # cv.glmnet allow to compute lambda at the current alpha
|
9
|
33 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
|
35
|
34 # create model
|
9
|
35 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
|
35
|
36 # predict test set
|
9
|
37 pred <- predict(model, test, type = "response")
|
35
|
38 # compute r2 for choosing the best alpha
|
9
|
39 curAcc <- c(curAcc, r2(phenoTest, pred))
|
|
40 }
|
35
|
41 # add mean r2 for this value of lambda to the accuracy vector
|
9
|
42 acc <- c(acc, mean(curAcc))
|
|
43 }
|
35
|
44 # choose best alpha
|
9
|
45 names(acc) <- alpha
|
|
46 return(as.numeric(names(acc)[which.max(acc)]))
|
|
47 }
|
|
48
|
35
|
49 # compute r2 by computing the classic formula
|
|
50 # compare the sum of square difference from target to prediciton
|
|
51 # to the sum of square difference from target to the mean of the target
|
9
|
52 r2 <- function(target, prediction) {
|
|
53 sst <- sum((target-mean(target))^2)
|
|
54 ssr <- sum((target-prediction)^2)
|
|
55 return(1-ssr/sst)
|
|
56 }
|
|
57 ################################## main function ###########################
|
|
58
|
35
|
59 lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) {
|
9
|
60 # go for optimization
|
|
61 if(is.null(alpha)) {
|
|
62 alpha <- seq(0,1,0.1)
|
|
63 alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha)
|
|
64 }
|
|
65 # evaluation
|
|
66 if(evaluation) {
|
|
67 prediction <- NULL
|
35
|
68 # do cross-validation
|
9
|
69 for(i in 1:length(folds)) {
|
35
|
70 # create training and test set
|
9
|
71 train <- genotype[-folds[[i]],]
|
|
72 test <- genotype[folds[[i]],]
|
|
73 phenoTrain <- phenotype[-folds[[i]]]
|
|
74 phenoTest <- phenotype[folds[[i]]]
|
35
|
75 # cv.glmnet helps to compute the right lambda for a chosen alpha
|
9
|
76 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha)
|
35
|
77 # create model
|
9
|
78 lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
|
35
|
79 # predict value of the test set for further evaluation
|
9
|
80 prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1]))
|
|
81 }
|
35
|
82 # save predicted value for test set of each fold for further evaluation
|
9
|
83 saveRDS(prediction, file=paste(outFile,".rds", sep=""))
|
|
84 # just create a model
|
|
85 } else {
|
35
|
86 # cv.glmnet helps to compute the right lambda for a chosen alpha
|
9
|
87 cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha)
|
35
|
88 # create model
|
9
|
89 model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
|
35
|
90 # save model
|
9
|
91 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
|
|
92 }
|
|
93 }
|
|
94
|
|
95 ############################ main #############################
|
35
|
96 # load argument
|
9
|
97 cmd <- commandArgs(T)
|
|
98 source(cmd[1])
|
|
99 # check if evaluation is required
|
|
100 evaluation <- F
|
|
101 if(as.integer(doEvaluation) == 1) {
|
|
102 evaluation <- T
|
|
103 con = file(folds)
|
|
104 folds <- readLines(con = con, n = 1, ok=T)
|
|
105 close(con)
|
|
106 folds <- readRDS(folds)
|
|
107 }
|
|
108 # load classifier parameters
|
35
|
109 alpha <- as.numeric(alpha)
|
|
110 if(alpha < 0 | alpha > 1) {alpha <- NULL}
|
9
|
111 # load genotype and phenotype
|
|
112 con = file(genotype)
|
|
113 genotype <- readLines(con = con, n = 1, ok=T)
|
|
114 close(con)
|
|
115 genotype <- read.table(genotype, sep="\t", h=T)
|
|
116 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
|
|
117 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
|
|
118 # run !
|
35
|
119 lasso(genotype = data.matrix(genotype), phenotype = phenotype,
|
9
|
120 evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
|
35
|
121 # return path of the result file to galaxy
|
9
|
122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
|