# HG changeset patch
# User nicolas
# Date 1477658656 14400
# Node ID 8cc5a7448ca6e2561ed3eb54eb7dbc269eeaf8fe
# Parent 55449ced23d5a1fd8a9bcacd89619862ab43cc5a
Deleted selected files
diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.R
--- a/computeR2.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-########################################################
-#
-# creation date : 27/06/16
-# last modification : 22/10/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-computeR2 <- function(target, prediction) {
- sst <- sum((target-mean(target))^2)
- ssr <- sum((target-prediction)^2)
- return(1-ssr/sst)
-}
-############################ main #############################
-# extract argument
-cmd <- commandArgs(trailingOnly = T)
-source(cmd[1])
-# load target and prediction
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-predicted <- read.table(predicted, sep = "\t", h=T)[,2]
-# compute r2
-cat(computeR2(phenotype, predicted), file=out)
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.xml
--- a/computeR2.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-
- compute R-squared value of a prediction vs true phenotype
-
- computeR2.R $config
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${predictedPhenotype}" -> predicted
-"${phenotype}" -> phenotype
-"${r2}" -> out
-
-
-
-
-
-
-
-
-
- compute R2 of prediction vs true phenotype
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.R
--- a/encode.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-########################################################
-#
-# creation date : 04/01/16
-# last modification : 17/09/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-############################ helper functions #######################
-
-# encode one position in one individual
-encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
- res <- x
- if(!is.na(x)) {
- if(isHeterozygous(x, sep = sep)) {
- # heterozygous
- res <- code[2]
- } else {
- # determine whether it is the minor or major allele
- x <- unlist(strsplit(x, sep))
- # need to check only one element as we already know it is a homozygous
- if(length(x) > 1) {
- x <- x[1]
- }
- if(x==major) {
- res <- code[3]
- } else {
- res <- code[1]
- }
- }
- } else {
- # keep NA as NA
- res <- NA
- }
- return(res)
-}
-
-# rewrite a marker to make an exact count of the allele for the current marker
-encodeGenotype.rewrite <- function(x, sep=""){
- res <- x
- if(!is.na(x)) {
- if(length(unlist(strsplit(x,sep)))==1) {
- # in case of homozygous, must be counted 2 times so the caracter is written 2 times
- res <- c(x,x)
- } else {
- # heterozygous
- res <- unlist(strsplit(x, split=sep))
- }
- } else {
- res <- NA
- }
- return(res)
-}
-
-# encode one individual
-encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
- # rewrite genotype to make sure everything is written as a double caracter value
- newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
- # compute the occurcence of each genotype to determine major an minor allele
- stat <- table(as.character(newIndiv))
- major <- names(stat)[which.max(stat)]
- # Encode using the appropriate code
- indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
- return(indiv)
-}
-
-# determine if the genotype is an homozygous or heterozygous one
-# genotype must be written with two characters, even homozygous
-# (see encodeGenotype.rewrite() function )
-isHeterozygous <- function(genotype, sep=""){
- bool <- F
- # case of NA, can't be determined
- if(is.na(genotype)){
- bool <- NA
- } else {
- # check whether both element of the genotype are the same or not
- x <- unlist(strsplit(genotype, sep))
- if(length(x) > 1 & !(x[1] %in% x[2])) {
- bool <- T
- }
-
- }
- return(bool)
-}
-
-# check if encoding has been made properly. return a boolean vector
-# which has the same length that the number of columns of the input matrix
-# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
-# please note that this function is not used in the current tool and let
-# by convinience for being used outside of galaxy
-checkEncoding <- function(encoded, code=c(0,1,2)) {
- check <- NULL
- for(i in 1:ncol(encoded)) {
- # find major an minor allele
- major <- length(which(encoded[,i]==code[3]))
- minor <- length(which(encoded[,i]==code[1]))
- # comaprison
- if(major >= minor) {
- check <- c(check, T)
- } else {
- check <- c(check, F)
- }
- }
- return(check)
-}
-
-################################## main function ###########################
-# encode all individuals
-encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
- # encode genotype
- encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
- # set all NA to -1 (thus encoding schems with -1 are not allowed)
- encoded[is.na(encoded)] <- -1
- write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
-}
-
-############################ main #############################
-# load argument from xml file
-cmd <- commandArgs(T)
-source(cmd[1])
-# load genotype
-genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
-code <- c(0,1,2)
-encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
-cat(paste(out,".csv", "\n", sep=""))
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.xml
--- a/encode.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-
- encode genotypes for further machine learning analysis
-
- encode.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${separator}" -> sep
-"${encodedPath}" -> out
-
-
-
-
-
-
-
-
-
- Takes the genotype and encode them in the desired schem (usually 3 integer like 1/2/3) for minor homozygous/heterozygous/major homozygous. The output is a .csv file that may be use by other tools in the OGHMA workflow.
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.R
--- a/evaluation.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,95 +0,0 @@
-
-########################################################
-#
-# creation date : 08/01/16
-# last modification : 23/06/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-# compute correlation
-predictionCorrelation <- function(prediction, obs) {
- return(cor(prediction, obs, method = "pearson"))
-}
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-r2 <- function(target, prediction) {
- sst <- sum((target-mean(target))^2)
- ssr <- sum((target-prediction)^2)
- return(1-ssr/sst)
-}
-
-################################## main function ###########################
-
-evaluatePredictions <- function(data, prediction=NULL, traitValue=NULL, doR2=F,
- folds=NULL, classes=NULL, doCor=F) {
- eval <- NULL
- # case of r2
- if(doR2) {
- eval <- c(eval, list(r2=NULL))
- }
- if(doCor) {
- eval <- c(eval, list(cor=NULL))
- }
- for (i in 1:length(folds)) {
- train <- unlist(folds[-i])
- test <- folds[[i]]
- if(doR2) {
- if(!is.null(traitValue)) {
- eval$r2 <- c(eval$r2, r2(traitValue[test], prediction[[i]]))
- } else {
- eval$r2 <- c(eval$r2, NA)
- }
- }
- # case of correlation
- if(doCor) {
- if(!is.null(traitValue)) {
- eval$cor <- c(eval$cor, cor(traitValue[test], prediction[[i]]))
- } else {
- eval$cor <- c(eval$cor, NA)
- }
- }
- }
- # print output
- print(eval)
- if(doR2) {
- print(paste("mean r2 : ", mean(eval$r2), "standard deviation :", sd(eval$r2)))
- }
- if(doCor) {
- print(paste("mean correlation : ", mean(eval$cor), "standard deviation :", sd(eval$cor)))
- }
-}
-############################ main #############################
-
-# load arguments
-cmd <- commandArgs(trailingOnly = T)
-source(cmd[1])
-# set which evaluation are used
-if(as.integer(doR2) == 1) {
- doR2 <- T
-}
-if(as.integer(doCor) == 1) {
- doCor <- T
-}
-# load genotype & phenotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t",h=T)
-phenotype <- read.table(phenotype, sep="\t",h=T)[,1]
-# load prediction
-con = file(prediction)
-prediction <- readLines(con = con, n = 1, ok=T)
-close(con)
-prediction <- readRDS(prediction)
-# load folds
-con = file(folds)
-folds <- readLines(con = con, n = 1, ok=T)
-close(con)
-folds <- readRDS(folds)
-# evaluation
-evaluatePredictions(data=genotype, prediction=prediction, traitValue=phenotype, doR2=doR2,
- folds=folds, doCor=doCor)
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.xml
--- a/evaluation.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-
- evaluate results of classifiers prediction
-
- evaluation.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${phenotype}" -> phenotype
-"${prediction}" -> prediction
-"${r2}" -> doR2
-"${cor}" -> doCor
-"${folds}" -> folds
-
-
-
-
-
-
-
-
- evaluate the predictions of classifiers. For now correlation between true valeus and predictions, and r-squared distance are implemeted
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.R
--- a/folds.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-########################################################
-#
-# creation date : 05/01/16
-# last modification : 27/06/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-
-############################ main function #######################
-
-# create fold by picking at random row indexes
-createFolds <- function(nbObs, n) {
- # pick indexes
- index <- sample(1:n, size=nbObs, replace = T)
- # populate folds
- folds <- NULL
- for(i in 1:n) {
- folds <- c(folds, list(which(index==i)))
- }
- return(folds)
-}
-
-############################ main #############################
-# load arguments
-cmd <- commandArgs(trailingOnly = T)
-source(cmd[1])
-# load data and merge them
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-# fold creation
-nObs <- nrow(read.table(genotype, sep="\t", h=T))
-folds <- createFolds(nObs, as.numeric(n))
-# save them into a rds and send back to galaxy the path
-out <- paste(out,".rds",sep="")
-saveRDS(folds, file=out)
-cat(paste(out, "\n", sep=""))
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.xml
--- a/folds.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,41 +0,0 @@
-
- create folds for classifiers evaluation through cross-validation
-
- folds.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${n}" -> n
-"${foldsFile}" -> out
-
-
-
-
-
-
-
-
-
- Takes the genotypes and use it to determine the different folds for further cross-valisations
- return a rda file that contains a list of indexes of the genotype, each element of the list is a fold
- the list is made to be used by other tools of the OGHMA suite.
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.R
--- a/lasso.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,122 +0,0 @@
-########################################################
-#
-# creation date : 08/01/16
-# last modification : 01/09/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-suppressWarnings(suppressMessages(library(glmnet)))
-library(methods)
-############################ helper functions #######################
-
-
-# optimize alpha parameter
-optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) {
- acc <- NULL
- indexAlpha <- 1
- for(a in alpha) {
- curAcc <- NULL
- # repeat nfolds time each analysis
- for(i in 1:repet) {
- # draw at random 1/3 of the training set for testing and thus choose alpha
- # note it is not a cross-validation
- n <- ceiling(nrow(genotype)/3)
- indexTest <- sample(1:nrow(genotype), size=n)
- # create training set and test set
- train <- genotype[-indexTest,]
- test <- genotype[indexTest,]
- phenoTrain <- phenotype[-indexTest]
- phenoTest <- phenotype[indexTest]
- # cv.glmnet allow to compute lambda at the current alpha
- cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
- # create model
- model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
- # predict test set
- pred <- predict(model, test, type = "response")
- # compute r2 for choosing the best alpha
- curAcc <- c(curAcc, r2(phenoTest, pred))
- }
- # add mean r2 for this value of lambda to the accuracy vector
- acc <- c(acc, mean(curAcc))
- }
- # choose best alpha
- names(acc) <- alpha
- return(as.numeric(names(acc)[which.max(acc)]))
-}
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-r2 <- function(target, prediction) {
- sst <- sum((target-mean(target))^2)
- ssr <- sum((target-prediction)^2)
- return(1-ssr/sst)
-}
-################################## main function ###########################
-
-lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) {
- # go for optimization
- if(is.null(alpha)) {
- alpha <- seq(0,1,0.1)
- alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha)
- }
- # evaluation
- if(evaluation) {
- prediction <- NULL
- # do cross-validation
- for(i in 1:length(folds)) {
- # create training and test set
- train <- genotype[-folds[[i]],]
- test <- genotype[folds[[i]],]
- phenoTrain <- phenotype[-folds[[i]]]
- phenoTest <- phenotype[folds[[i]]]
- # cv.glmnet helps to compute the right lambda for a chosen alpha
- cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha)
- # create model
- lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
- # predict value of the test set for further evaluation
- prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1]))
- }
- # save predicted value for test set of each fold for further evaluation
- saveRDS(prediction, file=paste(outFile,".rds", sep=""))
- # just create a model
- } else {
- # cv.glmnet helps to compute the right lambda for a chosen alpha
- cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha)
- # create model
- model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
- # save model
- saveRDS(model, file = paste(outFile, ".rds", sep = ""))
- }
-}
-
-############################ main #############################
-# load argument
-cmd <- commandArgs(T)
-source(cmd[1])
-# check if evaluation is required
-evaluation <- F
-if(as.integer(doEvaluation) == 1) {
- evaluation <- T
- con = file(folds)
- folds <- readLines(con = con, n = 1, ok=T)
- close(con)
- folds <- readRDS(folds)
-}
-# load classifier parameters
-alpha <- as.numeric(alpha)
-if(alpha < 0 | alpha > 1) {alpha <- NULL}
-# load genotype and phenotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# run !
-lasso(genotype = data.matrix(genotype), phenotype = phenotype,
- evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
-# return path of the result file to galaxy
-cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.xml
--- a/lasso.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-
- predict phenotype using a LASSO approach
-
- lasso.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${phenotype}" -> phenotype
-"${eval}" -> doEvaluation
-"${folds}" -> folds
-"${model}" -> out
-"${alpha}" -> alpha
-
-
-
-
-
-
-
-
-
- make the classification using the LASSO method
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.R
--- a/plotPrediction.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-########################################################
-#
-# creation date : 07/06/16
-# last modification : 07/06/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-library("miscTools")
-# scatterplot of the prediction vs target
-r2.plot <- function(true, predicted) {
- # the scatterplot
- plot(true, predicted, xlab="trait value", ylab="predicted value", main="", pch=16,
- ylim=c(min(min(true), min(predicted)), max(max(true), max(predicted))))
- # add a red lines with ideal case
- lines(true, true, col="red")
-}
-
-############################ main #############################
-# load argument
-cmd <- commandArgs(trailingOnly = T)
-source(cmd[1])
-# load prediction and target
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-predicted <- read.table(predicted, sep = "\t", h=T)[,2]
-# plot in a pdf that will be available in galaxy history panel
-pdf(out)
-r2.plot(phenotype, predicted = predicted)
-dev.off()
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.xml
--- a/plotPrediction.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-
- shows scatterplot of a prediction vs true phenotype
-
- plotPrediction.R $config
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${predictedPhenotype}" -> predicted
-"${phenotype}" -> phenotype
-"${r2}" -> out
-
-
-
-
-
-
-
-
-
- draw R2 of prediction vs true phenotype
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.R
--- a/prediction.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-########################################################
-#
-# creation date : 26/01/16
-# last modification : 02/06/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-
-library(rrBLUP)
-suppressWarnings(suppressMessages(library(randomForest)))
-library(e1071)
-suppressWarnings(suppressMessages(library(glmnet)))
-library(methods)
-
-
-############################ main #############################
-classifierNames <- c("list", "randomForest", "svm", "glmnet")
-# load argument
-cmd <- commandArgs(trailingOnly = T)
-source(cmd[1])
-# load data
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-con = file(model)
-model <- readLines(con = con, n = 1, ok=T)
-close(con)
-model <- readRDS(model)
-# check if the classifier name is valid
-if(all(is.na(match(class(model), classifierNames)))) {
- stop(paste(class(model), "is not recognized as a valid model. Valid models are : ", classifierNames))
-}
-# run prediction according to the classifier
-# rrBLUP prediction
-if(any(class(model) == "list")) {
- predictions <- as.matrix(genotype) %*% as.matrix(model$u)
- predictions <- predictions[,1]+model$beta
- predictions <- data.frame(lines=rownames(genotype), predictions=predictions)
-# LASSO prediction
-} else if(any(class(model) == "glmnet")) {
- predictions <- predict(model, as.matrix(genotype), type = "response")
- predictions <- data.frame(lines=rownames(genotype), predictions=predictions)
-# SVM or RandomForest prediction (predict is a wrapper for many machine learning function)
-} else {
- predictions <- predict(model, genotype)
- predictions <- data.frame(lines=names(predictions), predictions=predictions)
-}
-# save results
-write.table(predictions, file=out, sep="\t", row.names = F)
-
diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.xml
--- a/prediction.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-
- use machine learning model to predict phenotype from genotype
-
- prediction.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${model}" -> model
-"${output1}" -> out
-
-
-
-
-
-
-
-
-
- Use model designed by any clasifier tool from the OGHMA suite to predict the phenotype of a dataset provided as input. results are stored in a folder under a .rda file
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.R
--- a/randomForest.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-########################################################
-#
-# creation date : 07/01/16
-# last modification : 25/10/16
-# author : Dr Nicolas Beaume
-#
-########################################################
-
-suppressWarnings(suppressMessages(library(randomForest)))
-############################ helper functions #######################
-# optimize
-optimize <- function(genotype, phenotype, ntree=1000,
- rangeMtry=seq(ceiling(ncol(genotype)/5),
- ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)),
- repet=3) {
- # accuracy over all mtry values
- acc <- NULL
- for(mtry in rangeMtry) {
- # to compute the mean accuracy over repetiotion for the current mtry value
- tempAcc <- NULL
- for(i in 1:repet) {
- # 1/3 of the dataset is used as test set
- n <- ceiling(nrow(genotype)/3)
- indexTest <- sample(1:nrow(genotype), size=n)
- # create training and test set
- train <- genotype[-indexTest,]
- test <- genotype[indexTest,]
- phenoTrain <- phenotype[-indexTest]
- phenoTest <- phenotype[indexTest]
- # compute model
- model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry)
- # predict on test set and compute accuracy
- pred <- predict(model, test)
- tempAcc <- c(tempAcc, r2(phenoTest, pred))
- }
- # find mean accuracy for the current mtry value
- acc <- c(acc, mean(tempAcc))
- }
- # return mtry for the best accuracy
- names(acc) <- rangeMtry
- bestParam <- which.max(acc)
- return(rangeMtry[bestParam])
-}
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-r2 <- function(target, prediction) {
- sst <- sum((target-mean(target))^2)
- ssr <- sum((target-prediction)^2)
- return(1-ssr/sst)
-}
-################################## main function ###########################
-rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) {
-
- # go for optimization
- if(is.null(mtry)) {
- # find best mtry
- mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100))
- mtry <- optimize(genotype=genotype, phenotype=phenotype,
- ntree = ntree, rangeMtry = mtry)
- }
- # evaluation
- if(evaluation) {
- prediction <- NULL
- for(i in 1:length(folds)) {
- # create training and testing set for the current fold
- train <- genotype[-folds[[i]],]
- test <- genotype[folds[[i]],]
- phenoTrain <- phenotype[-folds[[i]]]
- # compute model
- rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree)
- # predict and save prediction for the current fold
- prediction <- c(prediction, list(predict(rf, test)))
- }
- # save preductions for all folds to be used for evaluation
- saveRDS(prediction, file = paste(outFile, ".rds", sep = ""))
- } else {
- # just compute the model and save it
- model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree)
- saveRDS(model, file = paste(outFile, ".rds", sep = ""))
- }
-}
-
-
-############################ main #############################
-# load parameters
-cmd <- commandArgs(T)
-source(cmd[1])
-# load classifier parameters
-mtry <- as.numeric(mtry)
-ntree <- as.numeric(ntree)
-if(mtry == -1) {mtry <- NULL}
-# check if evaluation is required
-evaluation <- F
-if(as.integer(doEvaluation) == 1) {
- evaluation <- T
- con = file(folds)
- folds <- readLines(con = con, n = 1, ok=T)
- close(con)
- folds <- readRDS(folds)
-}
-# load genotype and phenotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# run !
-rfSelection(genotype = data.matrix(genotype), phenotype=phenotype,
- evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree)
-# send the path containing results to galaxy
-cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.xml
--- a/randomForest.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-
- predict phenotype using a random forest approach
-
- randomForest.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${phenotype}" -> phenotype
-"${eval}" -> doEvaluation
-"${folds}" -> folds
-"${model}" -> out
-"${mtry}" -> mtry
-"${ntree}" -> ntree
-
-
-
-
-
-
-
-
-
- make the classification using the random forest method
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.R
--- a/rrBLUP.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,71 +0,0 @@
-
-########################################################
-#
-# creation date : 05/01/16
-# last modification : 25/10/16
-# author : Dr Nicolas Beaume
-#
-########################################################
-
-
-library(rrBLUP)
-############################ helper functions #######################
-
-
-################################## main function ###########################
-# do rrBLUP evaluation of classification.
-# optimization of paramaters is included in rrBLUP package
-rrBLUP <- function(genotype, phenotype, outFile, evaluation=F, folds) {
- # Evaluation mode
- if(evaluation) {
- prediction <- NULL
- # run over folds
- for(i in 1:length(folds)) {
- # create training and test set for this fold
- train <- genotype[-folds[[i]],]
- test <- genotype[folds[[i]],]
- phenoTrain <- phenotype[-folds[[i]]]
- phenoTest <- phenotype[folds[[i]]]
- # create model
- model <-mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F)
- # predict current test set
- pred <- as.matrix(test) %*% as.matrix(model$u)
- pred <- pred[,1]+model$beta
- prediction <- c(prediction, list(pred))
- }
- # save results
- saveRDS(prediction, file=paste(outFile,".rds", sep=""))
- # just create a model
- } else {
- # create and save modle
- model <-mixed.solve(phenotype, Z=genotype,K=NULL, SE=F,return.Hinv = F)
- saveRDS(model, file = paste(outFile, ".rds", sep = ""))
- }
-}
-
-
-############################ main #############################
-# get argument from xml file
-cmd <- commandArgs(T)
-source(cmd[1])
-# for evaluation mode : set evaluation as True and load fold file
-if(as.integer(evaluation) == 1) {
- evaluation <- T
- con = file(folds)
- folds <- readLines(con = con, n = 1, ok=T)
- close(con)
- folds <- readRDS(folds)
-} else{
- evaluation <- F
-}
-# load genotype and phenotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# run !
-rrBLUP(genotype = genotype, phenotype = phenotype, outFile = out,
- evaluation = evaluation, folds = folds)
-# return path of the result to galaxy
-cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.xml
--- a/rrBLUP.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-
- predict phenotype using a rrBLUP approach
-
- rrBLUP.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${phenotype}" -> phenotype
-"${model}" -> out
-"${eval}" -> evaluation
-"${folds}" -> folds
-
-
-
-
-
-
-
-
-
-
- make the classification using the rrBLUP method
-
-
\ No newline at end of file
diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.R
--- a/svm.R Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,167 +0,0 @@
-########################################################
-#
-# creation date : 07/01/16
-# last modification : 03/07/16
-# author : Dr Nicolas Beaume
-# owner : IRRI
-#
-########################################################
-library("e1071")
-options(warn=-1)
-############################ helper functions #######################
-# optimize svm parameters
-optimizeSVM <- function(train, target, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) {
- # tuning parameters then train
- model <- NULL
- if(is.null(g)){g <- 10^(-6:0)}
- if(is.null(c)){c <- 10^(-1:0)}
- # optimize parameter for the kernel in use
- switch(kernel,
- # sigmoid kernel : need gamma, cost and coef
- sigmoid={
- if(is.null(coef)){coef <- 0:3};
- # optimize then extract best parameters
- tune <- tune.svm(train, target, gamma = g, cost = 10^(0:2), kernel="sigmoid", coef0 = coef);
- g <- tune$best.parameters[[1]];
- c <- tune$best.parameters[[3]];
- coef <- tune$best.parameters[[2]];
- # compute model
- model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
- # linear kernel, only cost is required
- linear={
- # optimize then extract best parameters
- tune <- tune.svm(train, target, cost = c, kernel="linear");
- c <- tune$best.parameters[[1]];
- # compute model
- model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "linear")},
- # polynomial kernel, use degree, gamma, cost and coef as parameters
- polynomial={
- # optimize and extract best parameters
- if(is.null(coef)){coef <- 0:3};
- if(is.null(d)){d <- 0:4};
- tune <- tune.svm(train, target, gamma = g, cost = c, degree = d, coef0 = coef, kernel="polynomial");
- d <- tune$best.parameters[[1]];
- g <- tune$best.parameters[[2]];
- coef <- tune$best.parameters[[3]];
- c <- tune$best.parameters[[4]];
- # compute model
- model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
- # default : radial kernel, use gamma and cost as parameters
- {
- # optimize and extract parameters
- tune <- tune.svm(train, target, gamma = g, cost = c, kernel="radial");
- g <- tune$best.parameters[[1]];
- c <- tune$best.parameters[[2]];
- # compute model
- model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
- )
- return(model)
-}
-################################## main function ###########################
-svmClassifier <- function(genotype, phenotype, evaluation = T, outFile, folds, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) {
- # optimize classifier if any parameter is NULL
- switch(kernel,
- # sigmoid kernel : need gamma, cost and coef
- sigmoid={
- if(any(is.null(c(coef, c, g)))){
- fit <- optimizeSVM(genotype, phenotype, kernel = "sigmoid",
- g = g, c=c, coef = coef);
- c <- fit$cost;
- g <- fit$gamma;
- coef <- fit$coef0;
- }
- },
- # linear kernel, only cost is required
- linear={
- if(is.null(c)){fit <- optimizeSVM(genotype, phenotype, kernel = "linear", c=c);
- c <- fit$cost;
- }
- },
- # polynomial kernel, use degree, gamma, cost and coef as parameters
- polynomial={
- if(any(is.null(c(coef, c, g, d)))){fit <- optimizeSVM(genotype, phenotype, kernel = "polynomial",
- g = g, c=c, coef = coef, d = d);
- c <- fit$cost;
- g <- fit$gamma;
- coef <- fit$coef0;
- d <- fit$degree
- }
- },
- # default : radial kernel, use gamma and cost as parameters
- {if(any(is.null(c(c, g)))){fit <- optimizeSVM(genotype, phenotype, kernel = "radial",
- g = g, c=c);
- c <- fit$cost;
- g <- fit$gamma;
- }
- }
- )
- # do evaluation
- if(evaluation) {
- prediction <- NULL
- # iterate over folds
- for(i in 1:length(folds)) {
- # prepare indexes for training and test
- test <- folds[[i]]
- train <- unlist(folds[-i])
- # compute model
- svm.fit <- optimizeSVM(train = genotype[train,], target = phenotype[train], kernel=kernel,
- g=g, c=c, coef=coef, d=d)
- # predict on test set of the current fold
- prediction <- c(prediction, list(predict(svm.fit, genotype[test,])))
- }
- # save all prediction for further evaluation
- saveRDS(prediction, file=paste(outFile, ".rds", sep = ""))
- } else {
- # compute model and save it
- switch(kernel,
- # sigmoid kernel : need gamma, cost and coef
- sigmoid={
- model <- svm(x = genotype, y = phenotype, kernel="sigmoid", gamma =g,
- cost =c, coef0=coef)
- },
- # linear kernel, only cost is required
- linear={
- model <- svm(x = genotype, y = phenotype, kernel="linear", cost =c)
- },
- # polynomial kernel, use degree, gamma, cost and coef as parameters
- polynomial={
- model <- svm(x = genotype, y = phenotype, kernel="polynomial", gamma =g, cost =c,
- coef0=coef, degree =d)
- },
- # default : radial kernel, use gamma and cost as parameters
- { model <- svm(x = genotype, y = phenotype, kernel="radial", gamma =g, cost =c)
- })
- saveRDS(model, file=paste(outFile, ".rds", sep = ""))
- }
-}
-
-############################ main #############################
-# load argument
-cmd <- commandArgs(T)
-source(cmd[1])
-# check for svm paramater, especially to know if optimization is requiered
-if(as.numeric(g) == -1) {g <- NULL}
-if(as.numeric(c) == -1) {c <- NULL}
-if(as.numeric(coef) == -1) {coef <- NULL}
-if(as.numeric(d) == -1) {d <- NULL}
-# check if evaluation is required
-evaluation <- F
-if(as.integer(doEvaluation) == 1) {
- evaluation <- T
- con = file(folds)
- folds <- readLines(con = con, n = 1, ok=T)
- close(con)
- folds <- readRDS(folds)
-}
-# load genotype and phenotype
-con = file(genotype)
-genotype <- readLines(con = con, n = 1, ok=T)
-close(con)
-genotype <- read.table(genotype, sep="\t", h=T)
-# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# run !
-svmClassifier(genotype = genotype, phenotype = phenotype,
- evaluation = evaluation, outFile = out, folds = folds, g=g, c=c, coef=coef, d=d, kernel=kernel)
-# retunr path of the result file to galaxy
-cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.xml
--- a/svm.xml Wed Oct 26 19:16:22 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-
- predict phenotype using a SVM approach
-
- svm.R $config > ${output1}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## Desc: this file is sourced in encode wrapper script
-## as means to pass all galaxy params to R
-"${genotype}" -> genotype
-"${phenotype}" -> phenotype
-"${eval}" -> doEvaluation
-"${folds}" -> folds
-"${model}" -> out
-"${kernel}" -> kernel
-"${gamma}" -> g
-"${cost}" -> c
-"${coeficient}" -> coef
-"${degree}" -> d
-
-
-
-
-
-
-
-
-
- make the classification using the SVM method
-
-
\ No newline at end of file