# HG changeset patch # User nicolas # Date 1477658656 14400 # Node ID 8cc5a7448ca6e2561ed3eb54eb7dbc269eeaf8fe # Parent 55449ced23d5a1fd8a9bcacd89619862ab43cc5a Deleted selected files diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.R --- a/computeR2.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -######################################################## -# -# creation date : 27/06/16 -# last modification : 22/10/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -computeR2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -############################ main ############################# -# extract argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load target and prediction -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -predicted <- read.table(predicted, sep = "\t", h=T)[,2] -# compute r2 -cat(computeR2(phenotype, predicted), file=out) \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.xml --- a/computeR2.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - compute R-squared value of a prediction vs true phenotype - - computeR2.R $config - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${predictedPhenotype}" -> predicted -"${phenotype}" -> phenotype -"${r2}" -> out - - - - - - - - - - compute R2 of prediction vs true phenotype - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.R --- a/encode.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -######################################################## -# -# creation date : 04/01/16 -# last modification : 17/09/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -############################ helper functions ####################### - -# encode one position in one individual -encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ - res <- x - if(!is.na(x)) { - if(isHeterozygous(x, sep = sep)) { - # heterozygous - res <- code[2] - } else { - # determine whether it is the minor or major allele - x <- unlist(strsplit(x, sep)) - # need to check only one element as we already know it is a homozygous - if(length(x) > 1) { - x <- x[1] - } - if(x==major) { - res <- code[3] - } else { - res <- code[1] - } - } - } else { - # keep NA as NA - res <- NA - } - return(res) -} - -# rewrite a marker to make an exact count of the allele for the current marker -encodeGenotype.rewrite <- function(x, sep=""){ - res <- x - if(!is.na(x)) { - if(length(unlist(strsplit(x,sep)))==1) { - # in case of homozygous, must be counted 2 times so the caracter is written 2 times - res <- c(x,x) - } else { - # heterozygous - res <- unlist(strsplit(x, split=sep)) - } - } else { - res <- NA - } - return(res) -} - -# encode one individual -encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ - # rewrite genotype to make sure everything is written as a double caracter value - newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) - # compute the occurcence of each genotype to determine major an minor allele - stat <- table(as.character(newIndiv)) - major <- names(stat)[which.max(stat)] - # Encode using the appropriate code - indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) - return(indiv) -} - -# determine if the genotype is an homozygous or heterozygous one -# genotype must be written with two characters, even homozygous -# (see encodeGenotype.rewrite() function ) -isHeterozygous <- function(genotype, sep=""){ - bool <- F - # case of NA, can't be determined - if(is.na(genotype)){ - bool <- NA - } else { - # check whether both element of the genotype are the same or not - x <- unlist(strsplit(genotype, sep)) - if(length(x) > 1 & !(x[1] %in% x[2])) { - bool <- T - } - - } - return(bool) -} - -# check if encoding has been made properly. return a boolean vector -# which has the same length that the number of columns of the input matrix -# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise -# please note that this function is not used in the current tool and let -# by convinience for being used outside of galaxy -checkEncoding <- function(encoded, code=c(0,1,2)) { - check <- NULL - for(i in 1:ncol(encoded)) { - # find major an minor allele - major <- length(which(encoded[,i]==code[3])) - minor <- length(which(encoded[,i]==code[1])) - # comaprison - if(major >= minor) { - check <- c(check, T) - } else { - check <- c(check, F) - } - } - return(check) -} - -################################## main function ########################### -# encode all individuals -encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ - # encode genotype - encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) - # set all NA to -1 (thus encoding schems with -1 are not allowed) - encoded[is.na(encoded)] <- -1 - write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") -} - -############################ main ############################# -# load argument from xml file -cmd <- commandArgs(T) -source(cmd[1]) -# load genotype -genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) -code <- c(0,1,2) -encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) -cat(paste(out,".csv", "\n", sep="")) \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.xml --- a/encode.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - encode genotypes for further machine learning analysis - - encode.R $config > ${output1} - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${separator}" -> sep -"${encodedPath}" -> out - - - - - - - - - - Takes the genotype and encode them in the desired schem (usually 3 integer like 1/2/3) for minor homozygous/heterozygous/major homozygous. The output is a .csv file that may be use by other tools in the OGHMA workflow. - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.R --- a/evaluation.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - -######################################################## -# -# creation date : 08/01/16 -# last modification : 23/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -# compute correlation -predictionCorrelation <- function(prediction, obs) { - return(cor(prediction, obs, method = "pearson")) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} - -################################## main function ########################### - -evaluatePredictions <- function(data, prediction=NULL, traitValue=NULL, doR2=F, - folds=NULL, classes=NULL, doCor=F) { - eval <- NULL - # case of r2 - if(doR2) { - eval <- c(eval, list(r2=NULL)) - } - if(doCor) { - eval <- c(eval, list(cor=NULL)) - } - for (i in 1:length(folds)) { - train <- unlist(folds[-i]) - test <- folds[[i]] - if(doR2) { - if(!is.null(traitValue)) { - eval$r2 <- c(eval$r2, r2(traitValue[test], prediction[[i]])) - } else { - eval$r2 <- c(eval$r2, NA) - } - } - # case of correlation - if(doCor) { - if(!is.null(traitValue)) { - eval$cor <- c(eval$cor, cor(traitValue[test], prediction[[i]])) - } else { - eval$cor <- c(eval$cor, NA) - } - } - } - # print output - print(eval) - if(doR2) { - print(paste("mean r2 : ", mean(eval$r2), "standard deviation :", sd(eval$r2))) - } - if(doCor) { - print(paste("mean correlation : ", mean(eval$cor), "standard deviation :", sd(eval$cor))) - } -} -############################ main ############################# - -# load arguments -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# set which evaluation are used -if(as.integer(doR2) == 1) { - doR2 <- T -} -if(as.integer(doCor) == 1) { - doCor <- T -} -# load genotype & phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t",h=T) -phenotype <- read.table(phenotype, sep="\t",h=T)[,1] -# load prediction -con = file(prediction) -prediction <- readLines(con = con, n = 1, ok=T) -close(con) -prediction <- readRDS(prediction) -# load folds -con = file(folds) -folds <- readLines(con = con, n = 1, ok=T) -close(con) -folds <- readRDS(folds) -# evaluation -evaluatePredictions(data=genotype, prediction=prediction, traitValue=phenotype, doR2=doR2, - folds=folds, doCor=doCor) \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.xml --- a/evaluation.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - evaluate results of classifiers prediction - - evaluation.R $config > ${output1} - - - - - - - - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${prediction}" -> prediction -"${r2}" -> doR2 -"${cor}" -> doCor -"${folds}" -> folds - - - - - - - - - evaluate the predictions of classifiers. For now correlation between true valeus and predictions, and r-squared distance are implemeted - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.R --- a/folds.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -######################################################## -# -# creation date : 05/01/16 -# last modification : 27/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - - -############################ main function ####################### - -# create fold by picking at random row indexes -createFolds <- function(nbObs, n) { - # pick indexes - index <- sample(1:n, size=nbObs, replace = T) - # populate folds - folds <- NULL - for(i in 1:n) { - folds <- c(folds, list(which(index==i))) - } - return(folds) -} - -############################ main ############################# -# load arguments -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load data and merge them -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -# fold creation -nObs <- nrow(read.table(genotype, sep="\t", h=T)) -folds <- createFolds(nObs, as.numeric(n)) -# save them into a rds and send back to galaxy the path -out <- paste(out,".rds",sep="") -saveRDS(folds, file=out) -cat(paste(out, "\n", sep="")) \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.xml --- a/folds.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - create folds for classifiers evaluation through cross-validation - - folds.R $config > ${output1} - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${n}" -> n -"${foldsFile}" -> out - - - - - - - - - - Takes the genotypes and use it to determine the different folds for further cross-valisations - return a rda file that contains a list of indexes of the genotype, each element of the list is a fold - the list is made to be used by other tools of the OGHMA suite. - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.R --- a/lasso.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ -######################################################## -# -# creation date : 08/01/16 -# last modification : 01/09/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -suppressWarnings(suppressMessages(library(glmnet))) -library(methods) -############################ helper functions ####################### - - -# optimize alpha parameter -optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) { - acc <- NULL - indexAlpha <- 1 - for(a in alpha) { - curAcc <- NULL - # repeat nfolds time each analysis - for(i in 1:repet) { - # draw at random 1/3 of the training set for testing and thus choose alpha - # note it is not a cross-validation - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - # create training set and test set - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - # cv.glmnet allow to compute lambda at the current alpha - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) - # create model - model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) - # predict test set - pred <- predict(model, test, type = "response") - # compute r2 for choosing the best alpha - curAcc <- c(curAcc, r2(phenoTest, pred)) - } - # add mean r2 for this value of lambda to the accuracy vector - acc <- c(acc, mean(curAcc)) - } - # choose best alpha - names(acc) <- alpha - return(as.numeric(names(acc)[which.max(acc)])) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -################################## main function ########################### - -lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { - # go for optimization - if(is.null(alpha)) { - alpha <- seq(0,1,0.1) - alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) - } - # evaluation - if(evaluation) { - prediction <- NULL - # do cross-validation - for(i in 1:length(folds)) { - # create training and test set - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - # cv.glmnet helps to compute the right lambda for a chosen alpha - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) - # create model - lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) - # predict value of the test set for further evaluation - prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) - } - # save predicted value for test set of each fold for further evaluation - saveRDS(prediction, file=paste(outFile,".rds", sep="")) - # just create a model - } else { - # cv.glmnet helps to compute the right lambda for a chosen alpha - cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) - # create model - model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) - # save model - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - -############################ main ############################# -# load argument -cmd <- commandArgs(T) -source(cmd[1]) -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load classifier parameters -alpha <- as.numeric(alpha) -if(alpha < 0 | alpha > 1) {alpha <- NULL} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -lasso(genotype = data.matrix(genotype), phenotype = phenotype, - evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) -# return path of the result file to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.xml --- a/lasso.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - predict phenotype using a LASSO approach - - lasso.R $config > ${output1} - - - - - - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${alpha}" -> alpha - - - - - - - - - - make the classification using the LASSO method - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.R --- a/plotPrediction.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -######################################################## -# -# creation date : 07/06/16 -# last modification : 07/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -library("miscTools") -# scatterplot of the prediction vs target -r2.plot <- function(true, predicted) { - # the scatterplot - plot(true, predicted, xlab="trait value", ylab="predicted value", main="", pch=16, - ylim=c(min(min(true), min(predicted)), max(max(true), max(predicted)))) - # add a red lines with ideal case - lines(true, true, col="red") -} - -############################ main ############################# -# load argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load prediction and target -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -predicted <- read.table(predicted, sep = "\t", h=T)[,2] -# plot in a pdf that will be available in galaxy history panel -pdf(out) -r2.plot(phenotype, predicted = predicted) -dev.off() \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.xml --- a/plotPrediction.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - shows scatterplot of a prediction vs true phenotype - - plotPrediction.R $config - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${predictedPhenotype}" -> predicted -"${phenotype}" -> phenotype -"${r2}" -> out - - - - - - - - - - draw R2 of prediction vs true phenotype - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.R --- a/prediction.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -######################################################## -# -# creation date : 26/01/16 -# last modification : 02/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -library(rrBLUP) -suppressWarnings(suppressMessages(library(randomForest))) -library(e1071) -suppressWarnings(suppressMessages(library(glmnet))) -library(methods) - - -############################ main ############################# -classifierNames <- c("list", "randomForest", "svm", "glmnet") -# load argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load data -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -con = file(model) -model <- readLines(con = con, n = 1, ok=T) -close(con) -model <- readRDS(model) -# check if the classifier name is valid -if(all(is.na(match(class(model), classifierNames)))) { - stop(paste(class(model), "is not recognized as a valid model. Valid models are : ", classifierNames)) -} -# run prediction according to the classifier -# rrBLUP prediction -if(any(class(model) == "list")) { - predictions <- as.matrix(genotype) %*% as.matrix(model$u) - predictions <- predictions[,1]+model$beta - predictions <- data.frame(lines=rownames(genotype), predictions=predictions) -# LASSO prediction -} else if(any(class(model) == "glmnet")) { - predictions <- predict(model, as.matrix(genotype), type = "response") - predictions <- data.frame(lines=rownames(genotype), predictions=predictions) -# SVM or RandomForest prediction (predict is a wrapper for many machine learning function) -} else { - predictions <- predict(model, genotype) - predictions <- data.frame(lines=names(predictions), predictions=predictions) -} -# save results -write.table(predictions, file=out, sep="\t", row.names = F) - diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.xml --- a/prediction.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - use machine learning model to predict phenotype from genotype - - prediction.R $config > ${output1} - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${model}" -> model -"${output1}" -> out - - - - - - - - - - Use model designed by any clasifier tool from the OGHMA suite to predict the phenotype of a dataset provided as input. results are stored in a folder under a .rda file - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.R --- a/randomForest.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -######################################################## -# -# creation date : 07/01/16 -# last modification : 25/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - -suppressWarnings(suppressMessages(library(randomForest))) -############################ helper functions ####################### -# optimize -optimize <- function(genotype, phenotype, ntree=1000, - rangeMtry=seq(ceiling(ncol(genotype)/5), - ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)), - repet=3) { - # accuracy over all mtry values - acc <- NULL - for(mtry in rangeMtry) { - # to compute the mean accuracy over repetiotion for the current mtry value - tempAcc <- NULL - for(i in 1:repet) { - # 1/3 of the dataset is used as test set - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - # create training and test set - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - # compute model - model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) - # predict on test set and compute accuracy - pred <- predict(model, test) - tempAcc <- c(tempAcc, r2(phenoTest, pred)) - } - # find mean accuracy for the current mtry value - acc <- c(acc, mean(tempAcc)) - } - # return mtry for the best accuracy - names(acc) <- rangeMtry - bestParam <- which.max(acc) - return(rangeMtry[bestParam]) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -################################## main function ########################### -rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) { - - # go for optimization - if(is.null(mtry)) { - # find best mtry - mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)) - mtry <- optimize(genotype=genotype, phenotype=phenotype, - ntree = ntree, rangeMtry = mtry) - } - # evaluation - if(evaluation) { - prediction <- NULL - for(i in 1:length(folds)) { - # create training and testing set for the current fold - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - # compute model - rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) - # predict and save prediction for the current fold - prediction <- c(prediction, list(predict(rf, test))) - } - # save preductions for all folds to be used for evaluation - saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) - } else { - # just compute the model and save it - model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - - -############################ main ############################# -# load parameters -cmd <- commandArgs(T) -source(cmd[1]) -# load classifier parameters -mtry <- as.numeric(mtry) -ntree <- as.numeric(ntree) -if(mtry == -1) {mtry <- NULL} -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, - evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) -# send the path containing results to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.xml --- a/randomForest.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - predict phenotype using a random forest approach - - randomForest.R $config > ${output1} - - - - - - - - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${mtry}" -> mtry -"${ntree}" -> ntree - - - - - - - - - - make the classification using the random forest method - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.R --- a/rrBLUP.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - -######################################################## -# -# creation date : 05/01/16 -# last modification : 25/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - - -library(rrBLUP) -############################ helper functions ####################### - - -################################## main function ########################### -# do rrBLUP evaluation of classification. -# optimization of paramaters is included in rrBLUP package -rrBLUP <- function(genotype, phenotype, outFile, evaluation=F, folds) { - # Evaluation mode - if(evaluation) { - prediction <- NULL - # run over folds - for(i in 1:length(folds)) { - # create training and test set for this fold - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - # create model - model <-mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F) - # predict current test set - pred <- as.matrix(test) %*% as.matrix(model$u) - pred <- pred[,1]+model$beta - prediction <- c(prediction, list(pred)) - } - # save results - saveRDS(prediction, file=paste(outFile,".rds", sep="")) - # just create a model - } else { - # create and save modle - model <-mixed.solve(phenotype, Z=genotype,K=NULL, SE=F,return.Hinv = F) - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - - -############################ main ############################# -# get argument from xml file -cmd <- commandArgs(T) -source(cmd[1]) -# for evaluation mode : set evaluation as True and load fold file -if(as.integer(evaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} else{ - evaluation <- F -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -rrBLUP(genotype = genotype, phenotype = phenotype, outFile = out, - evaluation = evaluation, folds = folds) -# return path of the result to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.xml --- a/rrBLUP.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - predict phenotype using a rrBLUP approach - - rrBLUP.R $config > ${output1} - - - - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${model}" -> out -"${eval}" -> evaluation -"${folds}" -> folds - - - - - - - - - - - make the classification using the rrBLUP method - - \ No newline at end of file diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.R --- a/svm.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -######################################################## -# -# creation date : 07/01/16 -# last modification : 03/07/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## -library("e1071") -options(warn=-1) -############################ helper functions ####################### -# optimize svm parameters -optimizeSVM <- function(train, target, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) { - # tuning parameters then train - model <- NULL - if(is.null(g)){g <- 10^(-6:0)} - if(is.null(c)){c <- 10^(-1:0)} - # optimize parameter for the kernel in use - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - if(is.null(coef)){coef <- 0:3}; - # optimize then extract best parameters - tune <- tune.svm(train, target, gamma = g, cost = 10^(0:2), kernel="sigmoid", coef0 = coef); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[3]]; - coef <- tune$best.parameters[[2]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, - # linear kernel, only cost is required - linear={ - # optimize then extract best parameters - tune <- tune.svm(train, target, cost = c, kernel="linear"); - c <- tune$best.parameters[[1]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "linear")}, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - # optimize and extract best parameters - if(is.null(coef)){coef <- 0:3}; - if(is.null(d)){d <- 0:4}; - tune <- tune.svm(train, target, gamma = g, cost = c, degree = d, coef0 = coef, kernel="polynomial"); - d <- tune$best.parameters[[1]]; - g <- tune$best.parameters[[2]]; - coef <- tune$best.parameters[[3]]; - c <- tune$best.parameters[[4]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, - # default : radial kernel, use gamma and cost as parameters - { - # optimize and extract parameters - tune <- tune.svm(train, target, gamma = g, cost = c, kernel="radial"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} - ) - return(model) -} -################################## main function ########################### -svmClassifier <- function(genotype, phenotype, evaluation = T, outFile, folds, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) { - # optimize classifier if any parameter is NULL - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - if(any(is.null(c(coef, c, g)))){ - fit <- optimizeSVM(genotype, phenotype, kernel = "sigmoid", - g = g, c=c, coef = coef); - c <- fit$cost; - g <- fit$gamma; - coef <- fit$coef0; - } - }, - # linear kernel, only cost is required - linear={ - if(is.null(c)){fit <- optimizeSVM(genotype, phenotype, kernel = "linear", c=c); - c <- fit$cost; - } - }, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - if(any(is.null(c(coef, c, g, d)))){fit <- optimizeSVM(genotype, phenotype, kernel = "polynomial", - g = g, c=c, coef = coef, d = d); - c <- fit$cost; - g <- fit$gamma; - coef <- fit$coef0; - d <- fit$degree - } - }, - # default : radial kernel, use gamma and cost as parameters - {if(any(is.null(c(c, g)))){fit <- optimizeSVM(genotype, phenotype, kernel = "radial", - g = g, c=c); - c <- fit$cost; - g <- fit$gamma; - } - } - ) - # do evaluation - if(evaluation) { - prediction <- NULL - # iterate over folds - for(i in 1:length(folds)) { - # prepare indexes for training and test - test <- folds[[i]] - train <- unlist(folds[-i]) - # compute model - svm.fit <- optimizeSVM(train = genotype[train,], target = phenotype[train], kernel=kernel, - g=g, c=c, coef=coef, d=d) - # predict on test set of the current fold - prediction <- c(prediction, list(predict(svm.fit, genotype[test,]))) - } - # save all prediction for further evaluation - saveRDS(prediction, file=paste(outFile, ".rds", sep = "")) - } else { - # compute model and save it - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - model <- svm(x = genotype, y = phenotype, kernel="sigmoid", gamma =g, - cost =c, coef0=coef) - }, - # linear kernel, only cost is required - linear={ - model <- svm(x = genotype, y = phenotype, kernel="linear", cost =c) - }, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - model <- svm(x = genotype, y = phenotype, kernel="polynomial", gamma =g, cost =c, - coef0=coef, degree =d) - }, - # default : radial kernel, use gamma and cost as parameters - { model <- svm(x = genotype, y = phenotype, kernel="radial", gamma =g, cost =c) - }) - saveRDS(model, file=paste(outFile, ".rds", sep = "")) - } -} - -############################ main ############################# -# load argument -cmd <- commandArgs(T) -source(cmd[1]) -# check for svm paramater, especially to know if optimization is requiered -if(as.numeric(g) == -1) {g <- NULL} -if(as.numeric(c) == -1) {c <- NULL} -if(as.numeric(coef) == -1) {coef <- NULL} -if(as.numeric(d) == -1) {d <- NULL} -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -svmClassifier(genotype = genotype, phenotype = phenotype, - evaluation = evaluation, outFile = out, folds = folds, g=g, c=c, coef=coef, d=d, kernel=kernel) -# retunr path of the result file to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.xml --- a/svm.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - predict phenotype using a SVM approach - - svm.R $config > ${output1} - - - - - - - - - - - - - - - - - - - - - - - - - - -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${kernel}" -> kernel -"${gamma}" -> g -"${cost}" -> c -"${coeficient}" -> coef -"${degree}" -> d - - - - - - - - - - make the classification using the SVM method - - \ No newline at end of file