Mercurial > repos > nicolas > oghma

--- a/aggregation.R	Wed Oct 26 18:42:20 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,320 +0,0 @@
-########################################################
-#
-# creation date : 25/10/16
-# last modification : 25/10/16
-# author : Dr Nicolas Beaume
-#
-########################################################
-
-suppressWarnings(suppressMessages(library(GA)))
-library("miscTools")
-library(rpart)
-suppressWarnings(suppressMessages(library(randomForest)))
-library(e1071)
-suppressWarnings(suppressMessages(library(glmnet)))
-options(warn=-1)
-############################ helper functions #######################
-
-##### Genetic algorithm
-
-# compute r2 by computing the classic formula
-# compare the sum of square difference from target to prediciton
-# to the sum of square difference from target to the mean of the target
-r2 <- function(target, prediction) {
-  sst <- sum((target-mean(target))^2)
-  ssr <- sum((target-prediction)^2)
-  return(1-ssr/sst)
-}
-
-optimizeOneIndividual <- function(values, trueValue) {
-  # change the value into a function
-  f <- function(w) {sum(values * w/sum(w))}
-  fitness <- function(x) {1/abs(trueValue-f(x))}
-  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
-             maxiter = 1000, monitor = NULL, keepBest = T)
-  resp@solution <- resp@solution/sum(resp@solution)
-  return(resp)
-}
-
-optimizeWeight <- function(values, trueValue, n=1000) {
-  fitnessAll <- function(w) {
-    predicted <- apply(values, 1, weightedPrediction.vec, w)
-    return(mean(r2(trueValue, predicted)))
-    #return(mean(1/abs(trueValue-predicted)))
-  }
-  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
-             maxiter = n, monitor = NULL, keepBest = T)
-  resp@solution <- resp@solution/sum(resp@solution)
-  return(resp)
-}
-
-weightedPrediction <- function(classifiers, w) {
-  if(length(w) > ncol(classifiers)) {
-    warning("more weights than classifiers, extra weigths are ignored")
-    w <- w[1:ncol(classifiers)]
-  } else if(length(w) < ncol(classifiers)) {
-    warning("less weights than classifiers, extra classifiers are ignored")
-    classifiers <- classifiers[,1:length(w)]
-  }
-  prediction <- NULL
-  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
-  return(prediction)
-}
-
-weightedPrediction.vec <- function(values, w) {
-  return(sum(values * w/sum(w)))
-}
-
-##### meta-decision tree
-
-tuneTree <- function(data, target) {
-  data <- data.frame(data, target=target)
-  size <-  nrow(data)
-  xerror <-  NULL
-  split <-  1:ceiling(size/5)
-  leafSize <-  1:ceiling(size/10)
-  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
-  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
-  for(i in 1:length(split)) {
-    for(j in 1:length(leafSize)) {
-      op <- list(minsplit=split[i], minbucket=leafSize[j])
-      tree <- rpart(target ~., data=data, control=op, method="anova")
-      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
-      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
-    }
-  }
-  index <- which(xerror==min(xerror), arr.ind = T)
-  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
-  return(op)
-}
-
-###### meta-LASSO
-# create fold by picking at random row indexes
-createFolds <- function(nbObs, n) {
-  # pick indexes
-  index <- sample(1:n, size=nbObs, replace = T)
-  # populate folds
-  folds <- NULL
-  for(i in 1:n) {
-    folds <- c(folds, list(which(index==i)))
-  }
-  return(folds)
-}
-
-searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
-  folds <- createFolds(nrow(genotype), n = n)
-  acc <- NULL
-  indexAlpha <- 1
-  for(a in alpha) {
-    curAcc <- NULL
-    for(i in 1:n) {
-      train <- genotype[-folds[[i]],]
-      test <- genotype[folds[[i]],]
-      phenoTrain <- phenotype[-folds[[i]]]
-      phenoTest <- phenotype[folds[[i]]]
-      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
-      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
-      pred <- predict(model, test, type = "response")
-      curAcc <- c(curAcc, r2(phenoTest, pred))
-    }
-    acc <- c(acc, mean(curAcc))
-  }
-  names(acc) <- alpha
-  return(as.numeric(names(acc)[which.max(acc)]))
-}
-
-###### meta-random forest
-
-searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
-  n <- ceiling(nrow(genotype)/3)
-  indexTest <- sample(1:nrow(genotype), size=n)
-  train <- genotype[-indexTest,]
-  test <- genotype[indexTest,]
-  phenoTrain <- phenotype[-indexTest]
-  phenoTest <- phenotype[indexTest]
-  acc <- NULL
-  indexNtree <- 1
-  for(ntree in rangeNtree) {
-    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
-    pred <- predict(model, test)
-    acc <- c(acc, r2(phenoTest, pred))
-  }
-  names(acc) <- rangeNtree
-  best <- which.max(acc)
-  return(as.numeric(names(acc)[best]))
-}
-
-###### meta-SVM
-searchParamSVM <- function(train, target, kernel="radial") {
-  # tuning parameters then train
-  model <- NULL
-  switch(kernel,
-         sigmoid={
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
-           g <- tune$best.parameters[[1]];
-           c <- tune$best.parameters[[2]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
-         linear={
-           tune <-  tune.svm(train, target, cost = 10^(0:2), kernel="linear");
-           c <- tune$best.parameters[[1]];
-           model <-  svm(x=train, y=target, cost = c, kernel = "linear")},
-         polynomial={
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
-           d <- tune$best.parameters[[1]];
-           g <- tune$best.parameters[[2]];
-           coef <- tune$best.parameters[[3]];
-           c <- tune$best.parameters[[4]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
-         {
-           tune <-  tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
-           g <- tune$best.parameters[[1]];
-           c <- tune$best.parameters[[2]];
-           model <-  svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
-  )
-  return(model)
-}
-
-#################### upper level functions #####################
-
-aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) {
-  if(!prediction) {
-    treeParam <- tuneTree(classifiers, target)
-    data <- data.frame(classifiers, target)
-    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
-    model <- prune(model, cp=treeParam["cp"])
-    out <- paste(out, ".rds", sep = "")
-    saveRDS(model, out)
-    return(out)
-  } else {
-    return(predict(model, classifiers))
-  }
-}
-
-aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){
-  if(!prediction) {
-    opt <- optimizeWeight(values = classifiers, trueValue = target)
-    out <- paste(out, ".rds", sep = "")
-    saveRDS(opt@solution, out)
-    return(out)
-  } else {
-    return(weightedPrediction.vec(classifiers, model))
-  }
-}
-
-aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {
-  if(!prediction) {
-    alpha <- searchParamLASSO(classifiers, target)
-    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
-    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
-    out <- paste(out, ".rds", sep = "")
-    saveRDS(model, out)
-    return(out)
-  } else {
-    return(predict(model, classifiers))
-  }
-}
-
-aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {
-  if(!prediction) {
-   ntree <- searchParamRF(genotype = classifiers, phenotype = target,
-                                              rangeNtree = seq(100, 1000, 100))
-    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
-    out <- paste(out, ".rds", sep = "")
-    saveRDS(model, out)
-    return(out)
-  } else {
-    return(predict(model, classifiers))
-  }
-}
-
-aggregateSVM <- function(classifiers, target=NULL, prediction=F,
-                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
-  if(!prediction) {
-      model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
-      out <- paste(out, ".rds", sep = "")
-      saveRDS(model, out)
-      return(out)
-  } else {
-    return(predict(model, classifiers))
-  }
-}
-
-################################### main #############################
-# # load argument
-cmd <- commandArgs(T)
-source(cmd[1])
-# check if evaluation is required
-evaluation <- F
-if(as.integer(doEvaluation) == 1) {
-  evaluation <- T
-  con = file(folds)
-  folds <- readLines(con = con, n = 1, ok=T)
-  close(con)
-  folds <- readRDS(folds)
-}
-# check for model
-if(model == "None") {
-  model <- NULL
-  prediction <- F
-} else {
-  prediction <- T
-  con = file(model)
-  model <- readLines(con = con, n = 1, ok=T)
-  close(con)
-  model <- readRDS(model)
-}
-# load classifiers and phenotype
-classifiers <- NULL
-classifNames <- NULL
-if(lassoPred !="None"){
-  classifiers <- c(classifiers, lassoPred)
-  classifNames <- c(classifNames, "lasso")
-}
-if(rrBLUPPred !="None"){
-  classifiers <- c(classifiers, rrBLUPPred)
-  classifNames <- c(classifNames, "rrBLUP")
-}
-if(rfPred !="None"){
-  classifiers <- c(classifiers, rfPred)
-  classifNames <- c(classifNames, "rf")
-}
-if(svmPred !="None"){
-  classifiers <- c(classifiers, svmPred)
-  classifNames <- c(classifNames, "svm")
-}
-classifPrediction <- NULL
-for(classif in classifiers) {
-  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T)))
-}
-classifPrediction <- data.frame(classifPrediction)
-colnames(classifPrediction) <- classifNames
-# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
-phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
-# aggregate !
-switch(method,
-       geneticMean={
-         res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
-                              out = out, prediction = prediction, model=model)
-       },
-       dt={
-         res <- aggregateDT(classifiers = classifPrediction, target = phenotype,
-                     out = out, prediction = prediction, model=model)
-       },
-       lasso={
-         res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,
-                     out = out, prediction = prediction, model=model)
-       },
-       rf={
-         res <- aggregateRF(classifiers = classifPrediction, target = phenotype,
-                     out = out, prediction = prediction, model=model)
-       },
-       # svm
-       {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
-                     out = out, prediction = prediction, model = model)}
-       )
-if(prediction) {
-  write.table(data.frame(lines=rownames(classifPrediction), res), out,
-                         sep="\t", row.names = F)
-} else {
-  cat(out, "\n", sep="")
-}
\ No newline at end of file
--- a/aggregation.xml	Wed Oct 26 18:42:20 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-<tool id="aggreg" name="aggregation" version="1.0.0">
-  <description>predict phenotype by combining multiple classifiers</description>
-  <command interpreter="Rscript">
-	  aggregation.R $config &gt; ${output1}
-  </command>
-
-  <inputs>
-	<param name="lassoPred" type="data" optional="true"
-		label="lasso prediction" help="path to rds containing LASSO prediction"
-	/>
-
-	<param name="rfPred" type="data" optional="true"
-		label="rf prediction" help="path to rds containing Random Forest prediction"
-		/>
-
-	<param name="rrBLUPPred" type="data" optional="true"
-		label="rrBLUP prediction" help="path to rds containing rrBLUP prediction"
-		/>
-
-	<param name="svmPred" type="data" optional="true"
-		label="SVM prediction" help="path to rds containing SVM prediction"
-		/>
-
-	<param name="phenotype" type="data"
-			label="phenotype data" help=" a tabular datatype containing the phenotypes "
-			/>
-
-	<param name="eval" type="integer" value="0"
-			label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model "
-			/>
-
-	<param name="folds" type="data" optional="true"
-			label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. "
-			/>
-
-	<param name="model" type="data" optional="true"
-			label="model" help= " a path to a file where the results (depending on the chosen mode) will be writen"
-			/>
-	<!-- deprecated <param name="out" type="text"
-			label="output path" help= " a path to a rds file" -->
-			/>
-	<param name="method" type="text" value="svm"
-			label="aggregation method" help= "choose among geneticMean, rrBLUP, lasso, rf or svm"
-			/>
-	<param name="kernel" type="text" value="linear"
-			label="kernel for SVM" help= "choose among linear, polynomial, radial, sigmoid"
-			/>
-
-  </inputs>
-
-  <configfiles>
-    <configfile name="config">
-## Desc: this file is sourced in encode wrapper script
-##  as means to pass all galaxy params to R
-"${lassoPred}" -> lassoPred
-"${rfPred}" -> rfPred
-"${rrBLUPPred}" -> rrBLUPPred
-"${svmPred}" -> svmPred
-"${phenotype}" -> phenotype
-"${model}" -> model
-"${output1}" -> out
-"${eval}" -> evaluation
-"${folds}" -> folds
-"${method}" -> method
-"${kernel}" -> kernel
-"${eval}" -> doEvaluation
-
-    </configfile>
-</configfiles>
-
-<outputs>
-	<data format="tabular" name = "output1" label="aggregation output" />
-</outputs>
-
-  <help>
-
-  </help>
-  </tool>
\ No newline at end of file