changeset 0:bdebdea5f6a7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 2f34a215c35f08c3666f314a87d235437baa1d21
author iuc
date Mon, 12 Jun 2017 07:41:02 -0400
parents
children 76d01fe0ec36
files limma_voom.R limma_voom.xml test-data/anno.txt test-data/limma-voom_Mut-WT.tsv test-data/limma-voom_Mut-WTanno.tsv test-data/limma-voom_WT-Mut.tsv test-data/matrix.txt
diffstat 7 files changed, 1077 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/limma_voom.R	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,654 @@
+# This tool takes in a matrix of feature counts as well as gene annotations and
+# outputs a table of top expressions as well as various plots for differential
+# expression analysis
+#
+# ARGS: 1.countPath       -Path to tab-delimited file containing counts
+#       2.annoPath        -Path to tab-delimited gene annotations or "None"
+#       3.htmlPath        -Path to html file linking to other outputs
+#       4.outPath         -Path to folder to write all output to
+#       5.rdaOpt          -String specifying if RData should be saved
+#       6.normOpt         -String specifying type of normalisation used
+#       7.weightOpt       -String specifying usage of weights
+#       8.contrastData    -String containing contrasts of interest
+#       9.cpmReq          -Float specifying cpm requirement
+#       10.sampleReq      -Integer specifying minimum samples meeting cpmReq
+#       11.pAdjOpt        -String specifying the p-value adjustment method
+#       12.pValReq        -Float specifying the p-value requirement
+#       13.lfcReq         -Float specifying the log-fold-change requirement
+#       14.factorData     -String containing factor names and values
+#
+# OUT:  Voom Plot
+#       BCV Plot (currently disabled)
+#       MDS Plot
+#       MA Plot
+#       Top Expression Table
+#       HTML file linking to the outputs
+#
+# Author: Shian Su - registertonysu@gmail.com - Jan 2014
+
+# Record starting time
+timeStart <- as.character(Sys.time())
+
+# Load all required libraries
+library(methods, quietly=TRUE, warn.conflicts=FALSE)
+library(statmod, quietly=TRUE, warn.conflicts=FALSE)
+library(splines, quietly=TRUE, warn.conflicts=FALSE)
+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
+library(limma, quietly=TRUE, warn.conflicts=FALSE)
+library(scales, quietly=TRUE, warn.conflicts=FALSE)
+
+if (packageVersion("limma") < "3.20.1") {
+  stop("Please update 'limma' to version >= 3.20.1 to run this tool")
+}
+
+################################################################################
+### Function Declaration
+################################################################################
+# Normalise contrast equations: remove the spaces surrounding each arithmetic
+# operator (+, -, /, *) and then trim leading and trailing whitespace
+sanitiseEquation <- function(equation) {
+  for (op in c("+", "-", "/", "*")) {
+    equation <- gsub(paste0(" *[", op, "] *"), op, equation)
+  }
+  gsub("^\\s+|\\s+$", "", equation)
+}
+
+# Tidy a comma-separated list of group labels: drop the spaces around every
+# comma, then trim whitespace from both ends of the string
+sanitiseGroups <- function(string) {
+  noPaddedCommas <- gsub(" *[,] *", ",", string)
+  gsub("^\\s+|\\s+$", "", noPaddedCommas)
+}
+
+# Swap every literal period in a string for a single space
+unmake.names <- function(string) {
+  gsub(".", " ", string, fixed = TRUE)
+}
+
+# Build the path of a file inside the output folder (reads the global outPath)
+makeOut <- function(filename) {
+  paste(outPath, filename, sep = "/")
+}
+
+# Prefix a factor name with "factors$" so that it refers to a column of the
+# factors data frame
+pasteListName <- function(string) {
+  paste("factors", string, sep = "$")
+}
+
+# cata: a variant of cat() with different defaults - output goes to the html
+# report (htmlPath), the separator is empty and text is appended rather than
+# overwriting the file.
+#
+# Args:   ...     objects to write, passed on to cat()
+#         file    file name, "" for stdout, "|cmd" for a pipe, or a connection
+#         sep     separator placed between the objects
+#         fill, labels, append   as for cat()
+#
+# The call is delegated to the public cat() instead of .Internal(cat(...)):
+# .Internal is a private interface of R and is not meant to be called from
+# user code. cat() performs the same handling of file names, pipes and append
+# mode, and closes any connection it opens itself.
+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, 
+                 append = TRUE) {
+  cat(..., file = file, sep = sep, fill = fill, labels = labels,
+      append = append)
+}
+
+# Write the html <head> section, containing only the page title, to the report
+HtmlHead <- function(title) {
+  cata("<head>\n",
+       "<title>", title, "</title>\n",
+       "</head>\n")
+}
+
+# Write a hyperlink that opens in a new tab, followed by a line break; the
+# link text defaults to the address itself
+HtmlLink <- function(address, label=address) {
+  cata(
+    "<a href=\"", address, "\" target=\"_blank\">",
+    label,
+    "</a><br />\n"
+  )
+}
+
+# Write an <img> tag to the report; the alt text defaults to the image source
+# and the image is 600x600 unless other dimensions are given
+HtmlImage <- function(source, label=source, height=600, width=600) {
+  cata("<img src=\"", source, "\" alt=\"", label,
+       "\" height=\"", height,
+       "\" width=\"", width, "\"/>\n")
+}
+
+# Function to write code for html list items: everything passed in is written
+# to the report between <li> and </li> with no separator
+ListItem <- function(...) {
+  cata("<li>", ..., "</li>\n")
+}
+
+# Function to write code for an html table cell: the arguments are written to
+# the report between <td> and </td>; with no arguments an empty cell is written
+TableItem <- function(...) {
+  cata("<td>", ..., "</td>\n")
+}
+
+# Function to write code for an html table header cell: the arguments are
+# written to the report between <th> and </th>
+TableHeadItem <- function(...) {
+  cata("<th>", ..., "</th>\n")
+}
+
+################################################################################
+### Input Processing
+################################################################################
+
+# Collects arguments from command line
+argv <- commandArgs(TRUE)
+
+# The 13 fixed arguments must be followed by at least one factor definition.
+# Without this check `14:length(argv)` below would count downwards and read
+# missing arguments, failing later with an unrelated error message.
+if (length(argv) < 14) {
+  stop("Expected at least 14 arguments but received ", length(argv))
+}
+
+# Grab arguments
+countPath <- as.character(argv[1])
+annoPath <- as.character(argv[2])
+htmlPath <- as.character(argv[3])
+outPath <- as.character(argv[4])
+rdaOpt <- as.character(argv[5])
+normOpt <- as.character(argv[6])
+weightOpt <- as.character(argv[7])
+contrastData <- as.character(argv[8])
+cpmReq <- as.numeric(argv[9])
+sampleReq <- as.numeric(argv[10])
+pAdjOpt <- as.character(argv[11])
+pValReq <- as.numeric(argv[12])
+lfcReq <- as.numeric(argv[13])
+
+# Every remaining argument describes one factor and is split into a row holding
+# its name and its comma separated levels
+factorData <- list()
+for (i in 14:length(argv)) {
+  newFact <- unlist(strsplit(as.character(argv[i]), split="::"))
+  factorData <- rbind(factorData, newFact)
+} # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,...
+
+# Translate the string options into logical flags
+
+# TRUE when sample quality weights were requested
+wantWeight <- if (weightOpt=="yes") TRUE else FALSE
+
+# TRUE when the R objects should be saved to an .rda file
+wantRda <- if (rdaOpt=="yes") TRUE else FALSE
+
+# TRUE unless the annotation path is the placeholder "None"
+haveAnno <- if (annoPath=="None") FALSE else TRUE
+
+# Set the row names to be the name of the factor and delete first column
+row.names(factorData) <- factorData[, 1]
+factorData <- factorData[, -1]
+# Remove whitespace around the commas and at the ends of each level string
+factorData <- sapply(factorData, sanitiseGroups)
+# Split each comma separated string into its individual level values
+factorData <- sapply(factorData, strsplit, split=",")
+# Turn the level values into syntactically valid R names
+factorData <- sapply(factorData, make.names)
+
+# Transform factor data into data frame of R factor objects
+# NOTE(review): whether the columns become factors depends on the
+# stringsAsFactors default of the running R version - confirm
+factors <- data.frame(factorData)
+
+# Create output directory
+dir.create(outPath, showWarnings=FALSE)
+
+# Split up contrasts separated by comma into a vector then sanitise; spaces
+# left inside a contrast are replaced by periods
+contrastData <- unlist(strsplit(contrastData, split=","))
+contrastData <- sanitiseEquation(contrastData)
+contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+
+# An empty contrast string previously produced output paths containing "NA"
+# and only failed later on; report the real problem instead
+if (length(contrastData) == 0) {
+  stop("No contrasts of interest were supplied")
+}
+
+bcvOutPdf <- makeOut("bcvplot.pdf")
+bcvOutPng <- makeOut("bcvplot.png")
+mdsOutPdf <- makeOut("mdsplot.pdf")
+mdsOutPng <- makeOut("mdsplot.png")
+voomOutPdf <- makeOut("voomplot.pdf")
+voomOutPng <- makeOut("voomplot.png") 
+
+# Output paths for each contrast as vectors (one element per contrast)
+maOutPdf <- makeOut(paste0("maplot_", contrastData, ".pdf"))
+maOutPng <- makeOut(paste0("maplot_", contrastData, ".png"))
+topOut <- makeOut(paste0("limma-voom_", contrastData, ".tsv"))
+
+rdaOut <- makeOut("RData.rda")
+sessionOut <- makeOut("session_info.txt")
+
+# Initialise data for html links and images, data frame with columns Label and 
+# Link; rows are added as each output file is produced
+linkData <- data.frame(Label=character(), Link=character(),
+                       stringsAsFactors=FALSE)
+imageData <- data.frame(Label=character(), Link=character(),
+                        stringsAsFactors=FALSE)
+
+# Initialise vectors for storage of up/down/neutral regulated counts (one
+# element per contrast)
+upCount <- numeric()
+downCount <- numeric()
+flatCount <- numeric()
+                        
+# Read in counts and geneanno data (tab separated with a header line); the
+# first column of the counts file provides the row names and is then removed
+counts <- read.table(countPath, header=TRUE, sep="\t")
+row.names(counts) <- counts[, 1]
+counts <- counts[ , -1]
+countsRows <- nrow(counts)
+# NOTE(review): the annotation rows are never matched against the count rows -
+# presumably both files must list the genes in the same order; confirm
+if (haveAnno) {
+  geneanno <- read.table(annoPath, header=TRUE, sep="\t")
+}
+
+################################################################################
+### Data Processing
+################################################################################
+
+# Bundle the counts with the gene annotation; without an annotation file the
+# gene table consists of the count row names only
+data <- list(counts=counts)
+data$genes <- if (haveAnno) geneanno else data.frame(GeneID=row.names(counts))
+
+# Keep only the genes whose cpm exceeds cpmReq in at least sampleReq samples
+# and record how many genes were removed
+preFilterCount <- nrow(data$counts)
+sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
+data$counts <- data$counts[sel, ]
+# NOTE(review): a single-column gene table is dropped to a plain vector by
+# this subsetting - confirm that this is intended
+data$genes <- data$genes[sel, ]
+postFilterCount <- nrow(data$counts)
+filteredCount <- preFilterCount-postFilterCount
+
+# Sample annotation: sample names taken from the count columns plus factors
+samplenames <- colnames(data$counts)
+sampleanno <- data.frame("sampleID"=samplenames, factors)
+
+# Complete the list with library sizes and unit normalisation factors, then
+# turn it into the DGEList object "data"
+data$samples <- sampleanno
+data$samples$lib.size <- colSums(data$counts)
+data$samples$norm.factors <- 1
+row.names(data$samples) <- colnames(data$counts)
+data <- new("DGEList", data)
+
+# Design matrix without intercept, with one term per factor; the "factors$"
+# prefix is removed from the column names afterwards
+factorList <- sapply(names(factors), pasteListName)
+formula <- formula(paste(c("~0", factorList), collapse="+"))
+design <- model.matrix(formula)
+for (term in factorList) {
+  colnames(design) <- gsub(term, "", colnames(design), fixed=TRUE)
+}
+
+# Calculating normalising factor, estimating dispersion
+data <- calcNormFactors(data, method=normOpt)
+#data <- estimateDisp(data, design=design, robust=TRUE)
+
+# Generate contrasts information
+contrasts <- makeContrasts(contrasts=contrastData, levels=design)
+
+# Name rows of factors according to their sample
+row.names(factors) <- names(data$counts)
+
+################################################################################
+### Data Output
+################################################################################
+
+# BCV Plot
+#png(bcvOutPng, width=600, height=600)
+#plotBCV(data, main="BCV Plot")
+#imageData[1, ] <- c("BCV Plot", "bcvplot.png")
+#invisible(dev.off())
+
+#pdf(bcvOutPdf)
+#plotBCV(data, main="BCV Plot")
+#invisible(dev.off())
+
+# Run voom (with sample quality weights if requested) and fit the linear
+# model. The transformation is run once per graphics device because the plot
+# is drawn as a side effect of the call (plot=TRUE).
+if (wantWeight) {
+  # Creating voom data object and plot
+  png(voomOutPng, width=1000, height=600)
+  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
+  imageData[1, ] <- c("Voom Plot", "voomplot.png")
+  invisible(dev.off())
+  
+  pdf(voomOutPdf, width=14)
+  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
+  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
+  invisible(dev.off())
+  
+  # Generating fit data and top table with weights
+  # (wts is also stored in the .rda file when RData output is requested)
+  wts <- vData$weights
+  voomFit <- lmFit(vData, design, weights=wts)
+  
+} else {
+  # Creating voom data object and plot
+  png(voomOutPng, width=600, height=600)
+  vData <- voom(data, design=design, plot=TRUE)
+  imageData[1, ] <- c("Voom Plot", "voomplot.png")
+  invisible(dev.off())
+  
+  pdf(voomOutPdf)
+  vData <- voom(data, design=design, plot=TRUE)
+  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
+  invisible(dev.off())
+  
+  # Generate voom fit
+  voomFit <- lmFit(vData, design)
+  
+}
+
+# Fit the contrasts and compute moderated statistics with eBayes
+voomFit <- contrasts.fit(voomFit, contrasts)
+voomFit <- eBayes(voomFit)
+
+# Plot MDS
+labels <- names(counts)
+png(mdsOutPng, width=600, height=600)
+# Currently only using a single factor; as.factor() keeps the colour coding
+# working when the column is stored as character instead of factor
+plotMDS(vData, labels=labels, col=as.numeric(as.factor(factors[, 1])),
+        cex=0.8)
+# The image was previously labelled "Voom Plot", duplicating the label of the
+# voom image in the report
+imgName <- "MDS Plot"
+imgAddr <- "mdsplot.png"
+imageData <- rbind(imageData, c(imgName, imgAddr))
+invisible(dev.off())
+
+pdf(mdsOutPdf)
+plotMDS(vData, labels=labels, cex=0.5)
+linkName <- "MDS Plot (.pdf)"
+linkAddr <- "mdsplot.pdf"
+linkData <- rbind(linkData, c(linkName, linkAddr))
+invisible(dev.off())
+
+
+# For every contrast: count the differentially expressed genes, write the
+# table of top expressions and draw the MA plot as pdf and png
+for (i in seq_along(contrastData)) {
+
+  # Test result of every gene for this contrast using the requested p-value
+  # adjustment, p-value threshold and log-fold-change threshold
+  status <- decideTests(voomFit[, i], adjust.method=pAdjOpt, p.value=pValReq,
+                        lfc=lfcReq)
+
+  sumStatus <- summary(status)
+
+  # Collect counts for differential expression
+  # NOTE(review): relies on the summary rows being named "1", "-1" and "0" -
+  # confirm this holds for the limma version in use
+  upCount[i] <- sumStatus["1",]
+  downCount[i] <- sumStatus["-1",]
+  flatCount[i] <- sumStatus["0",]
+
+  # Write top expressions table
+  top <- topTable(voomFit, coef=i, number=Inf, sort.by="P")
+  write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
+
+  linkName <- paste0("limma-voom_", contrastData[i], ".tsv")
+  linkAddr <- paste0("limma-voom_", contrastData[i], ".tsv")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+
+  # Plot MA (log ratios vs mean average) using limma package on weighted data.
+  # The drawing code is shared by the pdf and the png device.
+  drawMaPlot <- function() {
+    limma::plotMA(voomFit, status=status, coef=i,
+                  main=paste("MA Plot:", unmake.names(contrastData[i])),
+                  col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
+                  xlab="Average Expression", ylab="logFC")
+    abline(h=0, col="grey", lty=2)
+  }
+
+  pdf(maOutPdf[i])
+  drawMaPlot()
+  linkName <- paste0("MA Plot_", contrastData[i], " (.pdf)")
+  linkAddr <- paste0("maplot_", contrastData[i], ".pdf")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+  invisible(dev.off())
+
+  png(maOutPng[i], height=600, width=600)
+  drawMaPlot()
+  imgName <- paste0("MA Plot_", contrastData[i])
+  imgAddr <- paste0("maplot_", contrastData[i], ".png")
+  imageData <- rbind(imageData, c(imgName, imgAddr))
+  invisible(dev.off())
+}
+
+# Table of up/flat/down counts with one row per contrast
+sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
+row.names(sigDiff) <- contrastData
+
+# Save relevant items as rda object; the sample weights are only part of the
+# file when weights were used. status and top hold the results of the last
+# contrast processed.
+if (wantRda) {
+  rdaObjects <- c("data", "status", "vData", "labels", "factors",
+                  if (wantWeight) "wts",
+                  "voomFit", "top", "contrasts", "design")
+  save(list=rdaObjects, file=rdaOut, ascii=TRUE)
+  linkData <- rbind(linkData, c("RData (.rda)", "RData.rda"))
+}
+
+# Record session info
+sessionText <- capture.output(sessionInfo())
+writeLines(sessionText, sessionOut)
+linkData <- rbind(linkData, c("Session Info", "session_info.txt"))
+
+# Record ending time and calculate total run time (printed form of the time
+# difference with the leading "Time difference of " removed)
+timeEnd <- as.character(Sys.time())
+runTime <- round(difftime(timeEnd,timeStart), digits=3)
+timeTaken <- gsub("Time difference of ", "", capture.output(runTime),
+                  fixed=TRUE)
+################################################################################
+### HTML Generation
+################################################################################
+
+# Clear file
+cat("", file=htmlPath)
+
+cata("<html>\n")
+
+cata("<body>\n")
+cata("<h3>Limma-voom Analysis Output:</h3>\n")
+# The embedded images are PNG files (the text used to call them JPEGS)
+cata("PDF copies of PNGs available in 'Plots' section.<br />\n")
+
+# The first image is the voom plot, which is wider when weights were used
+if (wantWeight) {
+  HtmlImage(imageData$Link[1], imageData$Label[1], width=1000)
+} else {
+  HtmlImage(imageData$Link[1], imageData$Label[1])
+}
+
+# Remaining images (MDS plot and the MA plots)
+for (i in 2:nrow(imageData)) {
+  HtmlImage(imageData$Link[i], imageData$Label[i])
+}
+
+cata("<h4>Differential Expression Counts:</h4>\n")
+
+cata("<table border=\"1\" cellpadding=\"4\">\n")
+
+# Header row: an empty corner cell followed by one heading per count column
+cata("<tr>\n")
+TableItem()
+for (colName in colnames(sigDiff)) {
+  TableHeadItem(colName)
+}
+cata("</tr>\n")
+
+# One row per contrast: the contrast name followed by its counts
+for (rowIdx in seq_len(nrow(sigDiff))) {
+  cata("<tr>\n")
+  TableHeadItem(unmake.names(row.names(sigDiff)[rowIdx]))
+  for (colIdx in seq_len(ncol(sigDiff))) {
+    TableItem(as.character(sigDiff[rowIdx, colIdx]))
+  }
+  cata("</tr>\n")
+}
+cata("</table>")
+
+# The links are grouped by file extension. The patterns are anchored and the
+# dot is escaped: the former patterns ".pdf", ".tsv" and ".rda" matched any
+# character followed by the letters, so e.g. the MA plot of a contrast named
+# "tsvA-B" ("maplot_tsvA-B.pdf") was also listed as a table.
+cata("<h4>Plots:</h4>\n")
+for (i in grep("\\.pdf$", linkData$Link)) {
+  HtmlLink(linkData$Link[i], linkData$Label[i])
+}
+
+cata("<h4>Tables:</h4>\n")
+for (i in grep("\\.tsv$", linkData$Link)) {
+  HtmlLink(linkData$Link[i], linkData$Label[i])
+}
+
+if (wantRda) {
+  cata("<h4>R Data Object:</h4>\n")
+  for (i in grep("\\.rda$", linkData$Link)) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<p>Alt-click links to download file.</p>\n")
+cata("<p>Click floppy disc icon associated history item to download ")
+cata("all files.</p>\n")
+cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")
+cata("<h4>Additional Information</h4>\n")
+cata("<ul>\n")
+
+# Filtering is only reported when both thresholds are non-zero
+if (!(cpmReq==0 || sampleReq==0)) {
+  ListItem(paste("Genes without more than", cpmReq,
+                 "CPM in at least", sampleReq, "samples are insignificant",
+                 "and filtered out."))
+  filterProp <- round(filteredCount/preFilterCount*100, digits=2)
+  ListItem(paste0(filteredCount, " of ", preFilterCount," (", filterProp,
+                  "%) genes were filtered out for low expression."))
+}
+
+ListItem(normOpt, " was the method used to normalise library sizes.")
+
+ListItem(if (wantWeight) {
+  "Weights were applied to samples."
+} else {
+  "Weights were not applied to samples."
+})
+
+# Describe the significance thresholds used for highlighting in the MA plots;
+# the wording depends on the p-value adjustment method and nothing is written
+# for a method that is not listed here
+sigNote <- switch(pAdjOpt,
+  none = paste0("MA-Plot highlighted genes are significant at p-value ",
+                "of ", pValReq," and exhibit log2-fold-change of at ", 
+                "least ", lfcReq, "."),
+  BH = ,
+  BY = paste0("MA-Plot highlighted genes are significant at FDR ",
+              "of ", pValReq," and exhibit log2-fold-change of at ", 
+              "least ", lfcReq, "."),
+  holm = paste0("MA-Plot highlighted genes are significant at adjusted ",
+                "p-value of ", pValReq,"  by the Holm(1979) ",
+                "method, and exhibit log2-fold-change of at least ", 
+                lfcReq, "."),
+  NULL)
+if (!is.null(sigNote)) {
+  ListItem(sigNote)
+}
+cata("</ul>\n")
+
+cata("<h4>Summary of experimental data:</h4>\n")
+
+cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP*</p>\n")
+
+cata("<table border=\"1\" cellpadding=\"3\">\n")
+
+# Header row: an empty corner cell followed by one heading per factor
+cata("<tr>\n")
+TableItem()
+for (i in names(factors)) {
+  TableHeadItem(i)
+}
+cata("</tr>\n")
+
+# One row per sample showing its level for every factor. The inner loop used
+# to be `for (j in ncol(factors))`, which visits only the last column and so
+# left the rows shorter than the header when there are several factors.
+for (i in seq_len(nrow(factors))) {
+  cata("<tr>\n")
+  TableHeadItem(row.names(factors)[i])
+  for (j in seq_len(ncol(factors))) {
+    TableItem(as.character(unmake.names(factors[i, j])))
+  }
+  cata("</tr>\n")
+}
+cata("</table>")
+
+# Citation texts (cit) and the html links to the user guides (link) that are
+# written to the "Citations" section of the report
+cit <- character()
+link <- character()
+link[1] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/limma/inst/doc/usersguide.pdf",
+                  "\">", "limma User's Guide", "</a>.")
+
+link[2] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
+                  "\">", "edgeR User's Guide", "</a>")
+
+cit[1] <- paste("Please cite the following paper for this tool:")
+
+cit[2] <- paste("Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME,",
+                "Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? ",
+                "Modelling sample and observational level variability improves power ",
+                "in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.")
+
+cit[3] <- paste("Please cite the paper below for the limma software itself.",
+                "Please also try to cite the appropriate methodology articles",
+                "that describe the statistical methods implemented in limma,",
+                "depending on which limma functions you are using. The",
+                "methodology articles are listed in Section 2.1 of the",
+                link[1],
+                "Cite no. 3 only if sample weights were used.")
+# The editor list was garbled ("S. doit,."); restored to the published names
+cit[4] <- paste("Smyth GK (2005). Limma: linear models for microarray data.",
+                "In: 'Bioinformatics and Computational Biology Solutions using",
+                "R and Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R.",
+                "Irizarry, W. Huber (eds), Springer, New York, pages 397-420.")
+cit[5] <- paste("Please cite the first paper for the software itself and the",
+                "other papers for the various original statistical methods",
+                "implemented in edgeR.  See Section 1.2 in the", link[2],
+                "for more detail.")
+cit[6] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a",
+                "Bioconductor package for differential expression analysis",
+                "of digital gene expression data. Bioinformatics 26, 139-140")
+cit[7] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
+                "for assessing differences in tag abundance. Bioinformatics",
+                "23, 2881-2887")
+cit[8] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
+                "negative binomial dispersion, with applications to SAGE data.",
+                "Biostatistics, 9, 321-332")
+cit[9] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
+                "expression analysis of multifactor RNA-Seq experiments with",
+                "respect to biological variation. Nucleic Acids Research 40,",
+                "4288-4297")
+cit[10] <- paste("Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:",
+                "precision weights unlock linear model analysis tools for",
+                "RNA-seq read counts. Genome Biology 15, R29.")
+cit[11] <- paste("Ritchie ME, Diyagama D, Neilson J, van Laar R,", 
+                "Dobrovic A, Holloway A and Smyth GK (2006).",
+                "Empirical array quality weights for microarray data.",
+                "BMC Bioinformatics 7, Article 261.")
+# Citations section: the paper for this tool first, followed by numbered
+# reference lists for limma and for edgeR
+cata("<h3>Citations</h3>\n")
+cata(cit[1], "\n")
+cata("<br>\n")
+cata(cit[2], "\n")
+
+cata("<h4>limma</h4>\n")
+cata(cit[3], "\n")
+cata("<ol>\n")
+for (citIdx in c(4, 10, 11)) {
+  ListItem(cit[citIdx])
+}
+cata("</ol>\n")
+
+cata("<h4>edgeR</h4>\n")
+cata(cit[5], "\n")
+cata("<ol>\n")
+for (citIdx in 6:9) {
+  ListItem(cit[citIdx])
+}
+cata("</ol>\n")
+
+cata("<p>Please report problems or suggestions to: su.s@wehi.edu.au</p>\n")
+
+# Link to the session information file
+for (linkIdx in grep("session_info", linkData$Link)) {
+  HtmlLink(linkData$Link[linkIdx], linkData$Label[linkIdx])
+}
+
+# Borderless table with the start time, end time and run time of the task
+cata("<table border=\"0\">\n")
+cata("<tr>\n")
+TableItem("Task started at:"); TableItem(timeStart)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task ended at:"); TableItem(timeEnd)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task run time:"); TableItem(timeTaken)
+# Close the last row (an opening <tr> tag was written here before)
+cata("</tr>\n")
+cata("</table>\n")
+
+cata("</body>\n")
+cata("</html>")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/limma_voom.xml	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,388 @@
+<tool id="limma_voom" name="limma-voom" version="1.1.1">
+    <description>
+        Differential expression with optional sample weights
+    </description>
+  
+    <requirements>
+        <requirement type="package" version="3.16.5">bioconductor-edger</requirement>
+        <requirement type="package" version="3.30.13">bioconductor-limma</requirement>
+        <requirement type="package" version="1.4.29">r-statmod</requirement>
+        <requirement type="package" version="0.4.1">r-scales</requirement>
+    </requirements>
+
+    <version_command>
+    <![CDATA[
+        echo $(R --version | grep version | grep -v GNU)", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\$otherPkgs\$limma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+    ]]>
+    </version_command>
+  
+    <command detect_errors="exit_code">
+    <![CDATA[
+        Rscript '$__tool_directory__/limma_voom.R'
+            '$counts'
+            
+            #if $anno.annoOpt=='yes':
+              '$geneanno'
+            #else:
+              None
+            #end if
+            
+            '$outReport'
+            '$outReport.files_path'
+            $rdaOption
+            $normalisationOption
+            $weightOption
+            '$contrast'
+
+            #if $filterCPM.filterLowCPM=='yes':
+              '$filterCPM.cpmReq'
+              '$filterCPM.sampleReq'
+            #else:
+              0
+              0
+            #end if
+            
+            #if $testOpt.wantOpt=='yes':
+              '$testOpt.pAdjust'
+              '$testOpt.pVal'
+              '$testOpt.lfc'
+            #else:
+              "BH"
+              0.05
+              0
+            #end if
+            
+            '$factName::$factLevel'
+
+            &&
+            mkdir ./output_dir
+
+            &&
+            mv '$outReport.files_path'/*.tsv output_dir/
+            
+    ]]>             
+    </command>
+   
+    <inputs>
+        <param name="counts" type="data" format="tabular" label="Counts Data"/>
+        
+        <conditional name="anno">
+            <param name="annoOpt" type="select"
+                    label="Use Gene Annotations?" 
+                    help="If an annotation file is provided, annotations will be added to the table of differential expression results to provide descriptions for each gene.">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="yes">
+                <param name="geneanno" type="data" format="tabular" label="Gene Annotations"/>
+            </when>
+            <when value="no" />
+        </conditional>
+
+      <!--*Code commented until solution for multiple factors is found*
+      <repeat name="factors" title="Factors" min="1" max="5" default="1">
+        <param name="factName" type="text" label="Factor Name (No spaces)"
+               help="Eg. Genotype"/>
+          <param name="factLevel" type="text" size="100"
+                 label="Factor Levels (No spaces)"
+                 help="Eg. WT,WT,Mut,Mut,WT"/>
+      </repeat>
+      -->
+      
+        <param name="factName" type="text" label="Factor Name" help="Eg. Genotype."/>
+        <param name="factLevel" type="text" label="Factor Values"
+                help="Eg. WT,WT,WT,Mut,Mut,Mut
+                NOTE: Please ensure that the same levels are typed identically with cases matching."/>     
+        <param name="contrast" type="text" label="Contrasts of interest" help="Eg. Mut-WT,KD-Control"/>
+      
+        <conditional name="filterCPM">
+            <param name="filterLowCPM" type="select" label="Filter Low CPM?"
+                help="Treat genes with very low expression as unexpressed and filter out to speed up computation.">
+                <option value="yes" selected="True">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="cpmReq" type="float" value="0.5" min="0" label="Minimum CPM"/>
+                       
+                <param name="sampleReq" type="integer" value="1" min="0" label="Minimum Samples"
+                    help="Filter out all the genes that do not meet the minimum CPM in at least this many samples."/>
+            </when>          
+            <when value="no"/>         
+        </conditional>
+
+        <param name="weightOption" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Apply sample weights?"
+            help="Apply weights if outliers are present."> 
+        </param>
+      
+        <param name="normalisationOption" type="select" label="Normalisation Method">
+            <option value="TMM">TMM</option>
+            <option value="RLE">RLE</option>
+            <option value="upperquartile">Upperquartile</option>
+            <option value="none">None (Don't normalise)</option>
+        </param>
+
+        <param name="rdaOption" type="boolean" truevalue="yes" falsevalue="no" checked="false" 
+            label="Output RData?"
+            help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report.">      
+        </param>
+                    
+        <conditional name="testOpt">
+            <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
+                help="Enable choices for p-value adjustment method, p-value threshold and log2-fold-change threshold.">
+                <option value="no" selected="True">No</option>
+                <option value="yes">Yes</option>
+            </param>         
+            <when value="yes">
+                <param name="pAdjust" type="select" label="P-Value Adjustment Method.">
+                    <option value="BH">Benjamini and Hochberg (1995)</option>
+                    <option value="BY">Benjamini and Yekutieli (2001)</option>
+                    <option value="holm">Holm (1979)</option>
+                    <option value="none">None</option>
+                </param>             
+                <param name="pVal" type="float" value="0.05" min="0" max="1"
+                    label="Adjusted Threshold"
+                    help="Genes below this threshold are considered significant and highlighted in the MA plot. If either BH(1995) or BY(2001) were selected then this value is a false-discovery-rate control. If Holm(1979) was selected then this is an adjusted p-value for family-wise error rate."/>
+                <param name="lfc" type="float" value="0" min="0"
+                    label="Minimum log2-fold-change Required"
+                    help="Genes above this threshold and below the p-value threshold are considered significant and highlighted in the MA plot."/>
+            </when> 
+            <when value="no"/>       
+        </conditional>
+
+    </inputs>
+  
+    <outputs>
+        <data format="html" name="outReport" label="${tool.name} on ${on_string}: Report" />
+        <collection name="voom_results" type="list" label="${tool.name} on ${on_string}: DE genes">
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular" directory="output_dir" visible="false" />
+        </collection>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="counts" value="matrix.txt" />
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT,WT-Mut" />
+            <param name="normalisationOption" value="TMM" />
+            <output_collection name="voom_results" count="2">
+                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WT.tsv" />
+                <element name="limma-voom_WT-Mut" ftype="tabular" file="limma-voom_WT-Mut.tsv" />
+            </output_collection>    
+            <output name="outReport" >
+                <assert_contents>
+                    <has_text text="Limma-voom Analysis Output" />
+                    <not_has_text text="RData" />
+                </assert_contents>
+            </output>          
+        </test>
+        <test> <!-- Single contrast with a gene annotation file: the expected table carries the annotation columns from anno.txt -->
+            <param name="annoOpt" value="yes" />
+            <param name="geneanno" value="anno.txt" />
+            <param name="counts" value="matrix.txt" />
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT" />
+            <param name="normalisationOption" value="TMM" />
+            <output_collection name="voom_results" >
+                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WTanno.tsv" /> <!-- same element name as the unannotated run; only the expected file differs -->
+            </output_collection>  
+        </test>
+        <test> <!-- rdaOption enabled: only checks that the HTML report mentions RData -->
+            <param name="rdaOption" value="yes" />
+            <param name="counts" value="matrix.txt" />            
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT" />
+            <param name="normalisationOption" value="TMM" />
+            <output name="outReport" >
+                <assert_contents>
+                    <has_text text="RData" /> <!-- counterpart of the not_has_text check in the first test -->
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+  
+    <help>
+<![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Given a matrix of counts (e.g. from featureCounts) and optional information about the genes, this tool
+produces plots and tables useful in the analysis of differential gene 
+expression.
+
+-----
+
+**Inputs**
+
+**Counts Data:**
+A matrix of counts, with rows corresponding to genes
+and columns corresponding to counts for the samples.
+Values must be tab separated, with the first row containing the sample/column
+labels and the first column containing the row/gene labels.
+
+Example:
+
+    ========== ======= ======= ======= ======== ======== ========
+    **GeneID** **WT1** **WT2** **WT3** **Mut1** **Mut2** **Mut3**
+    ---------- ------- ------- ------- -------- -------- --------
+    11287      1699    1528    1601    1463     1441     1495                
+    11298      1905    1744    1834    1345     1291     1346                
+    11302      6       8       7       5        6        5                   
+    11303      2099    1974    2100    1574     1519     1654                
+    11304      356     312     337     361      397      346                 
+    11305      2528    2438    2493    1762     1942     2027                
+    ========== ======= ======= ======= ======== ======== ========
+
+**Gene Annotations:**
+Optional input for gene annotations, this can contain more
+information about the genes than just an ID number. The annotations will
+be available in the differential expression results table.
+
+Example:
+
+    ==========  ==========  ===================================================
+    **GeneID**  **Symbol**  **GeneName**
+    ----------  ----------  ---------------------------------------------------
+    11287       Pzp         pregnancy zone protein
+    11298       Aanat       arylalkylamine N-acetyltransferase
+    11302       Aatk        apoptosis-associated tyrosine kinase
+    11303       Abca1       ATP-binding cassette, sub-family A (ABC1), member 1
+    11304       Abca4       ATP-binding cassette, sub-family A (ABC1), member 4
+    11305       Abca2       ATP-binding cassette, sub-family A (ABC1), member 2
+    ==========  ==========  ===================================================
+
+**Factor Name:**
+The name of the factor being investigated. This tool currently assumes
+that only one factor is of interest.
+
+**Factor Levels:**
+The levels of the factor of interest, this must be entered in the same
+order as the samples to which the levels correspond as listed in the
+columns of the counts matrix. 
+
+The values should be separated by commas, and spaces must not be used.
+
+**Contrasts of Interest:**
+The contrasts you wish to make between levels. 
+
+A common contrast would be a simple difference between two levels: "Mut-WT" 
+represents the difference between the mutant and wild type genotypes.
+
+The values should be separated by commas and spaces must not be used.
+
+**Filter Low CPM:**
+Option to ignore the genes that do not show significant levels of
+expression, this filtering is dependent on two criteria:
+
+    * **Minimum CPM:** This is the counts per million that a gene must have in at
+      least some specified number of samples.
+
+    * **Minimum Samples:** This is the number of samples in which the CPM
+      requirement must be met in order for that gene to be acknowledged.
+
+Only genes that exhibit a CPM greater than the required amount in at least the
+number of samples specified will be used for analysis. Care should be taken to
+ensure that the sample requirement is appropriate. In the case of an experiment
+with two experimental groups each with two members, if there is a change from
+insignificant cpm to significant cpm but the sample requirement is set to 3,
+then this will cause that gene to fail the criteria. When in doubt simply do not
+filter.
+
+
+**Normalisation Method:**
+Option for using different methods to rescale the raw library
+size. For more information, see the calcNormFactors section in the edgeR_ user's
+manual.
+
+**Apply Sample Weights:**
+Option to downweight outlier samples such that their information is still
+used in the statistical analysis but their impact is reduced. Use this
+whenever significant outliers are present. The MDS plotting tool in this package
+is useful for identifying outliers. For more information on this option see Liu et al. (2015).
+
+**Use Advanced Testing Options?:**
+By default error rate for multiple testing is controlled using Benjamini and
+Hochberg's false discovery rate control at a threshold value of 0.05. However
+there are options to change this to custom values.
+
+    * **P-Value Adjustment Method:**
+      Change the multiple testing control method, the options are BH(1995) and 
+      BY(2001) which are both false discovery rate controls. There is also
+      Holm(1979) which is a method for family-wise error rate control.
+    
+    * **Adjusted Threshold:**
+      Set the threshold for the resulting value of the multiple testing control
+      method. Only observations whose statistic falls below this value are
+      considered significant, and thus highlighted in the MA plot.
+      
+    * **Minimum log2-fold-change Required:**
+      In addition to meeting the requirement for the adjusted statistic for
+      multiple testing, the observation must have an absolute log2-fold-change
+      greater than this threshold to be considered significant, thus highlighted 
+      in the MA plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma 
+User's Guide.
+
+    * Smyth GK (2005). Limma: linear models for microarray data. In: 
+      'Bioinformatics and Computational Biology Solutions using R and 
+      Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+      W. Huber (eds), Springer, New York, pages 397-420.
+        
+    * Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:
+      precision weights unlock linear model analysis tools for
+      RNA-seq read counts. Genome Biology 15, R29.
+
+    * Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME, Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? Modelling sample and observational level variability improves power in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.
+
+    * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic, 
+      A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights
+      for microarray data. BMC Bioinformatics 7, Article 261.
+
+.. class:: infomark
+
+edgeR
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See 
+Section 1.2 in the User's Guide for more detail.
+
+    * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+      package for differential expression analysis of digital gene expression 
+      data. Bioinformatics 26, 139-140
+      
+    * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+      differences in tag abundance. Bioinformatics 23, 2881-2887
+      
+    * Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+      binomial dispersion, with applications to SAGE data.
+      Biostatistics, 9, 321-332
+      
+    * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+      of multifactor RNA-Seq experiments with respect to biological variation. 
+      Nucleic Acids Research 40, 4288-4297
+      
+Please report problems or suggestions to: su.s@wehi.edu.au
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+]]>
+    </help>
+    <citations> <!-- Machine-readable citation list for the tool -->
+        <citation type="doi">10.1093/nar/gkv412</citation> <!-- NOTE(review): presumably the Liu et al. (2015) Nucleic Acids Research paper listed in the help text; confirm the DOI resolves to it -->
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/anno.txt	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,7 @@
+EntrezID	Symbol	GeneName	Chr	Length
+11287	Pzp	pregnancy zone protein	6	4681
+11298	Aanat	arylalkylamine N-acetyltransferase	11	1455
+11302	Aatk	apoptosis-associated tyrosine kinase	11	5743
+11303	Abca1	ATP-binding cassette, sub-family A (ABC1), member 1	4	10260
+11304	Abca4	ATP-binding cassette, sub-family A (ABC1), member 4	3	7248
+11305	Abca2	ATP-binding cassette, sub-family A (ABC1), member 2	2	8061
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_Mut-WT.tsv	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,7 @@
+"ID"	"logFC"	"AveExpr"	"t"	"P.Value"	"adj.P.Val"	"B"
+"11304"	0.457332061341026	15.5254133001226	6.50459574633681	9.98720685006039e-07	5.99232411003624e-06	14.0741948485896
+"11287"	0.190749727701785	17.6546448244617	5.09535410066402	3.26518807654125e-05	9.79556422962375e-05	5.46773893802392
+"11298"	-0.138014418336201	17.6747285193431	-3.33168485842331	0.00278753263633162	0.00557506527266324	-1.84301342041449
+"11303"	-0.0558958943606989	17.886791401216	-1.30108531275576	0.205582481502297	0.254491025872973	-6.4924124057801
+"11305"	-0.0606991650996633	18.1585474109909	-1.28203791127299	0.212075854894144	0.254491025872973	-6.42090197700503
+"11302"	-0.0350239682204432	9.78883119065989	-0.236945963165269	0.814709535394087	0.814709535394087	-6.09497670655944
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_Mut-WTanno.tsv	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,7 @@
+"EntrezID"	"Symbol"	"GeneName"	"Chr"	"Length"	"logFC"	"AveExpr"	"t"	"P.Value"	"adj.P.Val"	"B"
+11304	"Abca4"	"ATP-binding cassette, sub-family A (ABC1), member 4"	3	7248	0.457332061341026	15.5254133001226	6.50459574633681	9.98720685006039e-07	5.99232411003624e-06	14.0741948485896
+11287	"Pzp"	"pregnancy zone protein"	6	4681	0.190749727701785	17.6546448244617	5.09535410066402	3.26518807654125e-05	9.79556422962375e-05	5.46773893802392
+11298	"Aanat"	"arylalkylamine N-acetyltransferase"	11	1455	-0.138014418336201	17.6747285193431	-3.33168485842331	0.00278753263633162	0.00557506527266324	-1.84301342041449
+11303	"Abca1"	"ATP-binding cassette, sub-family A (ABC1), member 1"	4	10260	-0.0558958943606989	17.886791401216	-1.30108531275576	0.205582481502297	0.254491025872973	-6.4924124057801
+11305	"Abca2"	"ATP-binding cassette, sub-family A (ABC1), member 2"	2	8061	-0.0606991650996633	18.1585474109909	-1.28203791127299	0.212075854894144	0.254491025872973	-6.42090197700503
+11302	"Aatk"	"apoptosis-associated tyrosine kinase"	11	5743	-0.0350239682204432	9.78883119065989	-0.236945963165269	0.814709535394087	0.814709535394087	-6.09497670655944
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/limma-voom_WT-Mut.tsv	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,7 @@
+"ID"	"logFC"	"AveExpr"	"t"	"P.Value"	"adj.P.Val"	"B"
+"11304"	-0.457332061341026	15.5254133001226	-6.50459574633681	9.98720685006039e-07	5.99232411003624e-06	14.0741948485896
+"11287"	-0.190749727701785	17.6546448244617	-5.09535410066402	3.26518807654125e-05	9.79556422962375e-05	5.46773893802392
+"11298"	0.138014418336201	17.6747285193431	3.33168485842331	0.00278753263633162	0.00557506527266324	-1.84301342041449
+"11303"	0.0558958943606989	17.886791401216	1.30108531275576	0.205582481502297	0.254491025872973	-6.4924124057801
+"11305"	0.0606991650996633	18.1585474109909	1.28203791127299	0.212075854894144	0.254491025872973	-6.42090197700503
+"11302"	0.0350239682204432	9.78883119065989	0.236945963165269	0.814709535394087	0.814709535394087	-6.09497670655944
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/matrix.txt	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,7 @@
+GeneID	WT1	WT2	WT3	Mut1	Mut2	Mut3
+11287	1699	1528	1601	1463	1441	1495
+11298	1905	1744	1834	1345	1291	1346
+11302	6	8	7	5	6	5
+11303	2099	1974	2100	1574	1519	1654
+11304	356	312	337	361	397	346
+11305	2528	2438	2493	1762	1942	2027