Mercurial > repos > shians > voom_rnaseq

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/diffexp.R	Tue Dec 16 14:38:15 2014 +1100
@@ -0,0 +1,644 @@
+# This tool takes in a matrix of feature counts as well as gene annotations and
+# outputs a table of top expressions as well as various plots for differential
+# expression analysis
+#
+# ARGS: 1.countPath       -Path to tab-separated input containing counts
+#       2.annoPath        -Path to tab-separated input containing gene
+#                          annotations ("None" when not supplied)
+#       3.htmlPath        -Path to html file linking to other outputs
+#       4.outPath         -Path to folder to write all output to
+#       5.rdaOpt          -String specifying if RData should be saved
+#       6.normOpt         -String specifying type of normalisation used
+#       7.weightOpt       -String specifying usage of weights
+#       8.contrastData    -String containing contrasts of interest
+#       9.cpmReq          -Float specifying cpm requirement
+#       10.sampleReq      -Integer specifying sample requirement
+#       11.pAdjOpt        -String specifying the p-value adjustment method
+#       12.pValReq        -Float specifying the p-value requirement
+#       13.lfcReq         -Float specifying the log-fold-change requirement
+#       14.factorData     -String containing factor names and values
+#
+# OUT:  Voom Plot
+#       BCV Plot
+#       MA Plot
+#       Top Expression Table
+#       HTML file linking to the outputs
+#
+# Author: Shian Su - registertonysu@gmail.com - Jan 2014
+
+# Record starting time
+timeStart <- as.character(Sys.time())
+
+# Load all required libraries
+library(methods, quietly=TRUE, warn.conflicts=FALSE)
+library(statmod, quietly=TRUE, warn.conflicts=FALSE)
+library(splines, quietly=TRUE, warn.conflicts=FALSE)
+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
+library(limma, quietly=TRUE, warn.conflicts=FALSE)
+library(scales, quietly=TRUE, warn.conflicts=FALSE)
+
+if (packageVersion("limma") < "3.20.1") {
+  stop("Please update 'limma' to version >= 3.20.1 to run this tool")
+}
+
+################################################################################
+### Function Declaration
+################################################################################
+# Remove the spaces surrounding the arithmetic operators (+, -, /, *) of a
+# contrast equation, then trim leading and trailing whitespace. Works
+# element-wise on a character vector.
+sanitiseEquation <- function(equation) {
+  for (op in c("+", "-", "/", "*")) {
+    equation <- gsub(paste0(" *[", op, "] *"), op, equation)
+  }
+  gsub("^\\s+|\\s+$", "", equation)
+}
+
+# Remove the spaces surrounding commas in a group string, then trim leading
+# and trailing whitespace
+sanitiseGroups <- function(string) {
+  noCommaPadding <- gsub(" *[,] *", ",", string)
+  gsub("^\\s+|\\s+$", "", noCommaPadding)
+}
+
+# Replace every literal period in a string with a space
+unmake.names <- function(string) {
+  gsub(".", " ", string, fixed=TRUE)
+}
+
+# Build the path of an output file inside the output folder (reads the global
+# `outPath`)
+makeOut <- function(filename) {
+  paste(outPath, filename, sep="/")
+}
+
+# Prefix a factor name with "factors$" so that it can be used as a term when
+# the design formula is assembled
+pasteListName <- function(string) {
+  paste0("factors$", string)
+}
+
+# cat() with different defaults: writes to the HTML report (`htmlPath`), uses
+# an empty separator and appends instead of overwriting.
+#
+# Args:
+#   ...     objects to write, as for cat()
+#   file    file name, "" for stdout, "|cmd" for a pipe, or a connection
+#   sep, fill, labels, append   as for cat()
+# Returns NULL invisibly, as cat() does.
+cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL,
+                 append = TRUE) {
+  # Delegate to the public cat() instead of calling .Internal(cat(...))
+  # directly: cat() performs the same file/pipe/stdout handling, while the
+  # .Internal entry point is private to base R and not a supported interface.
+  cat(..., file = file, sep = sep, fill = fill, labels = labels,
+      append = append)
+}
+
+# Write a <head> element containing only the given <title> to the HTML report
+HtmlHead <- function(title) {
+  cata("<head>\n",
+       "<title>", title, "</title>\n",
+       "</head>\n")
+}
+
+# Write a hyperlink that opens in a new tab, followed by a line break; the
+# link text defaults to the address itself
+HtmlLink <- function(address, label=address) {
+  cata(
+    "<a href=\"", address, "\" target=\"_blank\">",
+    label,
+    "</a><br />\n"
+  )
+}
+
+# Write an <img> element; the alt text defaults to the image source and the
+# image is 600x600 unless other dimensions are given
+HtmlImage <- function(source, label=source, height=600, width=600) {
+  cata("<img src=\"", source, "\" alt=\"", label,
+       "\" height=\"", height, "\" width=\"", width, "\"/>\n")
+}
+
+# Write the arguments, concatenated, as a single <li> element
+ListItem <- function(...) {
+  openTag <- "<li>"
+  closeTag <- "</li>\n"
+  cata(openTag, ..., closeTag)
+}
+
+# Write the arguments, concatenated, as a single <td> table cell
+TableItem <- function(...) {
+  openTag <- "<td>"
+  closeTag <- "</td>\n"
+  cata(openTag, ..., closeTag)
+}
+
+# Write the arguments, concatenated, as a single <th> table header cell
+TableHeadItem <- function(...) {
+  openTag <- "<th>"
+  closeTag <- "</th>\n"
+  cata(openTag, ..., closeTag)
+}
+
+################################################################################
+### Input Processing
+################################################################################
+
+# Collects arguments from command line
+argv <- commandArgs(TRUE)
+
+# 13 fixed arguments plus at least one factor specification are required.
+# Fail early: with fewer arguments 14:length(argv) would count downwards and
+# read NA values past the end of argv.
+if (length(argv) < 14) {
+  stop("Expected at least 14 arguments but received ", length(argv),
+       call.=FALSE)
+}
+
+# Grab arguments
+countPath <- as.character(argv[1])
+annoPath <- as.character(argv[2])
+htmlPath <- as.character(argv[3])
+outPath <- as.character(argv[4])
+rdaOpt <- as.character(argv[5])
+normOpt <- as.character(argv[6])
+weightOpt <- as.character(argv[7])
+contrastData <- as.character(argv[8])
+cpmReq <- as.numeric(argv[9])
+sampleReq <- as.numeric(argv[10])
+pAdjOpt <- as.character(argv[11])
+pValReq <- as.numeric(argv[12])
+lfcReq <- as.numeric(argv[13])
+# Every remaining argument describes one factor; each is split on "::" into
+# its name and its comma-separated levels and stacked as a row of factorData
+factorData <- list()
+for (i in 14:length(argv)) {
+  newFact <- unlist(strsplit(as.character(argv[i]), split="::"))
+  factorData <- rbind(factorData, newFact)
+} # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,...
+
+# Process arguments: turn the option strings into logical flags
+wantWeight <- if (weightOpt=="yes") TRUE else FALSE
+
+wantRda <- if (rdaOpt=="yes") TRUE else FALSE
+
+# The literal string "None" in place of a path means no annotation was given
+haveAnno <- if (annoPath=="None") FALSE else TRUE
+
+# Set the row names to be the name of the factor and delete the first column
+# (each row of factorData holds one factor: its name, then its level string)
+row.names(factorData) <- factorData[, 1]
+factorData <- factorData[, -1]
+# Strip whitespace around the commas, split each level string into its
+# individual levels, then make the level names syntactically valid R names
+factorData <- sapply(factorData, sanitiseGroups)
+factorData <- sapply(factorData, strsplit, split=",")
+factorData <- sapply(factorData, make.names)
+
+# Transform factor data into data frame of R factor objects
+# NOTE(review): the columns are only factors where data.frame() converts
+# strings to factors by default (R < 4.0) - confirm on the target R version
+factors <- data.frame(factorData)
+
+#Create output directory
+dir.create(outPath, showWarnings=FALSE)
+
+# Split up contrasts separated by comma into a vector then sanitise; spaces
+# remaining inside a contrast are replaced by periods
+contrastData <- unlist(strsplit(contrastData, split=","))
+contrastData <- sanitiseEquation(contrastData)
+contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+
+# Every later step iterates over the contrasts, so fail early when none were
+# given instead of indexing past the end of an empty vector
+if (length(contrastData) == 0) {
+  stop("No contrasts of interest were supplied", call.=FALSE)
+}
+
+bcvOutPdf <- makeOut("bcvplot.pdf")
+bcvOutPng <- makeOut("bcvplot.png")
+mdsOutPdf <- makeOut("mdsplot.pdf")
+mdsOutPng <- makeOut("mdsplot.png")
+voomOutPdf <- makeOut("voomplot.pdf")
+voomOutPng <- makeOut("voomplot.png")
+# Output paths for each contrast, one vector element per contrast
+maOutPdf <- makeOut(paste0("maplot(", contrastData, ").pdf"))
+maOutPng <- makeOut(paste0("maplot(", contrastData, ").png"))
+topOut <- makeOut(paste0("toptab(", contrastData, ").tsv"))
+rdaOut <- makeOut("RData.rda")
+sessionOut <- makeOut("session_info.txt")
+
+# Initialise data for html links and images, data frame with columns Label and
+# Link
+linkData <- data.frame(Label=character(), Link=character(),
+                       stringsAsFactors=FALSE)
+imageData <- data.frame(Label=character(), Link=character(),
+                        stringsAsFactors=FALSE)
+
+# Initialise vectors for storage of up/down/neutral regulated counts
+upCount <- numeric()
+downCount <- numeric()
+flatCount <- numeric()
+
+# Read the tab-separated counts table; the GeneID column becomes the row names
+# and is then removed so that only sample columns remain
+counts <- read.table(countPath, header=TRUE, sep="\t")
+row.names(counts) <- counts$GeneID
+counts <- counts[ , colnames(counts)!="GeneID"]
+countsRows <- nrow(counts)
+if (haveAnno) {
+  geneanno <- read.table(annoPath, header=TRUE, sep="\t")
+}
+
+################################################################################
+### Data Processing
+################################################################################
+
+# Collect counts and gene information in one list; without an annotation file
+# the gene information is just the gene identifiers
+data <- list(counts=counts)
+data$genes <- if (haveAnno) geneanno else data.frame(GeneID=row.names(counts))
+
+# Keep only genes whose cpm exceeds cpmReq in at least sampleReq samples
+# NOTE(review): the same row mask is applied to counts and genes, which
+# presumes the annotation lists genes in the same order as the counts - confirm
+preFilterCount <- nrow(data$counts)
+sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
+data$counts <- data$counts[sel, ]
+data$genes <- data$genes[sel, ]
+postFilterCount <- nrow(data$counts)
+filteredCount <- preFilterCount-postFilterCount
+
+# Creating naming data
+samplenames <- colnames(data$counts)
+sampleanno <- data.frame("sampleID"=samplenames, factors)
+
+# Generating the DGEList object "data"
+data$samples <- sampleanno
+data$samples$lib.size <- colSums(data$counts)
+data$samples$norm.factors <- 1
+row.names(data$samples) <- colnames(data$counts)
+data <- new("DGEList", data)
+
+# Build a no-intercept design formula with one "factors$<name>" term per
+# factor (joined in one step rather than by indexing 1:length(factorList))
+factorList <- sapply(names(factors), pasteListName)
+formula <- formula(paste(c("~0", factorList), collapse="+"))
+design <- model.matrix(formula)
+# Strip the "factors$<name>" prefix so that design columns are plain level
+# names that can be referenced in the contrasts
+for (term in factorList) {
+  colnames(design) <- gsub(term, "", colnames(design), fixed=TRUE)
+}
+
+# Calculating normalising factor, estimating dispersion
+data <- calcNormFactors(data, method=normOpt)
+#data <- estimateDisp(data, design=design, robust=TRUE)
+
+# Generate contrasts information
+contrasts <- makeContrasts(contrasts=contrastData, levels=design)
+
+# Name rows of factors according to their sample
+row.names(factors) <- names(data$counts)
+
+################################################################################
+### Data Output
+################################################################################
+
+# BCV Plot
+#png(bcvOutPng, width=600, height=600)
+#plotBCV(data, main="BCV Plot")
+#imageData[1, ] <- c("BCV Plot", "bcvplot.png")
+#invisible(dev.off())
+
+#pdf(bcvOutPdf)
+#plotBCV(data, main="BCV Plot")
+#invisible(dev.off())
+
+# Run the voom transformation (with or without sample quality weights) and fit
+# the linear model. The transformation is called once per graphics device
+# because plot=TRUE draws onto whichever device is currently open; the result
+# of the second call is the one left in vData.
+if (wantWeight) {
+  # Creating voom data object and plot
+  png(voomOutPng, width=1000, height=600)
+  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
+  imageData[1, ] <- c("Voom Plot", "voomplot.png")
+  invisible(dev.off())
+
+  # Same plot again, written to the PDF device
+  pdf(voomOutPdf, width=14)
+  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
+  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
+  invisible(dev.off())
+
+  # Generating fit data and top table with weights
+  wts <- vData$weights
+  voomFit <- lmFit(vData, design, weights=wts)
+
+} else {
+  # Creating voom data object and plot
+  png(voomOutPng, width=600, height=600)
+  vData <- voom(data, design=design, plot=TRUE)
+  imageData[1, ] <- c("Voom Plot", "voomplot.png")
+  invisible(dev.off())
+
+  # Same plot again, written to the PDF device
+  pdf(voomOutPdf)
+  vData <- voom(data, design=design, plot=TRUE)
+  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
+  invisible(dev.off())
+
+  # Generate voom fit
+  voomFit <- lmFit(vData, design)
+
+}
+
+# Fit linear model and estimate dispersion with eBayes
+voomFit <- contrasts.fit(voomFit, contrasts)
+voomFit <- eBayes(voomFit)
+
+# Plot MDS
+labels <- names(counts)
+png(mdsOutPng, width=600, height=600)
+# Currently only using a single factor: samples are coloured by the level of
+# the first factor. factor() makes the level-to-number mapping work even when
+# the column is stored as plain character, where as.numeric() alone gives NA.
+plotMDS(vData, labels=labels, col=as.numeric(factor(factors[, 1])), cex=0.8)
+# This image is the MDS plot; it was previously mislabelled "Voom Plot"
+imgName <- "MDS Plot"
+imgAddr <- "mdsplot.png"
+imageData <- rbind(imageData, c(imgName, imgAddr))
+invisible(dev.off())
+
+pdf(mdsOutPdf)
+plotMDS(vData, labels=labels, cex=0.5)
+linkName <- paste0("MDS Plot (.pdf)")
+linkAddr <- paste0("mdsplot.pdf")
+linkData <- rbind(linkData, c(linkName, linkAddr))
+invisible(dev.off())
+
+
+# For every contrast: classify genes as up/down/not significant, write the
+# top table and draw the MA plot as both PDF and PNG
+for (i in seq_along(contrastData)) {
+
+  status <- decideTests(voomFit[, i], adjust.method=pAdjOpt, p.value=pValReq,
+                        lfc=lfcReq)
+
+  # Collect counts for differential expression. Counting the 1/-1/0 codes
+  # directly avoids relying on the row names of summary(status), which are
+  # not the same in every limma version.
+  upCount[i] <- sum(status == 1, na.rm=TRUE)
+  downCount[i] <- sum(status == -1, na.rm=TRUE)
+  flatCount[i] <- sum(status == 0, na.rm=TRUE)
+
+  # Write top expressions table
+  top <- topTable(voomFit, coef=i, number=Inf, sort.by="P")
+  write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
+
+  linkName <- paste0("Top Differential Expressions(", contrastData[i],
+                     ") (.tsv)")
+  linkAddr <- paste0("toptab(", contrastData[i], ").tsv")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+
+  # Plot MA (log ratios vs mean average) using limma package on weighted data
+  pdf(maOutPdf[i])
+  limma::plotMA(voomFit, status=status, coef=i,
+                main=paste("MA Plot:", unmake.names(contrastData[i])),
+                col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
+                xlab="Average Expression", ylab="logFC")
+
+  abline(h=0, col="grey", lty=2)
+
+  linkName <- paste0("MA Plot(", contrastData[i], ")", " (.pdf)")
+  linkAddr <- paste0("maplot(", contrastData[i], ").pdf")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+  invisible(dev.off())
+
+  # Same plot again as a PNG for embedding in the HTML report
+  png(maOutPng[i], height=600, width=600)
+  limma::plotMA(voomFit, status=status, coef=i,
+                main=paste("MA Plot:", unmake.names(contrastData[i])),
+                col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
+                xlab="Average Expression", ylab="logFC")
+
+  abline(h=0, col="grey", lty=2)
+
+  imgName <- paste0("MA Plot(", contrastData[i], ")")
+  imgAddr <- paste0("maplot(", contrastData[i], ").png")
+  imageData <- rbind(imageData, c(imgName, imgAddr))
+  invisible(dev.off())
+}
+# One row per contrast with the number of up/flat/down genes
+sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
+row.names(sigDiff) <- contrastData
+
+# Save relevant items as rda object. `wts` only exists when sample weights
+# were used; `status` and `top` hold the values of the last contrast only.
+if (wantRda) {
+  save(list=c("data", "status", "vData", "labels", "factors",
+              if (wantWeight) "wts",
+              "voomFit", "top", "contrasts", "design"),
+       file=rdaOut, ascii=TRUE)
+  linkData <- rbind(linkData, c("RData (.rda)", "RData.rda"))
+}
+
+# Record session info
+writeLines(capture.output(sessionInfo()), sessionOut)
+linkData <- rbind(linkData, c("Session Info", "session_info.txt"))
+
+# Record ending time and calculate total run time; the printed difftime is
+# captured and its "Time difference of " prefix removed
+timeEnd <- as.character(Sys.time())
+timeTaken <- gsub("Time difference of ", "",
+                  capture.output(round(difftime(timeEnd,timeStart), digits=3)),
+                  fixed=TRUE)
+################################################################################
+### HTML Generation
+################################################################################
+
+# Start the report from an empty file (cata() appends from here on)
+cat("", file=htmlPath)
+
+cata("<html>\n",
+     "<body>\n",
+     "<h3>Limma Voom Analysis Output:</h3>\n",
+     "PDF copies of JPEGS available in 'Plots' section.<br />\n")
+# First image is the voom plot, which is drawn wider when weights were used
+HtmlImage(imageData$Link[1], imageData$Label[1],
+          width=if (wantWeight) 1000 else 600)
+
+# Remaining images at the default size
+for (i in 2:nrow(imageData)) {
+  HtmlImage(imageData$Link[i], imageData$Label[i])
+}
+
+cata("<h4>Differential Expression Counts:</h4>\n")
+
+cata("<table border=\"1\" cellpadding=\"4\">\n")
+# Header row: an empty corner cell followed by the column names of sigDiff
+cata("<tr>\n")
+TableItem()
+for (i in colnames(sigDiff)) {
+  TableHeadItem(i)
+}
+cata("</tr>\n")
+# One row per contrast; seq_len() keeps the loops empty for an empty table
+for (i in seq_len(nrow(sigDiff))) {
+  cata("<tr>\n")
+  TableHeadItem(unmake.names(row.names(sigDiff)[i]))
+  for (j in seq_len(ncol(sigDiff))) {
+    TableItem(as.character(sigDiff[i, j]))
+  }
+  cata("</tr>\n")
+}
+cata("</table>")
+
+# Links are grouped by file extension. The patterns escape the dot and anchor
+# at the end of the name, so a contrast name that merely contains e.g. "xpdf"
+# or ".tsv" cannot put a link into the wrong section.
+cata("<h4>Plots:</h4>\n")
+for (i in grep("\\.pdf$", linkData$Link)) {
+  HtmlLink(linkData$Link[i], linkData$Label[i])
+}
+
+cata("<h4>Tables:</h4>\n")
+for (i in grep("\\.tsv$", linkData$Link)) {
+  HtmlLink(linkData$Link[i], linkData$Label[i])
+}
+
+if (wantRda) {
+  cata("<h4>R Data Object:</h4>\n")
+  for (i in grep("\\.rda$", linkData$Link)) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<p>Alt-click links to download file.</p>\n")
+cata("<p>Click floppy disc icon associated history item to download ",
+     "all files.</p>\n")
+cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")
+
+cata("<h4>Additional Information</h4>\n")
+cata("<ul>\n")
+# The filtering summary is skipped when either threshold is zero
+if (!(cpmReq==0 || sampleReq==0)) {
+  ListItem(paste("Genes without more than", cpmReq,
+                 "CPM in at least", sampleReq, "samples are insignificant",
+                 "and filtered out."))
+  ListItem(paste0(filteredCount, " of ", preFilterCount," (",
+                  round(filteredCount/preFilterCount*100, digits=2),
+                  "%) genes were filtered out for low expression."))
+}
+ListItem(normOpt, " was the method used to normalise library sizes.")
+ListItem(if (wantWeight) {
+  "Weights were applied to samples."
+} else {
+  "Weights were not applied to samples."
+})
+# Describe the significance threshold in terms of the adjustment method used;
+# methods other than none/BH/BY/holm produce no entry
+if (pAdjOpt=="none") {
+  ListItem(paste0("MA-Plot highlighted genes are significant at p-value ",
+                  "of ", pValReq," and exhibit log2-fold-change of at ",
+                  "least ", lfcReq, "."))
+} else if (pAdjOpt %in% c("BH", "BY")) {
+  ListItem(paste0("MA-Plot highlighted genes are significant at FDR ",
+                  "of ", pValReq," and exhibit log2-fold-change of at ",
+                  "least ", lfcReq, "."))
+} else if (pAdjOpt=="holm") {
+  ListItem(paste0("MA-Plot highlighted genes are significant at adjusted ",
+                  "p-value of ", pValReq,"  by the Holm(1979) ",
+                  "method, and exhibit log2-fold-change of at least ",
+                  lfcReq, "."))
+}
+cata("</ul>\n")
+
+cata("<h4>Summary of experimental data:</h4>\n")
+
+cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP*</p>\n")
+
+cata("<table border=\"1\" cellpadding=\"3\">\n")
+# Header row: an empty corner cell followed by one heading per factor
+cata("<tr>\n")
+TableItem()
+for (i in names(factors)) {
+  TableHeadItem(i)
+}
+cata("</tr>\n")
+
+# One row per sample with one cell per factor. The inner loop must run over
+# every column index; iterating over ncol(factors) itself would visit only the
+# last column and leave the row short whenever there is more than one factor.
+for (i in seq_len(nrow(factors))) {
+  cata("<tr>\n")
+  TableHeadItem(row.names(factors)[i])
+  for (j in seq_len(ncol(factors))) {
+    TableItem(as.character(unmake.names(factors[i, j])))
+  }
+  cata("</tr>\n")
+}
+cata("</table>")
+
+# Citation texts (cit) and user-guide hyperlinks (link) shown at the end of
+# the report. cit[1] and cit[3] are introductory paragraphs; the remaining
+# entries are individual references.
+cit <- character()
+link <- character()
+link[1] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/limma/inst/doc/usersguide.pdf",
+                  "\">", "limma User's Guide", "</a>.")
+
+link[2] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
+                  "\">", "edgeR User's Guide", "</a>")
+
+cit[1] <- paste("Please cite the paper below for the limma software itself.",
+                "Please also try to cite the appropriate methodology articles",
+                "that describe the statistical methods implemented in limma,",
+                "depending on which limma functions you are using. The",
+                "methodology articles are listed in Section 2.1 of the",
+                link[1],
+                "Cite no. 3 only if sample weights were used.")
+# Editor list repaired: it previously read "S. doit,. Irizarry"
+cit[2] <- paste("Smyth, GK (2005). Limma: linear models for microarray data.",
+                "In: 'Bioinformatics and Computational Biology Solutions using",
+                "R and Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R.",
+                "Irizarry, W. Huber (eds), Springer, New York, pages 397-420.")
+cit[3] <- paste("Please cite the first paper for the software itself and the",
+                "other papers for the various original statistical methods",
+                "implemented in edgeR.  See Section 1.2 in the", link[2],
+                "for more detail.")
+cit[4] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a",
+                "Bioconductor package for differential expression analysis",
+                "of digital gene expression data. Bioinformatics 26, 139-140")
+cit[5] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
+                "for assessing differences in tag abundance. Bioinformatics",
+                "23, 2881-2887")
+cit[6] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
+                "negative binomial dispersion, with applications to SAGE data.",
+                "Biostatistics, 9, 321-332")
+cit[7] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
+                "expression analysis of multifactor RNA-Seq experiments with",
+                "respect to biological variation. Nucleic Acids Research 40,",
+                "4288-4297")
+cit[8] <- paste("Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:",
+                "precision weights unlock linear model analysis tools for",
+                "RNA-seq read counts. Genome Biology 15, R29.")
+cit[9] <- paste("Ritchie, M. E., Diyagama, D., Neilson, J., van Laar,",
+                "R., Dobrovic, A., Holloway, A., and Smyth, G. K. (2006).",
+                "Empirical array quality weights for microarray data.",
+                "BMC Bioinformatics 7, Article 261.")
+# Write the citation lists, the session-info link and the timing table, then
+# close the document
+cata("<h3>Citations</h3>\n")
+
+cata("<h4>limma</h4>\n")
+cata(cit[1], "\n")
+cata("<ol>\n")
+ListItem(cit[2])
+ListItem(cit[8])
+ListItem(cit[9])
+cata("</ol>\n")
+
+cata("<h4>edgeR</h4>\n")
+cata(cit[3], "\n")
+cata("<ol>\n")
+ListItem(cit[4])
+ListItem(cit[5])
+ListItem(cit[6])
+ListItem(cit[7])
+cata("</ol>\n")
+
+cata("<p>Report problems to: su.s@wehi.edu.au</p>\n")
+
+for (i in grep("session_info", linkData$Link, fixed=TRUE)) {
+  HtmlLink(linkData$Link[i], linkData$Label[i])
+}
+
+cata("<table border=\"0\">\n")
+cata("<tr>\n")
+TableItem("Task started at:"); TableItem(timeStart)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task ended at:"); TableItem(timeEnd)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task run time:"); TableItem(timeTaken)
+# Close the last row (this was previously a second opening <tr> tag)
+cata("</tr>\n")
+cata("</table>\n")
+
+cata("</body>\n")
+cata("</html>")
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/diffexp.xml	Tue Dec 16 14:38:15 2014 +1100
@@ -0,0 +1,372 @@
+<tool id="diffexp" name="Voom Rnaseq" version="1.1.0">
+  <description>
+      Perform differential expression analysis using pipeline based on the voom
+      function of the limma bioconductor package. This tool takes a count matrix
+      (tab separated) as input and produces a HTML report as output.
+  </description>
+
+  <requirements>
+    <requirement type="R-module" version="3.5.27">edgeR</requirement>
+    <requirement type="R-module" version="3.18.13">limma</requirement>
+  </requirements>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Tool exception" />
+  </stdio>
+
+  <command interpreter="Rscript">
+    diffexp.R $counts
+
+              #if $anno.annoOpt=="yes":
+                $geneanno
+              #else:
+                None
+              #end if
+
+              $outFile
+              $outFile.files_path
+              "no" <!-- Disabled Rda option -->
+              $normalisationOption
+              $weightCond.weightOption
+              "$contrast"
+
+              #if $filterCPM.filterLowCPM=="yes":
+                $filterCPM.cpmReq
+                $filterCPM.sampleReq
+              #else:
+                0
+                0
+              #end if
+
+              #if $testOpt.wantOpt=="yes":
+              	"$testOpt.pAdjust"
+              	$testOpt.pVal
+              	$testOpt.lfc
+              #else:
+                "BH"
+                0.05
+                0
+              #end if
+
+              <!--*Code commented until solution for multiple factors is found*
+              #for $i, $fct in enumerate($factors):
+                $fct.factName::$fct.factLevel
+              #end for
+              -->
+              "$factName::$factLevel"
+
+  </command>
+
+  <inputs>
+    <param name="counts" type="data" format="tabular" label="Counts Data"/>
+
+    <conditional name="anno">
+      <param name="annoOpt" type="select" label="Use Gene Annotations?"
+             help="Annotations will be added to table of top differential
+                   expressions to provide descriptions for each gene.">
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+      </param>
+
+      <when value="yes">
+        <param name="geneanno" type="data" format="tabular"
+               label="Gene Annotations"/>
+        </when>
+    </conditional>
+
+    <!--*Code commented until solution for multiple factors is found*
+    <repeat name="factors" title="Factors" min="1" max="5" default="1">
+      <param name="factName" type="text" label="Factor Name (No spaces)"
+             help="Eg. Genotype"/>
+        <param name="factLevel" type="text" size="100"
+               label="Factor Levels (No spaces)"
+               help="Eg. WT,WT,Mut,Mut,WT"/>
+    </repeat>
+    -->
+
+    <param name="factName" type="text" label="Factor Name"
+           help="Eg. Genotype."/>
+    <param name="factLevel" type="text" size="100"
+           label="Factor Values"
+           help="Eg. WT,WT,Mut,Mut,WT... NOTE: Please ensure that the same
+           		 levels are typed identically when repeated, with all cases
+           		 matching."/>
+
+    <param name="contrast" type="text" size="30"
+           label="Contrasts of interest"
+           help="Eg. Mut-WT,KD-Control."/>
+
+    <conditional name="filterCPM">
+      <param name="filterLowCPM" type="select" label="Filter Low CPM?"
+       help="Treat genes with very low expression as unexpressed and
+       			 filter out to speed up computation.">
+        <option value="yes" selected="True">Yes</option>
+        <option value="no">No</option>
+      </param>
+
+        <when value="yes">
+          <param name="cpmReq" type="float" value="0.5" min="0"
+                 label="Minimum CPM"/>
+
+          <param name="sampleReq" type="integer" value="1" min="0"
+                 label="Minimum Samples"
+                 help="Filter out all the genes that do not meet the minimum
+                       CPM in at least this many samples."/>
+        </when>
+
+        <when value="no"/>
+
+    </conditional>
+
+    <conditional name="weightCond">
+      <param name="weightOption" type="select" label="Apply sample weights?"
+             display="radio" help="Apply weights if outliers are present.">
+
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+
+      </param>
+    </conditional>
+
+    <param name="normalisationOption" type="select"
+           label="Normalisation Method">
+
+      <option value="TMM">TMM</option>
+      <option value="RLE">RLE</option>
+      <option value="upperquartile">Upperquartile</option>
+      <option value="none">None (Don't normalise)</option>
+
+    </param>
+
+    <conditional name="testOpt">
+      <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
+       help="Enable choices for p-value adjustment method, p-value threshold
+             and log2-fold-change threshold.">
+        <option value="no" selected="True">No</option>
+      	<option value="yes">Yes</option>
+      </param>
+
+        <when value="yes">
+        	<param name="pAdjust" type="select" label="P-Value Adjustment Method.">
+          	<option value="BH">Benjamini and Hochberg (1995)</option>
+            <option value="BY">Benjamini and Yekutieli (2001)</option>
+            <option value="holm">Holm (1979)</option>
+            <option value="none">None</option>
+          </param>
+
+          <param name="pVal" type="float" value="0.05" min="0" max="1"
+    			 		   label="Adjusted Threshold"
+    			 		   help="Genes below this threshold are considered significant and
+    			 		         highlighted in the MA plot. If either BH(1995) or
+    			 		         BY(2001) were selected then this value is a
+    			 		         false-discovery-rate control. If Holm(1979) was selected
+    			 		         then this is an adjusted p-value for family-wise error
+    			 		         rate."/>
+
+          <param name="lfc" type="float" value="0" min="0"
+          			 label="Minimum log2-fold-change Required"
+          			 help="Genes above this threshold and below the p-value
+          			       threshold are considered significant and highlighted
+          			       in the MA plot."/>
+        </when>
+
+        <when value="no"/>
+
+    </conditional>
+
+ <!--    <conditional name="wantRda">
+      <param name="rdaOption" type="select" label="Output RData?"
+             display="radio"
+             help="Output all the data R used to construct the plots,
+                   can be loaded into R.">
+
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+
+      </param>
+    </conditional> -->
+  </inputs>
+
+  <outputs>
+      <data format="html" name="outFile" label="Voom Output"/>
+  </outputs>
+
+
+<help>
+.. class:: infomark
+
+**What it does**
+
+Given a matrix of counts and optional information about the genes, this tool
+produces plots and tables useful in the analysis of differential gene
+expression.
+
+.. class:: warningmark
+
+This tool is dependent on the R packages limma_ and edgeR_ as a part of the
+bioconductor project. Please ensure that these packages are installed on the
+server running this tool.
+
+-----
+
+**Counts Data:**
+A matrix of expression level with rows corresponding to particular genes
+and columns corresponding to the feature count in particular samples.
+Values must be tab separated and there must be a row for the sample/column
+labels and a column for the row/gene labels.
+
+Example::
+
+	"GeneID"  "Smpl1"	"Smpl2"	"Smpl3"	"Smpl4"	"Smpl5"
+	"27395"	1699	1528	1463	1441	1495
+	"18777"	1905	1744	1345	1291	1346
+	"15037"	6	8	4	5	5
+	"21399"	2099	1974	1574	1519	1654
+	"58175"	356	312	347	361	346
+	"10866"	2528	2438	1762	1942	2027
+	"12421"	2182	2005	1786	1799	1858
+	"24069"	3	4	2	3	3
+	"31926"	1337	1380	1004	1102	1000
+	"71096"	0	0	2	1	6
+	"59014"	1466	1426	1296	1097	1175
+	...
+
+**Gene Annotations:**
+Optional input for gene annotations, this can contain more
+information about the genes than just an ID number. The annotations will
+be available in the top differential expression table.
+
+Example::
+
+	"GeneID"	"Length"	"EntrezID"	"Symbols"	"GeneName"	"Chr"
+	"11287"	"11287"	4681	"11287"	"Pzp"	"pregnancy zone protein"	"6"
+	"11298"	"11298"	1455	"11298"	"Aanat"	"arylalkylamine N-acetyltransferase"	"11"
+	"11302"	"11302"	5743	"11302"	"Aatk"	"apoptosis-associated tyrosine kinase"	"11"
+	"11303"	"11303"	10260	"11303"	"Abca1"	"ATP-binding cassette, sub-family A (ABC1), member 1"	"4"
+	"11304"	"11304"	7248	"11304"	"Abca4"	"ATP-binding cassette, sub-family A (ABC1), member 4"	"3"
+	"11305"	"11305"	8061	"11305"	"Abca2"	"ATP-binding cassette, sub-family A (ABC1), member 2"	"2"
+	...
+
+**Factor Name:**
+The name of the factor being investigated. This tool currently assumes
+that only one factor is of interest.
+
+**Factor Levels:**
+The levels of the factor of interest, this must be entered in the same
+order as the samples to which the levels correspond as listed in the
+columns of the counts matrix.
+
+The values should be separated by commas, and spaces must not be used.
+
+**Contrasts of Interest:**
+The contrasts you wish to make between levels.
+
+Common contrasts would be a simple difference between two levels: "Mut-WT"
+represents the difference between the mutant and wild type genotypes.
+
+The values should be separated by commas and spaces must not be used.
+
+**Filter Low CPM:**
+Option to ignore the genes that do not show significant levels of
+expression. This filtering is dependent on two criteria:
+
+ * **Minimum CPM:** This is the counts per million that a gene must have in at
+   least some specified number of samples.
+
+ * **Minimum Samples:** This is the number of samples in which the CPM
+   requirement must be met in order for that gene to be acknowledged.
+
+Only genes that exhibit a CPM greater than the required amount in at least the
+number of samples specified will be used for analysis. Care should be taken to
+ensure that the sample requirement is appropriate. In the case of an experiment
+with two experimental groups each with two members, if there is a change from
+insignificant CPM to significant CPM but the sample requirement is set to 3,
+then this will cause that gene to fail the criteria. When in doubt simply do not
+filter.
+
+
+**Normalisation Method:**
+Option for using different methods to rescale the raw library
+size. For more information, see the calcNormFactors section in the edgeR_
+user's manual.
+
+**Apply Sample Weights:**
+Option to downweight outlier samples such that their information is still
+used in the statistical analysis but their impact is reduced. Use this
+whenever significant outliers are present. The MDS plotting tool in this package
+is useful for identifying outliers.
+
+**Use Advanced Testing Options?:**
+By default error rate for multiple testing is controlled using Benjamini and
+Hochberg's false discovery rate control at a threshold value of 0.05. However
+there are options to change this to custom values.
+
+  * **P-Value Adjustment Method:**
+    Change the multiple testing control method, the options are BH(1995) and
+    BY(2001) which are both false discovery rate controls. There is also
+    Holm(1979) which is a method for family-wise error rate control.
+
+  * **Adjusted Threshold:**
+    Set the threshold for the resulting value of the multiple testing control
+    method. Only observations whose statistic falls below this value are
+    considered significant, thus highlighted in the MA plot.
+
+  * **Minimum log2-fold-change Required:**
+    In addition to meeting the requirement for the adjusted statistic for
+    multiple testing, the observation must have an absolute log2-fold-change
+    greater than this threshold to be considered significant, thus highlighted
+    in the MA plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma
+User's Guide.
+
+  * Smyth, GK (2005). Limma: linear models for microarray data. In:
+    'Bioinformatics and Computational Biology Solutions using R and
+    Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
+    W. Huber (eds), Springer, New York, pages 397-420.
+
+  * Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:
+    precision weights unlock linear model analysis tools for
+    RNA-seq read counts. Genome Biology 15, R29.
+
+.. class:: infomark
+
+edgeR
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See
+Section 1.2 in the User's Guide for more detail.
+
+	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
+	  package for differential expression analysis of digital gene expression
+	  data. Bioinformatics 26, 139-140
+
+	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
+	  differences in tag abundance. Bioinformatics 23, 2881-2887
+
+	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative
+	  binomial dispersion, with applications to SAGE data.
+	  Biostatistics, 9, 321-332
+
+	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
+	  of multifactor RNA-Seq experiments with respect to biological variation.
+	  Nucleic Acids Research 40, 4288-4297
+
+Report problems to: su.s@wehi.edu.au
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+
+</help>
+</tool>