view diffexp.R @ 0:7a80e9ec63cb

- Initial commit
author shian_su <registertonysu@gmail.com>
date Tue, 16 Dec 2014 14:38:15 +1100
parents
children b2fe55fd0651
line wrap: on
line source

# This tool takes in a matrix of feature counts as well as gene annotations and
# outputs a table of top expressions as well as various plots for differential
# expression analysis
#
# ARGS: 1.countPath       -Path to RData input containing counts
#       2.annoPath        -Path to RData input containing gene annotations
#       3.htmlPath        -Path to html file linking to other outputs
#       4.outPath         -Path to folder to write all output to
#       5.rdaOpt          -String specifying if RData should be saved
#       6.normOpt         -String specifying type of normalisation used
#       7.weightOpt       -String specifying usage of weights
#       8.contrastData    -String containing contrasts of interest
#       9.cpmReq          -Float specifying cpm requirement
#       10.sampleReq      -Integer specifying cpm requirement
#       11.pAdjOpt        -String specifying the p-value adjustment method
#       12.pValReq        -Float specifying the p-value requirement
#       13.lfcReq         -Float specifying the log-fold-change requirement
#       14.factorData     -String containing factor names and values
#
# OUT:  Voom Plot
#       BCV Plot
#       MA Plot
#       Top Expression Table
#       HTML file linking to the ouputs
#
# Author: Shian Su - registertonysu@gmail.com - Jan 2014

# Record starting time
timeStart <- as.character(Sys.time())

# Load all required libraries
library(methods, quietly=TRUE, warn.conflicts=FALSE)
library(statmod, quietly=TRUE, warn.conflicts=FALSE)
library(splines, quietly=TRUE, warn.conflicts=FALSE)
library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
library(limma, quietly=TRUE, warn.conflicts=FALSE)
library(scales, quietly=TRUE, warn.conflicts=FALSE)

if (packageVersion("limma") < "3.20.1") {
  stop("Please update 'limma' to version >= 3.20.1 to run this tool")
}

################################################################################
### Function Delcaration
################################################################################
# Function to sanitise contrast equations so there are no whitespaces
# surrounding the arithmetic operators, leading or trailing whitespace
sanitiseEquation <- function(equation) {
  equation <- gsub(" *[+] *", "+", equation)
  equation <- gsub(" *[-] *", "-", equation)
  equation <- gsub(" *[/] *", "/", equation)
  equation <- gsub(" *[*] *", "*", equation)
  equation <- gsub("^\\s+|\\s+$", "", equation)
  return(equation)
}

# Function to sanitise group information
sanitiseGroups <- function(string) {
  string <- gsub(" *[,] *", ",", string)
  string <- gsub("^\\s+|\\s+$", "", string)
  return(string)
}

# Function to change periods to whitespace in a string
unmake.names <- function(string) {
  string <- gsub(".", " ", string, fixed=TRUE)
  return(string)
}

# Generate output folder and paths
makeOut <- function(filename) {
  return(paste0(outPath, "/", filename))
}

# Generating design information
pasteListName <- function(string) {
  return(paste0("factors$", string))
}

# Create cata function: default path set, default seperator empty and appending
# true by default (Ripped straight from the cat function with altered argument
# defaults)
cata <- function(..., file = htmlPath, sep = "", fill = FALSE, labels = NULL, 
                 append = TRUE) {
  if (is.character(file)) 
    if (file == "") 
      file <- stdout()
  else if (substring(file, 1L, 1L) == "|") {
    file <- pipe(substring(file, 2L), "w")
    on.exit(close(file))
  }
  else {
    file <- file(file, ifelse(append, "a", "w"))
    on.exit(close(file))
  }
  .Internal(cat(list(...), file, sep, fill, labels, append))
}

# Function to write code for html head and title
HtmlHead <- function(title) {
  cata("<head>\n")
  cata("<title>", title, "</title>\n")
  cata("</head>\n")
}

# Function to write code for html links
HtmlLink <- function(address, label=address) {
  cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n")
}

# Function to write code for html images
HtmlImage <- function(source, label=source, height=600, width=600) {
  cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height)
  cata("\" width=\"", width, "\"/>\n")
}

# Function to write code for html list items
ListItem <- function(...) {
  cata("<li>", ..., "</li>\n")
}

TableItem <- function(...) {
  cata("<td>", ..., "</td>\n")
}

TableHeadItem <- function(...) {
  cata("<th>", ..., "</th>\n")
}

################################################################################
### Input Processing
################################################################################

# Collects arguments from command line
argv <- commandArgs(TRUE)

# Grab arguments
countPath <- as.character(argv[1])
annoPath <- as.character(argv[2])
htmlPath <- as.character(argv[3])
outPath <- as.character(argv[4])
rdaOpt <- as.character(argv[5])
normOpt <- as.character(argv[6])
weightOpt <- as.character(argv[7])
contrastData <- as.character(argv[8])
cpmReq <- as.numeric(argv[9])
sampleReq <- as.numeric(argv[10])
pAdjOpt <- as.character(argv[11])
pValReq <- as.numeric(argv[12])
lfcReq <- as.numeric(argv[13])
factorData <- list()
for (i in 14:length(argv)) {
  newFact <- unlist(strsplit(as.character(argv[i]), split="::"))
  factorData <- rbind(factorData, newFact)
} # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,...

# Process arguments
if (weightOpt=="yes") {
  wantWeight <- TRUE
} else {
  wantWeight <- FALSE
}

if (rdaOpt=="yes") {
  wantRda <- TRUE
} else {
  wantRda <- FALSE
}

if (annoPath=="None") {
  haveAnno <- FALSE
} else {
  haveAnno <- TRUE
}

# Set the row names to be the name of the factor and delete first row
row.names(factorData) <- factorData[, 1]
factorData <- factorData[, -1]
factorData <- sapply(factorData, sanitiseGroups)
factorData <- sapply(factorData, strsplit, split=",")
factorData <- sapply(factorData, make.names)

# Transform factor data into data frame of R factor objects
factors <- data.frame(factorData)

#Create output directory
dir.create(outPath, showWarnings=FALSE)

# Split up contrasts seperated by comma into a vector then sanitise
contrastData <- unlist(strsplit(contrastData, split=","))
contrastData <- sanitiseEquation(contrastData)
contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)

bcvOutPdf <- makeOut("bcvplot.pdf")
bcvOutPng <- makeOut("bcvplot.png")
mdsOutPdf <- makeOut("mdsplot.pdf")
mdsOutPng <- makeOut("mdsplot.png")
voomOutPdf <- makeOut("voomplot.pdf")
voomOutPng <- makeOut("voomplot.png") 
maOutPdf <- character()   # Initialise character vector
maOutPng <- character()
topOut <- character()
for (i in 1:length(contrastData)) {
  maOutPdf[i] <- makeOut(paste0("maplot(", contrastData[i], ").pdf"))
  maOutPng[i] <- makeOut(paste0("maplot(", contrastData[i], ").png"))
  topOut[i] <- makeOut(paste0("toptab(", contrastData[i], ").tsv"))
}                         # Save output paths for each contrast as vectors
rdaOut <- makeOut("RData.rda")
sessionOut <- makeOut("session_info.txt")

# Initialise data for html links and images, data frame with columns Label and 
# Link
linkData <- data.frame(Label=character(), Link=character(),
                       stringsAsFactors=FALSE)
imageData <- data.frame(Label=character(), Link=character(),
                        stringsAsFactors=FALSE)

# Initialise vectors for storage of up/down/neutral regulated counts
upCount <- numeric()
downCount <- numeric()
flatCount <- numeric()
                        
# Read in counts and geneanno data
counts <- read.table(countPath, header=TRUE, sep="\t")
row.names(counts) <- counts$GeneID
counts <- counts[ , !(colnames(counts)=="GeneID")]
countsRows <- nrow(counts)
if (haveAnno) {
  geneanno <- read.table(annoPath, header=TRUE, sep="\t")
}

################################################################################
### Data Processing
################################################################################

# Extract counts and annotation data
data <- list()
data$counts <- counts
if (haveAnno) {
  data$genes <- geneanno
} else {
  data$genes <- data.frame(GeneID=row.names(counts))
}

# Filter out genes that do not have a required cpm in a required number of
# samples
preFilterCount <- nrow(data$counts)
sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
data$counts <- data$counts[sel, ]
data$genes <- data$genes[sel, ]
postFilterCount <- nrow(data$counts)
filteredCount <- preFilterCount-postFilterCount

# Creating naming data
samplenames <- colnames(data$counts)
sampleanno <- data.frame("sampleID"=samplenames, factors)

# Generating the DGEList object "data"
data$samples <- sampleanno
data$samples$lib.size <- colSums(data$counts)
data$samples$norm.factors <- 1
row.names(data$samples) <- colnames(data$counts)
data <- new("DGEList", data)

factorList <- sapply(names(factors), pasteListName)
formula <- "~0"
for (i in 1:length(factorList)) {
  formula <- paste(formula, factorList[i], sep="+")
}
formula <- formula(formula)
design <- model.matrix(formula)
for (i in 1:length(factorList)) {
  colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE)
}

# Calculating normalising factor, estimating dispersion
data <- calcNormFactors(data, method=normOpt)
#data <- estimateDisp(data, design=design, robust=TRUE)

# Generate contrasts information
contrasts <- makeContrasts(contrasts=contrastData, levels=design)

# Name rows of factors according to their sample
row.names(factors) <- names(data$counts)

################################################################################
### Data Output
################################################################################

# BCV Plot
#png(bcvOutPng, width=600, height=600)
#plotBCV(data, main="BCV Plot")
#imageData[1, ] <- c("BCV Plot", "bcvplot.png")
#invisible(dev.off())

#pdf(bcvOutPdf)
#plotBCV(data, main="BCV Plot")
#invisible(dev.off())

if (wantWeight) {
  # Creating voom data object and plot
  png(voomOutPng, width=1000, height=600)
  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
  imageData[1, ] <- c("Voom Plot", "voomplot.png")
  invisible(dev.off())
  
  pdf(voomOutPdf, width=14)
  vData <- voomWithQualityWeights(data, design=design, plot=TRUE)
  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
  invisible(dev.off())
  
  # Generating fit data and top table with weights
  wts <- vData$weights
  voomFit <- lmFit(vData, design, weights=wts)
  
} else {
  # Creating voom data object and plot
  png(voomOutPng, width=600, height=600)
  vData <- voom(data, design=design, plot=TRUE)
  imageData[1, ] <- c("Voom Plot", "voomplot.png")
  invisible(dev.off())
  
  pdf(voomOutPdf)
  vData <- voom(data, design=design, plot=TRUE)
  linkData[1, ] <- c("Voom Plot (.pdf)", "voomplot.pdf")
  invisible(dev.off())
  
  # Generate voom fit
  voomFit <- lmFit(vData, design)
  
}

# Fit linear model and estimate dispersion with eBayes
voomFit <- contrasts.fit(voomFit, contrasts)
voomFit <- eBayes(voomFit)

# Plot MDS
labels <- names(counts)
png(mdsOutPng, width=600, height=600)
# Currently only using a single factor
plotMDS(vData, labels=labels, col=as.numeric(factors[, 1]), cex=0.8)
imgName <- "Voom Plot"
imgAddr <- "mdsplot.png"
imageData <- rbind(imageData, c(imgName, imgAddr))
invisible(dev.off())

pdf(mdsOutPdf)
plotMDS(vData, labels=labels, cex=0.5)
linkName <- paste0("MDS Plot (.pdf)")
linkAddr <- paste0("mdsplot.pdf")
linkData <- rbind(linkData, c(linkName, linkAddr))
invisible(dev.off())


for (i in 1:length(contrastData)) {

  status = decideTests(voomFit[, i], adjust.method=pAdjOpt, p.value=pValReq,
                       lfc=lfcReq)
                       
  sumStatus <- summary(status)
  
  # Collect counts for differential expression
  upCount[i] <- sumStatus["1",]
  downCount[i] <- sumStatus["-1",]
  flatCount[i] <- sumStatus["0",]
                       
  # Write top expressions table
  top <- topTable(voomFit, coef=i, number=Inf, sort.by="P")
  write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
  
  linkName <- paste0("Top Differential Expressions(", contrastData[i], 
                     ") (.tsv)")
  linkAddr <- paste0("toptab(", contrastData[i], ").tsv")
  linkData <- rbind(linkData, c(linkName, linkAddr))
  
  # Plot MA (log ratios vs mean average) using limma package on weighted data
  pdf(maOutPdf[i])
  limma::plotMA(voomFit, status=status, coef=i,
                main=paste("MA Plot:", unmake.names(contrastData[i])), 
                col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
                xlab="Average Expression", ylab="logFC")
  
  abline(h=0, col="grey", lty=2)
  
  linkName <- paste0("MA Plot(", contrastData[i], ")", " (.pdf)")
  linkAddr <- paste0("maplot(", contrastData[i], ").pdf")
  linkData <- rbind(linkData, c(linkName, linkAddr))
  invisible(dev.off())
  
  png(maOutPng[i], height=600, width=600)
  limma::plotMA(voomFit, status=status, coef=i,
                main=paste("MA Plot:", unmake.names(contrastData[i])), 
                col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
                xlab="Average Expression", ylab="logFC")
  
  abline(h=0, col="grey", lty=2)
  
  imgName <- paste0("MA Plot(", contrastData[i], ")")
  imgAddr <- paste0("maplot(", contrastData[i], ").png")
  imageData <- rbind(imageData, c(imgName, imgAddr))
  invisible(dev.off())
}
sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
row.names(sigDiff) <- contrastData

# Save relevant items as rda object
if (wantRda) {
  if (wantWeight) {
    save(data, status, vData, labels, factors, wts, voomFit, top, contrasts, 
         design,
         file=rdaOut, ascii=TRUE)
  } else {
    save(data, status, vData, labels, factors, voomFit, top, contrasts, design,
         file=rdaOut, ascii=TRUE)
  }
  linkData <- rbind(linkData, c("RData (.rda)", "RData.rda"))
}

# Record session info
writeLines(capture.output(sessionInfo()), sessionOut)
linkData <- rbind(linkData, c("Session Info", "session_info.txt"))

# Record ending time and calculate total run time
timeEnd <- as.character(Sys.time())
timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3))
timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)
################################################################################
### HTML Generation
################################################################################

# Clear file
cat("", file=htmlPath)

cata("<html>\n")

cata("<body>\n")
cata("<h3>Limma Voom Analysis Output:</h3>\n")
cata("PDF copies of JPEGS available in 'Plots' section.<br />\n")
if (wantWeight) {
  HtmlImage(imageData$Link[1], imageData$Label[1], width=1000)
} else {
  HtmlImage(imageData$Link[1], imageData$Label[1])
}

for (i in 2:nrow(imageData)) {
  HtmlImage(imageData$Link[i], imageData$Label[i])
}

cata("<h4>Differential Expression Counts:</h4>\n")

cata("<table border=\"1\" cellpadding=\"4\">\n")
cata("<tr>\n")
TableItem()
for (i in colnames(sigDiff)) {
  TableHeadItem(i)
}
cata("</tr>\n")
for (i in 1:nrow(sigDiff)) {
  cata("<tr>\n")
  TableHeadItem(unmake.names(row.names(sigDiff)[i]))
  for (j in 1:ncol(sigDiff)) {
    TableItem(as.character(sigDiff[i, j]))
  }
  cata("</tr>\n")
}
cata("</table>")

cata("<h4>Plots:</h4>\n")
for (i in 1:nrow(linkData)) {
  if (grepl(".pdf", linkData$Link[i])) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
  }
}

cata("<h4>Tables:</h4>\n")
for (i in 1:nrow(linkData)) {
  if (grepl(".tsv", linkData$Link[i])) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
  }
}

if (wantRda) {
  cata("<h4>R Data Object:</h4>\n")
  for (i in 1:nrow(linkData)) {
    if (grepl(".rda", linkData$Link[i])) {
      HtmlLink(linkData$Link[i], linkData$Label[i])
    }
  }
}

cata("<p>Alt-click links to download file.</p>\n")
cata("<p>Click floppy disc icon associated history item to download ")
cata("all files.</p>\n")
cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")

cata("<h4>Additional Information</h4>\n")
cata("<ul>\n")
if (cpmReq!=0 && sampleReq!=0) {
  tempStr <- paste("Genes without more than", cpmReq,
                   "CPM in at least", sampleReq, "samples are insignificant",
                   "and filtered out.")
  ListItem(tempStr)
  filterProp <- round(filteredCount/preFilterCount*100, digits=2)
  tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
                   "%) genes were filtered out for low expression.")
  ListItem(tempStr)
}
ListItem(normOpt, " was the method used to normalise library sizes.")
if (wantWeight) {
  ListItem("Weights were applied to samples.")
} else {
  ListItem("Weights were not applied to samples.")
}
if (pAdjOpt!="none") {
  if (pAdjOpt=="BH" || pAdjOpt=="BY") {
    tempStr <- paste0("MA-Plot highlighted genes are significant at FDR ",
                      "of ", pValReq," and exhibit log2-fold-change of at ", 
                      "least ", lfcReq, ".")
    ListItem(tempStr)
  } else if (pAdjOpt=="holm") {
    tempStr <- paste0("MA-Plot highlighted genes are significant at adjusted ",
                      "p-value of ", pValReq,"  by the Holm(1979) ",
                      "method, and exhibit log2-fold-change of at least ", 
                      lfcReq, ".")
    ListItem(tempStr)
  }
} else {
  tempStr <- paste0("MA-Plot highlighted genes are significant at p-value ",
                    "of ", pValReq," and exhibit log2-fold-change of at ", 
                    "least ", lfcReq, ".")
  ListItem(tempStr)
}
cata("</ul>\n")

cata("<h4>Summary of experimental data:</h4>\n")

cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP*</p>\n")

cata("<table border=\"1\" cellpadding=\"3\">\n")
cata("<tr>\n")
TableItem()
for (i in names(factors)) {
  TableHeadItem(i)
}
cata("</tr>\n")

for (i in 1:nrow(factors)) {
  cata("<tr>\n")
  TableHeadItem(row.names(factors)[i])
  for (j in ncol(factors)) {
    TableItem(as.character(unmake.names(factors[i, j])))
  }
  cata("</tr>\n")
}
cata("</table>")

cit <- character()
link <- character()
link[1] <- paste0("<a href=\"",
                  "http://www.bioconductor.org/packages/release/bioc/",
                  "vignettes/limma/inst/doc/usersguide.pdf",
                  "\">", "limma User's Guide", "</a>.")

link[2] <- paste0("<a href=\"",
                  "http://www.bioconductor.org/packages/release/bioc/",
                  "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
                  "\">", "edgeR User's Guide", "</a>")

cit[1] <- paste("Please cite the paper below for the limma software itself.",
                "Please also try to cite the appropriate methodology articles",
                "that describe the statistical methods implemented in limma,",
                "depending on which limma functions you are using. The",
                "methodology articles are listed in Section 2.1 of the",
                link[1],
                "Cite no. 3 only if sample weights were used.")
cit[2] <- paste("Smyth, GK (2005). Limma: linear models for microarray data.",
                "In: 'Bioinformatics and Computational Biology Solutions using",
                "R and Bioconductor'. R. Gentleman, V. Carey, S. doit,.",
                "Irizarry, W. Huber (eds), Springer, New York, pages 397-420.")
cit[3] <- paste("Please cite the first paper for the software itself and the",
                "other papers for the various original statistical methods",
                "implemented in edgeR.  See Section 1.2 in the", link[2],
                "for more detail.")
cit[4] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a",
                "Bioconductor package for differential expression analysis",
                "of digital gene expression data. Bioinformatics 26, 139-140")
cit[5] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
                "for assessing differences in tag abundance. Bioinformatics",
                "23, 2881-2887")
cit[6] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
                "negative binomial dispersion, with applications to SAGE data.",
                "Biostatistics, 9, 321-332")
cit[7] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
                "expression analysis of multifactor RNA-Seq experiments with",
                "respect to biological variation. Nucleic Acids Research 40,",
                "4288-4297")
cit[8] <- paste("Law, CW, Chen, Y, Shi, W, and Smyth, GK (2014). Voom:",
                "precision weights unlock linear model analysis tools for",
                "RNA-seq read counts. Genome Biology 15, R29.")
cit[9] <- paste("Ritchie, M. E., Diyagama, D., Neilson, J., van Laar,", 
                "R., Dobrovic, A., Holloway, A., and Smyth, G. K. (2006).",
                "Empirical array quality weights for microarray data.",
                "BMC Bioinformatics 7, Article 261.")
cata("<h3>Citations</h3>\n")

cata("<h4>limma</h4>\n")
cata(cit[1], "\n")
cata("<ol>\n")
ListItem(cit[2])
ListItem(cit[8])
ListItem(cit[9])
cata("</ol>\n")

cata("<h4>edgeR</h4>\n")
cata(cit[3], "\n")
cata("<ol>\n")
ListItem(cit[4])
ListItem(cit[5])
ListItem(cit[6])
ListItem(cit[7])
cata("</ol>\n")

cata("<p>Report problems to: su.s@wehi.edu.au</p>\n")

for (i in 1:nrow(linkData)) {
  if (grepl("session_info", linkData$Link[i])) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
  }
}

cata("<table border=\"0\">\n")
cata("<tr>\n")
TableItem("Task started at:"); TableItem(timeStart)
cata("</tr>\n")
cata("<tr>\n")
TableItem("Task ended at:"); TableItem(timeEnd)
cata("</tr>\n")
cata("<tr>\n")
TableItem("Task run time:"); TableItem(timeTaken)
cata("<tr>\n")
cata("</table>\n")

cata("</body>\n")
cata("</html>")