diff edgeR_repenrich.R @ 1:54a3f3a195d6 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 114b47cc624e39b4f485c8623458fc98494c564d
author drosofff
date Mon, 29 May 2017 13:11:57 -0400
parents
children 1805b262c12d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edgeR_repenrich.R	Mon May 29 13:11:57 2017 -0400
@@ -0,0 +1,221 @@
+#!/usr/bin/env Rscript
+
+# A command-line interface to edgeR for use with Galaxy edger-repenrich
+# written by Christophe Antoniewski drosofff@gmail.com 2017.05.30
+
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# To not crash galaxy with an UTF8 error with not-US LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+library("getopt")
+library("tools")
+options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs(trailingOnly = TRUE)
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+  "quiet", "q", 0, "logical",
+  "help", "h", 0, "logical",
+  "outfile", "o", 1, "character",
+  "countsfile", "n", 1, "character",
+  "factorName", "N", 1, "character",
+  "levelNameA", "A", 1, "character",
+  "levelNameB", "B", 1, "character",
+  "levelAfiles", "a", 1, "character",
+  "levelBfiles", "b", 1, "character",
+  "alignmentA", "i", 1, "character",
+  "alignmentB", "j", 1, "character",
+  "plots" , "p", 1, "character"),
+  byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# if help was asked for print a friendly message
+# and exit with a non-zero error code
+if (!is.null(opt$help)) {
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+# enforce the following required arguments
+if (is.null(opt$outfile)) {
+  cat("'outfile' is required\n")
+  q(status=1)
+}
+if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
+  cat("input count files are required for both levels\n")
+  q(status=1)
+}
+if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
+  cat("total aligned read files are required for both levels\n")
+  q(status=1)
+}
+
+verbose <- if (is.null(opt$quiet)) {
+  TRUE
+} else {
+  FALSE
+}
+
+suppressPackageStartupMessages({
+  library("edgeR")
+  library("limma")
+})
+
+# build levels A and B file lists
+
+library("rjson")
+filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
+filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
+listA <- list()
+indice = 0
+listA[["level"]] <- opt$levelNameA
+for (file in filesA) {
+    indice = indice +1
+    listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE)
+    }
+listB <- list()
+indice = 0
+listB[["level"]] <- opt$levelNameB
+for (file in filesB) {
+    indice = indice +1
+    listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE)
+    }
+
+# build a counts table
+counts <- data.frame(row.names=listA[[2]][,1])
+for (element in names(listA[-1])) {
+    counts<-cbind(counts, listA[[element]][,4])
+    } 
+for (element in names(listB[-1])) {
+    counts<-cbind(counts, listB[[element]][,4])
+    }
+colnames(counts)=c(names(listA[-1]), names(listB[-1]))
+
+# build aligned counts vector
+
+filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
+filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
+sizes <- c()
+for (file in filesi) {
+    sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
+    }
+for (file in filesj) {
+    sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
+    }
+
+# build a meta data object
+
+meta <- data.frame(
+    row.names=colnames(counts),
+    condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ),
+    libsize=sizes
+)
+
+
+# Define the library size and conditions for the GLM
+libsize <- meta$libsize
+condition <- factor(meta$condition)
+design <- model.matrix(~0+condition)
+colnames(design) <- levels(meta$condition)
+
+
+# Build a DGE object for the GLM
+y <- DGEList(counts=counts, lib.size=libsize)
+
+# Normalize the data
+y <- calcNormFactors(y)
+y$samples
+# plotMDS(y) latter
+
+# Estimate the variance
+y <- estimateGLMCommonDisp(y, design)
+y <- estimateGLMTrendedDisp(y, design)
+y <- estimateGLMTagwiseDisp(y, design)
+# plotBCV(y) latter
+
+# Builds and outputs an object to contain the normalized read abundance in counts per million of reads
+cpm <- cpm(y, log=FALSE, lib.size=libsize)
+cpm <- as.data.frame(cpm)
+colnames(cpm) <- colnames(counts)
+if (!is.null(opt$countsfile)) {
+    normalizedAbundance <- data.frame(Tag=rownames(cpm))
+    normalizedAbundance <- cbind(normalizedAbundance, cpm)
+    write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
+}
+
+# test
+print(counts)
+print(cpm)
+
+# Conduct fitting of the GLM
+yfit <- glmFit(y, design)
+
+# Initialize result matrices to contain the results of the GLM
+results <- matrix(nrow=dim(counts)[1],ncol=0)
+logfc <- matrix(nrow=dim(counts)[1],ncol=0)
+
+# Make the comparisons for the GLM
+my.contrasts <- makeContrasts(
+    paste0(opt$levelNameB,"_",opt$levelNameA," = ", opt$levelNameB, " - ", opt$levelNameA),
+    levels = design
+)
+
+# Define the contrasts used in the comparisons
+allcontrasts =  paste0(opt$levelNameB," vs ",opt$levelNameA)
+
+# Conduct a for loop that will do the fitting of the GLM for each comparison
+# Put the results into the results objects
+    lrt <- glmLRT(yfit, contrast=my.contrasts[,1])
+    plotSmear(lrt, de.tags=rownames(y))
+    title(allcontrasts)
+    res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table
+    results <- cbind(results,res[,c(1,5)])
+    logfc <- cbind(logfc,res[c(1)])
+
+# Add the repeat types back into the results.
+# We should still have the same order as the input data
+results$class <- listA[[2]][,2]
+results$type <- listA[[2]][,3]
+
+# Sort the results table by the FDR
+results <- results[with(results, order(FDR)), ]
+
+# Save the results
+write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)
+
+# Plot Fold Changes for repeat classes and types
+
+# open the device and plots
+if (!is.null(opt$plots)) {
+    if (verbose) cat("creating plots\n")
+    pdf(opt$plots)
+    plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples")
+    plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot")
+    logFC <- results[, "logFC"]
+    # Plot the repeat classes
+    classes <- with(results, reorder(class, -logFC, median))
+    par(mar=c(6,10,4,1))
+    boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE,
+        las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Class"))
+    abline(v=0)
+    # Plot the repeat types
+    types <- with(results, reorder(type, -logFC, median))
+    boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE,
+        las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Type"))
+    abline(v=0)
+}
+
+# close the plot device
+if (!is.null(opt$plots)) {
+  cat("closing plot device\n")
+  dev.off()
+}
+
+cat("Session information:\n\n")
+
+sessionInfo()
+