Mercurial > repos > artbio > repenrich
view edgeR_repenrich.R @ 7:308591fd36e5 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit e1dfdcf457c29c18018f330546a6695a0dd8f26d
author | artbio |
---|---|
date | Tue, 11 Dec 2018 11:39:22 -0500 |
parents | 83b5ea7cc342 |
children | 5dd4791c7b70 |
line wrap: on
line source
#!/usr/bin/env Rscript # A command-line interface to edgeR for use with Galaxy edger-repenrich # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30 # setup R error handling to go to stderr options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) # To not crash galaxy with an UTF8 error with not-US LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") library("getopt") library("tools") options(stringAsFactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) # get options, using the spec as defined by the enclosed list. # we read the options from the default: commandArgs(TRUE). spec <- matrix(c( "quiet", "q", 0, "logical", "help", "h", 0, "logical", "outfile", "o", 1, "character", "countsfile", "n", 1, "character", "factorName", "N", 1, "character", "levelNameA", "A", 1, "character", "levelNameB", "B", 1, "character", "levelAfiles", "a", 1, "character", "levelBfiles", "b", 1, "character", "alignmentA", "i", 1, "character", "alignmentB", "j", 1, "character", "plots" , "p", 1, "character"), byrow=TRUE, ncol=4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { cat(getopt(spec, usage=TRUE)) q(status=1) } # enforce the following required arguments if (is.null(opt$outfile)) { cat("'outfile' is required\n") q(status=1) } if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) { cat("input count files are required for both levels\n") q(status=1) } if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) { cat("total aligned read files are required for both levels\n") q(status=1) } verbose <- if (is.null(opt$quiet)) { TRUE } else { FALSE } suppressPackageStartupMessages({ library("edgeR") library("limma") }) # build levels A and B file lists library("rjson") filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") listA <- list() indice = 0 listA[["level"]] <- opt$levelNameA for (file in filesA) { indice = indice +1 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE) } listB <- list() indice = 0 listB[["level"]] <- opt$levelNameB for (file in filesB) { indice = indice +1 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE) } # build a counts table counts <- data.frame(row.names=listA[[2]][,1]) for (element in names(listA[-1])) { counts<-cbind(counts, listA[[element]][,4]) } for (element in names(listB[-1])) { counts<-cbind(counts, listB[[element]][,4]) } colnames(counts)=c(names(listA[-1]), names(listB[-1])) # build aligned counts vector filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error") filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error") sizes <- c() for (file in filesi) { sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) } for (file in filesj) { sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) } # build a meta data object meta <- data.frame( row.names=colnames(counts), condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ), libsize=sizes ) # Define the library size and conditions for the GLM libsize <- meta$libsize condition <- factor(meta$condition) design <- model.matrix(~0+condition) colnames(design) <- levels(meta$condition) # Build a DGE object for the GLM y <- DGEList(counts=counts, lib.size=libsize) # Normalize the data y <- calcNormFactors(y) y$samples # plotMDS(y) latter # Estimate the variance y <- estimateGLMCommonDisp(y, design) y <- estimateGLMTrendedDisp(y, design) y <- estimateGLMTagwiseDisp(y, design) # plotBCV(y) latter # Builds and outputs an object to contain the normalized read abundance in counts per million of reads cpm <- cpm(y, log=FALSE, lib.size=libsize) cpm <- as.data.frame(cpm) colnames(cpm) <- colnames(counts) if (!is.null(opt$countsfile)) { normalizedAbundance <- data.frame(Tag=rownames(cpm)) normalizedAbundance <- cbind(normalizedAbundance, cpm) write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) } # Conduct fitting of the GLM yfit <- glmFit(y, design) # Initialize result matrices to contain the results of the GLM results <- matrix(nrow=dim(counts)[1],ncol=0) logfc <- matrix(nrow=dim(counts)[1],ncol=0) # Make the comparisons for the GLM my.contrasts <- makeContrasts( paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB), levels = design ) # Define the contrasts used in the comparisons allcontrasts = paste0(opt$levelNameA," vs ",opt$levelNameB) # Conduct a for loop that will do the fitting of the GLM for each comparison # Put the results into the results objects lrt <- glmLRT(yfit, contrast=my.contrasts[,1]) plotSmear(lrt, de.tags=rownames(y)) title(allcontrasts) res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table results <- cbind(results,res[,c(1,5)]) logfc <- cbind(logfc,res[c(1)]) # Add the repeat types back into the results. # We should still have the same order as the input data results$class <- listA[[2]][,2] results$type <- listA[[2]][,3] # Sort the results table by the FDR results <- results[with(results, order(FDR)), ] # Save the results write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE) # Plot Fold Changes for repeat classes and types # open the device and plots if (!is.null(opt$plots)) { if (verbose) cat("creating plots\n") pdf(opt$plots) plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples") plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot") logFC <- results[, "logFC"] # Plot the repeat classes classes <- with(results, reorder(class, -logFC, median)) classes par(mar=c(6,10,4,1)) boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE, las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Class")) abline(v=0) # Plot the repeat types types <- with(results, reorder(type, -logFC, median)) boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE, las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Type"), yaxt="n") axis(2, cex.axis=(1*11)/(length(levels(types))), at=seq(from=1, to=length(levels(types))), labels=levels(types), las=2) abline(v=0) # volcano plot TEdata = cbind(rownames(results), as.data.frame(results), score=-log(results$FDR, 10)) colnames(TEdata) = c("Tag","log2FC", "FDR", "Class", "Type", "score") color = ifelse(TEdata$FDR<0.05, "red", "black") s <- subset(TEdata, FDR<0.01) with(TEdata, plot(log2FC, score, pch=20, col=color, main="Volcano plot (all tag types)", ylab="-log10(FDR)")) text(s[,2], s[,6], labels = s[,5], pos = seq(from=1, to=3), cex=0.5) } # close the plot device if (!is.null(opt$plots)) { cat("closing plot device\n") dev.off() } cat("Session information:\n\n") sessionInfo()