Mercurial > repos > iuc > deseq2
view deseq2.R @ 7:cb23006f34ff draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 82d6e2f33ec7b26276715e01c5b17944a0f5cd0f
author | iuc |
---|---|
date | Mon, 20 Mar 2017 17:23:27 -0400 |
parents | 4939397c4706 |
children | 6f91b8ce6e07 |
line wrap: on
line source
#!/usr/bin/env Rscript # A command-line interface to DESeq2 for use with Galaxy # written by Bjoern Gruening and modified by Michael Love 2016.03.30 # # one of these arguments is required: # # 'factors' a JSON list object from Galaxy # # 'sample_table' is a sample table as described in ?DESeqDataSetFromHTSeq # with columns: sample name, filename, then factors (variables) # # the output file has columns: # # baseMean (mean normalized count) # log2FoldChange (by default a moderated LFC estimate) # lfcSE (the standard error) # stat (the Wald statistic) # pvalue (p-value from comparison of Wald statistic to a standard Normal) # padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter) # # the first variable in 'factors' and first column in 'sample_table' will be the primary factor. # the levels of the primary factor are used in the order of appearance in factors or in sample_table. # # by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile' # # for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B, # to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc. # all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages. # # fit_type is an integer valued argument, with the options from ?estimateDisperions # 1 "parametric" # 2 "local" # 3 "mean" # setup R error handling to go to stderr options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") library("getopt") library("tools") options(stringAsFactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) # get options, using the spec as defined by the enclosed list. # we read the options from the default: commandArgs(TRUE). spec <- matrix(c( "quiet", "q", 0, "logical", "help", "h", 0, "logical", "outfile", "o", 1, "character", "countsfile", "n", 1, "character", "factors", "f", 1, "character", "plots" , "p", 1, "character", "sample_table", "s", 1, "character", "tximport", "i", 0, "logical", "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) "fit_type", "t", 1, "integer", "many_contrasts", "m", 0, "logical", "outlier_replace_off" , "a", 0, "logical", "outlier_filter_off" , "b", 0, "logical", "auto_mean_filter_off", "c", 0, "logical", "beta_prior_off", "d", 0, "logical"), byrow=TRUE, ncol=4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { cat(getopt(spec, usage=TRUE)) q(status=1) } # enforce the following required arguments if (is.null(opt$outfile)) { cat("'outfile' is required\n") q(status=1) } if (is.null(opt$sample_table) & is.null(opt$factors)) { cat("'factors' or 'sample_table' is required\n") q(status=1) } verbose <- if (is.null(opt$quiet)) { TRUE } else { FALSE } if (!is.null(opt$tximport)) { if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport") if (tolower(file_ext(opt$tx2gene)) == "gtf") { gtfFile <- opt$tx2gene } else { gtfFile <- NULL tx2gene <- read.table(opt$tx2gene, header=FALSE) } useTXI <- TRUE } else { useTXI <- FALSE } suppressPackageStartupMessages({ library("DESeq2") library("RColorBrewer") library("gplots") }) # build or read sample table trim <- function (x) gsub("^\\s+|\\s+$", "", x) # switch on if 'factors' was provided: if (!is.null(opt$factors)) { library("rjson") parser <- newJSONParser() parser$addData(opt$factors) factorList <- parser$getObject() factors <- sapply(factorList, function(x) x[[1]]) primaryFactor <- factors[1] filenamesIn <- unname(unlist(factorList[[1]][[2]])) sampleTable <- data.frame(sample=basename(filenamesIn), filename=filenamesIn, row.names=filenamesIn, stringsAsFactors=FALSE) for (factor in factorList) { factorName <- trim(factor[[1]]) sampleTable[[factorName]] <- character(nrow(sampleTable)) lvls <- sapply(factor[[2]], function(x) names(x)) for (i in seq_along(factor[[2]])) { files <- factor[[2]][[i]][[1]] sampleTable[files,factorName] <- trim(lvls[i]) } sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) } rownames(sampleTable) <- sampleTable$sample } else { # read the sample_table argument # this table is described in ?DESeqDataSet # one column for the sample name, one for the filename, and # the remaining columns for factors in the analysis sampleTable <- read.delim(opt$sample_table, stringsAsFactors=FALSE) factors <- colnames(sampleTable)[-c(1:2)] for (factor in factors) { lvls <- unique(as.character(sampleTable[[factor]])) sampleTable[[factor]] <- factor(sampleTable[[factor]], levels=lvls) } } primaryFactor <- factors[1] designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + "))) if (verbose) { cat("DESeq2 run information\n\n") cat("sample table:\n") print(sampleTable[,-c(1:2),drop=FALSE]) cat("\ndesign formula:\n") print(designFormula) cat("\n\n") } # these are plots which are made once for each analysis generateGenericPlots <- function(dds, factors) { rld <- rlog(dds) d=plotPCA(rld, intgroup=rev(factors), returnData=TRUE) labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors]))) library(ggplot2) print(ggplot(d, aes(x=PC1,y=PC2, col=group,label=factor(labs)), environment = environment()) + geom_point() + geom_text(size=3)) dat <- assay(rld) colnames(dat) <- labs distsRL <- dist(t(dat)) mat <- as.matrix(distsRL) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col = rev(hmcol), main="Sample-to-sample distances", margin=c(13,13)) plotDispEsts(dds, main="Dispersion estimates") } # these are plots which can be made for each comparison, e.g. # once for C vs A and once for B vs A generateSpecificPlots <- function(res, threshold, title_suffix) { use <- res$baseMean > threshold if (sum(!use) == 0) { h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE) barplot(height = h$counts, col = "powderblue", space = 0, xlab="p-values", ylab="frequency", main=paste("Histogram of p-values for",title_suffix)) text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) } else { h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue") barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, xlab="p-values", ylab="frequency", main=paste("Histogram of p-values for",title_suffix)) text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white") } plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE)) } if (verbose) { cat(paste("primary factor:",primaryFactor,"\n")) if (length(factors) > 1) { cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n")) } cat("\n---------------------\n") } # if JSON input from Galaxy, path is absolute # otherwise, from sample_table, assume it is relative dir <- if (is.null(opt$factors)) { "." } else { "" } if (!useTXI) { # construct the object from HTSeq files dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = dir, design = designFormula) } else { # construct the object using tximport # first need to make the tx2gene table # this takes ~2-3 minutes using Bioconductor functions if (!is.null(gtfFile)) { suppressPackageStartupMessages({ library("GenomicFeatures") }) txdb <- makeTxDbFromGFF(gtfFile, format="gtf") k <- keys(txdb, keytype = "GENEID") df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") tx2gene <- df[, 2:1] # tx ID, then gene ID } library("tximport") txiFiles <- as.character(sampleTable[,2]) names(txiFiles) <- as.character(sampleTable[,1]) txi <- tximport(txiFiles, type="sailfish", tx2gene=tx2gene) dds <- DESeqDataSetFromTximport(txi, sampleTable[,3:ncol(sampleTable),drop=FALSE], designFormula) } if (verbose) cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) # optional outlier behavior if (is.null(opt$outlier_replace_off)) { minRep <- 7 } else { minRep <- Inf if (verbose) cat("outlier replacement off\n") } if (is.null(opt$outlier_filter_off)) { cooksCutoff <- TRUE } else { cooksCutoff <- FALSE if (verbose) cat("outlier filtering off\n") } # optional automatic mean filtering if (is.null(opt$auto_mean_filter_off)) { independentFiltering <- TRUE } else { independentFiltering <- FALSE if (verbose) cat("automatic filtering on the mean off\n") } # shrinkage of LFCs if (is.null(opt$beta_prior_off)) { betaPrior <- TRUE } else { betaPrior <- FALSE if (verbose) cat("beta prior off\n") } # dispersion fit type if (is.null(opt$fit_type)) { fitType <- "parametric" } else { fitType <- c("parametric","local","mean")[opt$fit_type] } if (verbose) cat(paste("using disperion fit type:",fitType,"\n")) # run the analysis dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep) # create the generic plots and leave the device open if (!is.null(opt$plots)) { if (verbose) cat("creating plots\n") pdf(opt$plots) generateGenericPlots(dds, factors) } n <- nlevels(colData(dds)[[primaryFactor]]) allLevels <- levels(colData(dds)[[primaryFactor]]) if (!is.null(opt$countsfile)) { labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors]))) normalizedCounts<-counts(dds,normalized=TRUE) colnames(normalizedCounts)<-labs write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE) } if (is.null(opt$many_contrasts)) { # only contrast the first and second level of the primary factor ref <- allLevels[1] lvl <- allLevels[2] res <- results(dds, contrast=c(primaryFactor, lvl, ref), cooksCutoff=cooksCutoff, independentFiltering=independentFiltering) if (verbose) { cat("summary of results\n") cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n")) print(summary(res)) } resSorted <- res[order(res$padj),] outDF <- as.data.frame(resSorted) outDF$geneID <- rownames(outDF) outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] filename <- opt$outfile write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) if (independentFiltering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) if (!is.null(opt$plots)) { generateSpecificPlots(res, threshold, title_suffix) } } else { # rotate through the possible contrasts of the primary factor # write out a sorted table of results with the contrast as a suffix # add contrast specific plots to the device for (i in seq_len(n-1)) { ref <- allLevels[i] contrastLevels <- allLevels[(i+1):n] for (lvl in contrastLevels) { res <- results(dds, contrast=c(primaryFactor, lvl, ref), cooksCutoff=cooksCutoff, independentFiltering=independentFiltering) resSorted <- res[order(res$padj),] outDF <- as.data.frame(resSorted) outDF$geneID <- rownames(outDF) outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref) write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) if (independentFiltering) { threshold <- unname(attr(res, "filterThreshold")) } else { threshold <- 0 } title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) if (!is.null(opt$plots)) { generateSpecificPlots(res, threshold, title_suffix) } } } } # close the plot device if (!is.null(opt$plots)) { cat("closing plot device\n") dev.off() } cat("Session information:\n\n") sessionInfo()