Mercurial > repos > nikhil-joshi > deseq_and_sam2counts
changeset 0:d7f27b43b8ff draft
Uploaded
author | nikhil-joshi |
---|---|
date | Thu, 05 Jul 2012 21:02:43 -0400 |
parents | |
children | 3348f484c49c |
files | deseq/README.md deseq/deseq.R deseq/deseq.xml deseq/sam2counts.xml deseq/sam2counts_galaxy.py deseq/stderr_wrapper.py |
diffstat | 6 files changed, 554 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/README.md Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,23 @@ +# sam2counts and DESeq in Galaxy + +## About + +This is a Galaxy package that wraps sam2counts and DESeq for RNA-Seq analysis using a transcriptome reference. sam2counts takes SAM files that are created from an alignment to a transcriptome and creates counts of aligned reads for each transcript. DESeq uses the DESeq package from Bioconductor in R and analyzes the count data from sam2counts. DESeq outputs a toptable of transcripts sorted by adjusted p-value and a page of diagnostic plots. + +## Requirements + +Python 2.6.5 +pysam 0.6 (package for Python) +R 2.15 +Bioconductor 2.10 (package for R) +DESeq 1.8.3 (package for R) +aroma.light 1.24.0 (package for R) +lattice 0.20-6 (package for R) + +## Installation + +stderr_wrapper.py and sam2counts_galaxy.py must be in the path or they can remain in the tools directory with the xml files. deseq.R must be copied to the "tool-data" directory under the main Galaxy install directory. + +## Use + +sam2counts needs a SAM file (produced by aligning to a transcriptome) with header information as the input. The count data produced from this SAM file gets fed into DESeq.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/deseq.R Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,195 @@ +## Run DESeq (oversimplified) in Galaxy. +## Format: Rscript deseq3.R tab-delimited-counts.txt comma,delimited,col,names,except,first,gene,id,col comparison,here outputfile +## +## The incoming data must have the first column be the gene names, and +## the rest raw counts. The next argument is the list of groups +## (i.e. liver,liver,kidney,kidney), and the final argument is the +## comparison, i.e. kidney,liver. This produces a top-table called +## top-table.txt ordered by p-value. + +cargs <- commandArgs() +cargs <- cargs[(which(cargs == "--args")+1):length(cargs)] + +countstable <- cargs[1] +conds <- unlist(strsplit(cargs[2], ',')) +comparison <- unlist(strsplit(cargs[3], ',')) +outputfile <- cargs[4] +diag.html = cargs[5] +temp.files.path = cargs[6] +counts.name = cargs[7] + +# the comparison must only have two values and the conds must +# be a vector from those values, at least one of each. +if (length(unique(conds)) != 2) { + stop("You can only have two columns types: ", cargs[2]) +} + +if (length(comparison) != 2) { + stop("Comparison type must be a tuple: ", cargs[3]) +} + +if (!identical(sort(comparison), sort(unique(conds)))) { + stop("Column types must use the two names from Comparison type, and vice versa. Must have at least one of each in the Column types.\nColumn types: ", cargs[2], "\n", "Comparison type: ", cargs[3]) +} + +sink("/dev/null") +dir.create(temp.files.path, recursive=TRUE) +library(DESeq) + +d <- read.table(countstable, sep="\t", header=TRUE, row.names=1) + +if (length(d) != length(conds)) { + stop("Number of total sample columns in counts file must correspond to the columns types field. E.g. if column types is 'kidney,kidney,liver,liver' then number of sample columns in counts file must be 4 as well.") +} + +cds <- newCountDataSet(d, conds) +cds <- estimateSizeFactors(cds) + +cdsBlind <- estimateDispersions( cds, method="blind" ) + +if (length(conds) != 2) { + cds <- estimateDispersions( cds ) + norep = FALSE +} + +if (length(conds) == 2) { + cds <- estimateDispersions( cds, method="blind", sharingMode="fit-only" ) + norep = TRUE +} + +plotDispEsts <- function( cds ) +{ + plot( + rowMeans( counts( cds, normalized=TRUE ) ), + fitInfo(cds)$perGeneDispEsts, + pch = '.', log="xy" ) + xg <- 10^seq( -.5, 5, length.out=300 ) + lines( xg, fitInfo(cds)$dispFun( xg ), col="red" ) + } + +temp.disp.est.plot = file.path( temp.files.path, "DispersionEstimatePlot.png" ) +png( temp.disp.est.plot, width=500, height=500 ) +plotDispEsts( cds ) +dev.off() + +res <- nbinomTest(cds, comparison[1], comparison[2]) + +write.table(res[order(res$padj), ], file=outputfile, quote=FALSE, row.names=FALSE, sep="\t") + + +plotDE <- function( res ) + plot( + res$baseMean, + res$log2FoldChange, + log="x", pch=20, cex=.3, + col = ifelse( res$padj < .1, "red", "black" ) ) + +temp.de.plot = file.path( temp.files.path, "DiffExpPlot.png") +png( temp.de.plot, width=500, height=500 ) +plotDE( res ) +dev.off() + + +temp.pval.plot = file.path( temp.files.path, "PvalHist.png") +png( temp.pval.plot, width=500, height=500 ) +hist(res$pval, breaks=100, col="skyblue", border="slateblue", main="") +dev.off() + +vsd <- getVarianceStabilizedData( cdsBlind ) + +temp.heatmap.plot = file.path( temp.files.path, "heatmap.png" ) +png( temp.heatmap.plot, width=500, height=500 ) +select <- order(res$pval)[1:100] +colors <- colorRampPalette(c("white","darkblue"))(100) +heatmap( vsd[select,], col = colors, scale = "none", Rowv=NULL, main="") +dev.off() + + +mod_lfc <- (rowMeans( vsd[, conditions(cds)==comparison[2], drop=FALSE] ) - rowMeans( vsd[, conditions(cds)==comparison[1], drop=FALSE] )) +lfc <- res$log2FoldChange +finite <- is.finite(lfc) +table(as.character(lfc[!finite]), useNA="always") +largeNumber <- 10 +lfc <- ifelse(finite, lfc, sign(lfc) * largeNumber) + +temp.modlfc.plot = file.path( temp.files.path, "modlfc.png" ) +png( temp.modlfc.plot, width=500, height=500 ) +plot( lfc, mod_lfc, pch=20, cex=.3, col = ifelse( finite, "#80808040", "red" ) ) +abline( a=0, b=1, col="#40404040" ) +dev.off() + +temp.sampclust.plot = file.path( temp.files.path, "sampclust.png" ) +png( temp.sampclust.plot, width=500, height=500 ) +dists <- dist( t( vsd ) ) +heatmap( as.matrix( dists ), + symm=TRUE, scale="none", margins=c(10,10), + col = colorRampPalette(c("darkblue","white"))(100), + labRow = paste( pData(cdsBlind)$condition, pData(cdsBlind)$type ) ) +dev.off() + + +library(aroma.light) +library(lattice) + +mdsPlot <- function(x, conds=NULL, cex=1, ...) { + d <- dist(x) + + mds.fit <- cmdscale(d, eig=TRUE, k=2) + + mds.d <- data.frame(x1=mds.fit$points[, 1], + x2=mds.fit$points[, 2], + labels=rownames(x)) + if (!is.null(conds)) + mds.d$treatment <- as.factor(conds) + + if (!is.null(conds)) { + p <- xyplot(x2 ~ x1, group=treatment, data=mds.d, panel=function(x, y, ..., groups, subscripts) { + panel.text(x, y, mds.d$labels[subscripts], cex=cex, col=trellis.par.get()$superpose.line$col[groups]) + }, ...) + } else { + p <- xyplot(x2 ~ x1, data=mds.d, panel=function(x, y, ..., groups, subscripts) { + panel.text(x, y, mds.d$labels[subscripts], cex=cex) + }, ...) + } + return(p) +} + +if (!norep) { + dn = normalizeQuantileRank(as.matrix(d)) + p = mdsPlot(t(log10(dn+1)), conds=conds, xlab="dimension 1", ylab="dimension 2", main="") + + temp.mds.plot = file.path( temp.files.path, "mds.png" ) + png( temp.mds.plot, width=500, height=500 ) + print(p) + dev.off() +} + +file.conn = file(diag.html, open="w") +writeLines( c("<html><body bgcolor='lightgray'>"), file.conn) +writeLines( c("<h2><u>Diagnostic Plots for ", counts.name, "</u></h2>"), file.conn) +writeLines( c("<h3>Dispersion Estimates</h3>"), file.conn) +writeLines( c("<img src='DispersionEstimatePlot.png'><br/><br/>"), file.conn) +writeLines( c("<h3>Differential Expression - Base Mean vs. Log2 Fold Change</h3>"), file.conn) +writeLines( c("<img src='DiffExpPlot.png'><br/><br/>"), file.conn) +writeLines( c("<h3>P-value histogram</h3>"), file.conn) +writeLines( c("<img src='PvalHist.png'><br/><br/>"), file.conn) +writeLines( c("<h3>Top 100 Genes/Transcripts by P-value</h3>"), file.conn) +writeLines( c("<img src='heatmap.png'><br/><br/>"), file.conn) +writeLines( c("<h3>Moderated LFC vs. LFC</h3>"), file.conn) +writeLines( c("<img src='modlfc.png'><br/><br/>"), file.conn) +writeLines( c("<h3>VST Sample Clustering</h3>"), file.conn) +writeLines( c("<img src='sampclust.png'><br/><br/>"), file.conn) +writeLines( c("<h3>MDS Plot</h3>"), file.conn) + +if (!norep) { + writeLines( c("<img src='mds.png'><br/><br/>"), file.conn) +} + +if (norep) { + writeLines( c("<h4>MDS plot not produced for unreplicated data</h4>"), file.conn) +} + +writeLines( c("</body></html>"), file.conn) +close(file.conn) + +sink(NULL)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/deseq.xml Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,31 @@ +<tool id="deseq" name="DE Seq"> + <description>Run Differential Expression analysis from SAM To Count data</description> + + <command interpreter="python"> + stderr_wrapper.py Rscript ${GALAXY_DATA_INDEX_DIR}/deseq.R $counts $column_types $comparison $top_table $diagnostic_html "$diagnostic_html.files_path" "$counts.name" + </command> + + <inputs> + <param format="tabular" name="counts" type="data" optional="false" label="Counts file (from sam2counts)" help="Must have same number of samples as the column types field. E.g. if the column types field is 'kidney,kidney,liver,liver', then there must be 4 sample columns in the counts file."/> + + <param name="column_types" type="text" size="40" optional="false" label="Column Types in counts file" help="A comma separated list (no spaces) of the types of the data columns using the same name for replicates. E.g. kidney,kidney,kidney,liver,liver,liver"> + <validator type="empty_field"/> + <validator type="regex" message="Must be a comma-separated list with no spaces">^(\w+,)+\w+$</validator> + </param> + + <param name="comparison" type="text" size="30" optional="false" label="Comparison type" help="A comma separated tuple (no spaces) of the comparison you want to do. Must use the names from the Column Types list. E.g. comparing kidney to liver: kidney,liver. Comparing liver to kidney: liver,kidney"> + <validator type="empty_field"/> + <validator type="regex" message="Must be a comma-separated tuple with no spaces">^\w+,\w+$</validator> + </param> + </inputs> + + <outputs> + <data format="tabular" name="top_table" label="Top Table from ${tool.name} on ${on_string}"/> + <data format="html" name="diagnostic_html" label="Diagnostic Plots for ${tool.name} on ${on_string}"/> + </outputs> + + <help> + NOTE: This DEseq Galaxy tool can only be run on counts files that are created from SAM files that have been aligned to a transcriptome. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/sam2counts.xml Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,39 @@ +<tool id="sam2counts" name="SAM To Counts"> + <description>Produce count data from SAM files</description> + + <requirements> + <requirement type="python-module">sam2counts_galaxy.py</requirement> + </requirements> + + <command interpreter="python"> + sam2counts_galaxy.py + + ${first_input} + #for $input_file in $input_files: + ${input_file.additional_input} + #end for + + -o $counts + -l $colnames + </command> + + <inputs> + <param name="colnames" type="text" size="30" label="Short names for samples" help="A comma-separated list of short names for the samples in order"> + <validator type="empty_field"/> + <validator type="regex" message="Must be a comma-separated tuple with no spaces">^(\w+,)+\w+$</validator> + </param> + + <param format="sam" name="first_input" type="data" label="SAM file" help=""/> + <repeat name="input_files" title="Additional SAM Files"> + <param format="sam" name="additional_input" type="data" label="SAM file" help=""/> + </repeat> + </inputs> + + <outputs> + <data format="tabular" name="counts"/> + </outputs> + + <help> + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/sam2counts_galaxy.py Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,209 @@ +#!/usr/bin/env python + +""" +count.py -- Take SAM files and output a table of counts with column +names that are the filenames, and rowname that are the reference +names. + +Author: Vince Buffalo +Email: vsbuffaloAAAAA@gmail.com (with poly-A tail removed) +""" + +VERSION = 0.91 + +import sys +import csv +from os import path +try: + import pysam +except ImportError: + sys.exit("pysam not installed; please install it\n") + +from optparse import OptionParser + +def SAM_file_to_counts(filename, bam=False, extra=False, use_all_references=True): + """ + Take SAM filename, and create a hash of mapped and unmapped reads; + keys are reference sequences, values are the counts of occurences. + + Also, a hash of qualities (either 0 or >0) of mapped reads + is output, which is handy for diagnostics. + """ + counts = dict() + unique = dict() + nonunique = dict() + mode = 'r' + if bam: + mode = 'rb' + sf = pysam.Samfile(filename, mode) + + if use_all_references: + # Make dictionary of all entries in header + for sn in sf.header['SQ']: + if extra: + unique[sn['SN']] = 0 + nonunique[sn['SN']] = 0 + counts[sn['SN']] = 0 + + for read in sf: + if not read.is_unmapped: + id_name = sf.getrname(read.rname) if read.rname != -1 else 0 + + if not use_all_references and not counts.get(id_name, False): + ## Only make keys based on aligning reads, make empty hash + if extra: + unique[id_name] = 0 + nonunique[id_name] = 0 + ## initiate entry; even if not mapped, record 0 count + counts[id_name] = counts.get(id_name, 0) + + + counts[id_name] = counts.get(id_name, 0) + 1 + + if extra: + if read.mapq == 0: + nonunique[id_name] = nonunique[id_name] + 1 + else: + unique[id_name] = unique[id_name] + 1 + + if extra: + return {'counts':counts, 'unique':unique, 'nonunique':nonunique} + + return {'counts':counts} + +def collapsed_nested_count_dict(counts_dict, all_ids, order=None): + """ + Takes a nested dictionary `counts_dict` and `all_ids`, which is + built with the `table_dict`. All files (first keys) in + `counts_dict` are made into columns with order specified by + `order`. + + Output is a dictionary with keys that are the id's (genes or + transcripts), with values that are ordered counts. A header will + be created on the first row from the ordered columns (extracted + from filenames). + """ + if order is None: + col_order = counts_dict.keys() + else: + col_order = order + + collapsed_dict = dict() + for i, filename in enumerate(col_order): + for id_name in all_ids: + if not collapsed_dict.get(id_name, False): + collapsed_dict[id_name] = list() + + # get counts and append + c = counts_dict[filename].get(id_name, 0) + collapsed_dict[id_name].append(c) + return {'table':collapsed_dict, 'header':col_order} + + +def counts_to_file(table_dict, outfilename, delimiter=',', labels=''): + """ + A function for its side-effect of writing `table_dict` (which + contains a table and header), to `outfilename` with the specified + `delimiter`. + """ + writer = csv.writer(open(outfilename, 'w'), delimiter=delimiter) + table = table_dict['table'] + if labels: + header = labels.split(',') + else: + header = table_dict['header'] + + header_row = True + for id_name, fields in table.items(): + if header_row: + row = ['id'] + header + writer.writerow(row) + header_row = False + + if id_name == 0: + continue + row = [id_name] + row.extend(fields) + writer.writerow(row) + +if __name__ == '__main__': + parser = OptionParser() + parser.add_option("-d", "--delimiter", dest="delimiter", + help="the delimiter (default: tab)", default='\t') + parser.add_option("-o", "--out-file", dest="out_file", + help="output filename (default: counts.txt)", + default='counts.txt', action="store", type="string") + parser.add_option("-u", "--extra-output", dest="extra_out", + help="output extra information on non-unique and unique mappers (default: False)", + default=False, action="store_true") + parser.add_option("-b", "--bam", dest="bam", + help="all input files are BAM (default: False)", + default=False, action="store_true") + parser.add_option("-r", "--use-all-references", dest="use_all_references", + help="Use all the references from the SAM header (default: True)", + default=True, action="store_false") + parser.add_option("-f", "--extra-out-files", dest="extra_out_files", + help="comma-delimited filenames of unique and non-unique output " + "(default: unique.txt,nonunique.txt)", + default='unique.txt,nonunique.txt', action="store", type="string") + parser.add_option("-v", "--verbose", dest="verbose", + help="enable verbose output") + parser.add_option("-l", "--columns-labels", dest="col_labels", help="comma-delimited label names for samples", action="store", type="string") + + (options, args) = parser.parse_args() + + if len(args) < 1: + parser.error("one or more SAM files as arguments required") + + file_counts = dict() + file_unique_counts = dict() + file_nonunique_counts = dict() + all_ids = list() + files = [path.basename(f) for f in args] + + if options.col_labels and len(files) != len(options.col_labels.split(',')): + parser.error("Number of sample names does not equal number of files") + + if len(set(files)) != len(set(args)): + parser.error("file args must have unique base names (i.e. no foo/bar joo/bar)") + + ## do a pre-run check that all files exist + for full_filename in args: + if not path.exists(full_filename): + parser.error("file '%s' does not exist" % full_filename) + + for full_filename in args: + filename = path.basename(full_filename) + ## read in SAM file, extract counts, and unpack counts + tmp = SAM_file_to_counts(full_filename, bam=options.bam, extra=options.extra_out, + use_all_references=options.use_all_references) + + if options.extra_out: + counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique'] + else: + counts = tmp['counts'] + + ## save counts, and unique/non-unique counts + file_counts[filename] = counts + + if options.extra_out: + file_unique_counts[filename] = unique + file_nonunique_counts[filename] = nonunique + + ## add all ids encountered in this in this file + all_ids.extend(file_counts[filename].keys()) + + ## Uniquify all_ids, and then take the nested file_counts + ## dictionary, collapse, and write to file. + all_ids = set(all_ids) + table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=files) + counts_to_file(table_dict, options.out_file, delimiter=options.delimiter, labels=options.col_labels) + + if options.extra_out: + unique_fn, nonunique_fn = options.extra_out_files.split(',') + unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files) + nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files) + + counts_to_file(unique_table_dict, unique_fn, delimiter=options.delimiter) + counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=options.delimiter) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/stderr_wrapper.py Thu Jul 05 21:02:43 2012 -0400 @@ -0,0 +1,57 @@ +#!/usr/bin/python + +""" +Wrapper that executes a program with its arguments but reports standard error +messages only if the program exit status was not 0. This is useful to prevent +Galaxy to interpret that there was an error if something was printed on stderr, +e.g. if this was simply a warning. +Example: ./stderr_wrapper.py myprog arg1 -f arg2 +Author: Florent Angly +""" + +import sys, subprocess + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + # Get command-line arguments + args = sys.argv + # Remove name of calling program, i.e. ./stderr_wrapper.py + args.pop(0) + # If there are no arguments left, we're done + if len(args) == 0: + return + + # If one needs to silence stdout + args.append( ">" ) + args.append( "/dev/null" ) + + #cmdline = " ".join(args) + #print cmdline + try: + # Run program + proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE ) + returncode = proc.wait() + # Capture stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + # Running Grinder failed: write error message to stderr + if returncode != 0: + raise Exception, stderr + except Exception, e: + # Running Grinder failed: write error message to stderr + stop_err( 'Error: ' + str( e ) ) + + +if __name__ == "__main__": __main__()