Mercurial > repos > fubar > digital_dge
diff rgDGE.py @ 0:1959becd0592
Initial upload of files for DGE tool
author | fubar |
---|---|
date | Fri, 09 Sep 2011 01:07:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rgDGE.py Fri Sep 09 01:07:48 2011 -0400 @@ -0,0 +1,338 @@ +# Copyright ross lazarus +# september 2011 +# all rights reserved +# for the Rgenetics project +# all rights reserved +# licensed to you under the terms of the LGPL as documented at http://www.gnu.org/copyleft/lesser.html + +import sys +import shutil +import subprocess +import os +import time +import tempfile +import optparse + +progname = os.path.split(sys.argv[0])[1] +myversion = 'V000.2 September 2011' +verbose = False +debug = False + + +galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> +<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> +<title></title> +<link rel="stylesheet" href="/static/style/base.css" type="text/css" /> +</head> +<body> +<div class="document"> +""" +galhtmlattr = """<b><a href="http://rgenetics.org">Galaxy Rgenetics</a> tool output %s run at %s</b><br/>""" +galhtmlpostfix = """</div></body></html>\n""" + +DGESCRIPT=""" +#### edgeR.Rscript +# +# Performs DGE on a count table containing n replicates of two conditions +# +# Parameters +# +# 1 - Output Dir + +# Writen by: S.Lunke and A.Kaspi +#### + + +usage <- function(){ + print("#### edgeR.R", quote=F) + print("", quote=F) + print("Performs DGE on a count table containing n replicates of two conditions", quote=F) + print("", quote=F) + print("USAGE: Rscript edgeR.R <OUT_DIR> <INPUT> <TreatmentName> <ControlName> <Treatcol1,2,3> <Controlcol1,2,3>", quote=F) + print("", quote=F) + print(" Parameters", quote=F) + print("", quote=F) + print(" 1 - Output Dir", quote=F) + print(" 2 - Input File", quote=F) + print(" 3 - Treatment name", quote=F) + print(" 4 - Treatment Columns i.e. 3,4,6", quote=F) + print(" 5 - Control name", quote=F) + print(" 6 - Control Columns i.e. 2,7,8", quote=F) + print(" 7 - Output tabular file name", quote=F) + + + print("", quote=F) + print(" Interface writen by: S.Lunke and A.Kaspi", quote=F) + print(" Makes extensive use of the edgeR software - Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818", quote=F) + print("####", quote=F) + q() + } + + +## edgeIt runs edgeR +edgeIt <- function (Input,group,outputfilename) { + + + ## Error handling + if (length(unique(group))!=2){ + print("Number of conditions identified in experiment does not equal 2") + q() + } + + #The workhorse + require(edgeR) + + ## Setup DGEList object + DGEList <- DGEList(Count_Matrix, group = group) + + #Extract T v C names + TName=unique(group)[1] + CName=unique(group)[2] + + # Outfile name - we need something more predictable than + # outfile <- paste(Out_Dir,"/",TName,"_Vs_",CName, sep="") + # since it needs to be renamed and squirted into the history so added as a paramter + + # Filter all tags that have less than one read per million in half + 1 or more libraries. n = ceiling(length(colnames(DGEList))/2)+1 + n = ceiling(length(colnames(DGEList))/2)+1 + DGEList <- DGEList[rowSums(1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList)) > 1) >= n, ] + + + ## Run tagwise dispersion test and DGE analysis + prior.n <- ceiling(50/(length(group)-length(2))) + DGEList <- calcNormFactors(DGEList) + DGEList <- estimateCommonDisp(DGEList) + DGEList <- estimateTagwiseDisp(DGEList, prior.n=prior.n, trend=T, grid.length=1000) + DE <- exactTest(DGEList, common.disp=F) + + ## Normalized data RPM + normData <- (1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList))) + colnames(normData) <- paste( "norm",colnames(normData),sep="_") + + + #Prepare our output file + output <- cbind( + Name=as.character(rownames(DGEList$counts)), + DE$table, + adj.p.value=p.adjust(DE$table$p.value, method="fdr"), + Dispersion=DGEList$tagwise.dispersion,normData, + DGEList$counts + ) + soutput = output[order(output$p.val),] # sorted into p value order - for quick toptable + #Write output + write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F) + + + ## Plot MAplot + pdf("Smearplot.pdf") + #TODO colour siggenes + nsig = sum(output$adj.p.value < 0.05) + deTags = rownames(topTags(DE,n=nsig)$table) + plotSmear(DGEList,de.tags=deTags,main=paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@0.05, N = ',nsig,')',sep='')) + grid(col="blue") + dev.off() + + ## Plot MDS + pdf("MDSplot.pdf") + plotMDS.dge(DGEList,xlim=c(-2,1),main=paste("MDS Plot for",TName,'Vs',CName),cex=0.5) + grid(col="blue") + dev.off() + + #Return our main table + output + +} #Done + +#### MAIN #### + +parameters <- commandArgs(trailingOnly=T) + +## Error handling +if (length(parameters) < 6){ + print("Not enough Input files supplied. Specify at least two input files.", quote=F) + print("", quote=F) + usage() +} + +Out_Dir <- as.character(parameters[1]) +Input <- as.character(parameters[2]) +TreatmentName <- as.character(parameters[3]) +TreatmentCols <- as.character(parameters[4]) +ControlName <- as.character(parameters[5]) +ControlCols <- as.character(parameters[6]) +outputfilename <- as.character(parameters[7]) + + +#Set our columns +TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 #+1 +CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 #+1 +cat('## got TCols=') +cat(TCols) +cat('; CCols=') +cat(CCols) +cat('\n') + + +group<-strsplit(TreatmentCols,",")[[1]] + +## Create output dir if non existent + if (file.exists(Out_Dir) == F) dir.create(Out_Dir) + +Count_Matrix<-read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header +Count_Matrix<-Count_Matrix[,c(TCols,CCols)] #extract columns we care about +group<-c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor +colnames(Count_Matrix) <- paste(group,colnames(Count_Matrix),sep="_") #Relable columns + + +results <- edgeIt(Input,group,outputfilename) #Run the main function +# for the log +sessionInfo() + +""" + +def timenow(): + """return current time as a string + """ + return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) + +class DGE: + """class is a wrapper for DGE - note hard coded script above so it's all in one place for Galaxy""" + + def __init__(self,myName=None,opts=None): + """ + Rscript edgeR.R results HGHN_mRNA_counts.txt HiGly 2,3 Control 1,4 +Out_Dir <- as.character(parameters[1]) +Input <- as.character(parameters[2]) +TreatmentName <- as.character(parameters[3]) +TreatmentCols <- as.character(parameters[4]) +ControlName <- as.character(parameters[5]) +ControlCols <- as.character(parameters[6]) +outputfilename <- as.character(parameters[7]) + """ + self.thumbformat = 'jpg' + self.tlog = os.path.join(opts.output_dir,"DGE_runner.log") + self.opts = opts + self.myName=myName + self.cl = [] + a = self.cl.append + rfname = os.path.join(opts.output_dir,'DGE.R') + rscript = open(rfname,'w') + rscript.write(DGESCRIPT) + rscript.close() + a('Rscript') + a(rfname) + a("%s" % opts.output_dir) + a("%s" % opts.input_matrix) + a("%s" % opts.treatname) + a("%s" % opts.treatcols) + a("%s" % opts.ctrlname) + a("%s" % opts.ctrlcols) + a("%s" % opts.output_tab) + + def compressPDF(self,inpdf=None,thumbformat='png'): + """need absolute path to pdf + """ + assert os.path.isfile(inpdf), "## Input %s supplied to %s compressPDF not found" % (inpdf,self.myName) + sto = open(self.tlog,'a') + outpdf = '%s_compressed' % inpdf + cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-sOutputFile=%s" % outpdf,inpdf] + x = subprocess.Popen(cl,stdout=sto,stderr=sto,cwd=self.opts.output_dir) + retval1 = x.wait() + if retval1 == 0: + os.unlink(inpdf) + shutil.move(outpdf,inpdf) + outpng = '%s.%s' % (os.path.splitext(inpdf)[0],thumbformat) + cl2 = ['convert', inpdf, outpng] + x = subprocess.Popen(cl2,stdout=sto,stderr=sto,cwd=self.opts.output_dir) + retval2 = x.wait() + sto.close() + retval = retval1 or retval2 + return retval + + def runDGE(self): + """ + """ + sto = open(self.tlog,'w') + x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir) + retval = x.wait() + sto.close() + flist = os.listdir(self.opts.output_dir) + flist.sort() + html = [galhtmlprefix % progname,] + html.append('<h2>Galaxy DGE outputs run at %s<h2></br>Click on the images below to download high quality PDF versions</br>\n' % timenow()) + if len(flist) > 0: + html.append('<table>\n') + for fname in flist: + dname,e = os.path.splitext(fname) + if e.lower() == '.pdf' : # compress and make a thumbnail + thumb = '%s.%s' % (dname,self.thumbformat) + pdff = os.path.join(self.opts.output_dir,fname) + retval = self.compressPDF(inpdf=pdff,thumbformat=self.thumbformat) + if retval == 0: + s= '<tr><td><a href="%s"><img src="%s" title="Click to download a print quality PDF of %s" hspace="10" width="600"></a></td></tr>' \ + % (fname,thumb,fname) + else: + dname = '%s (thumbnail image not_found)' % fname + s= '<tr><td><a href="%s">%s</a></td></tr>' % (fname,dname) + html.append(s) + else: + html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,fname)) + html.append('</table>\n') + else: + html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>') + html.append('<h3>R log follows below</h3><hr><pre>\n') + rlog = open(self.tlog,'r').readlines() + html += rlog + html.append('%s CL = %s</br>\n' % (self.myName,' '.join(sys.argv))) + html.append('DGE.R CL = %s</br>\n' % (' '.join(self.cl))) + html.append('</pre>\n') + html.append(galhtmlattr % (progname,timenow())) + html.append(galhtmlpostfix) + htmlf = file(self.opts.output_html,'w') + htmlf.write('\n'.join(html)) + htmlf.write('\n') + htmlf.close() + + +def main(): + u = """ + This is a Galaxy wrapper. It expects to be called by DGE.xml as: + <command interpreter="python">rgDGE.py --output_html "$html_file.files_path" --input_matrix "$input1" --treatcols "$Treat_cols" --treatname "$treatment_name" +--ctrlcols "$Control_cols" + --ctrlname "$control_name" --output_tab "$outtab" --output_html "$html_file" --output_dir "$html_file.files_path" --method "edgeR" + </command> + """ + op = optparse.OptionParser() + a = op.add_option + a('--input_matrix',default=None) + a('--output_tab',default=None) + a('--output_html',default=None) + a('--treatcols',default=None) + a('--treatname',default='Treatment') + a('--ctrlcols',default=None) + a('--ctrlname',default='Control') + a('--output_dir',default=None) + a('--method',default='edgeR') + opts, args = op.parse_args() + assert opts.input_matrix and opts.output_html,u + assert os.path.isfile(opts.input_matrix),'## DGE runner unable to open supplied input file %s' % opts.input_matrix + assert opts.treatcols,'## DGE runner requires a comma separated list of treatment columns eg --treatcols 4,5,6' + assert opts.treatcols,'## DGE runner requires a comma separated list of control columns eg --ctlcols 2,3,7' + myName=os.path.split(sys.argv[0])[-1] + if not os.path.exists(opts.output_dir): + os.makedirs(opts.output_dir) + m = DGE(myName, opts=opts) + retcode = m.runDGE() + if retcode: + sys.exit(retcode) # indicate failure to job runner + + + +if __name__ == "__main__": + main() + + +