diff rgDGE.py @ 0:1959becd0592

Initial upload of files for DGE tool
author fubar
date Fri, 09 Sep 2011 01:07:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rgDGE.py	Fri Sep 09 01:07:48 2011 -0400
@@ -0,0 +1,338 @@
+# Copyright ross lazarus
+# september 2011 
+# all rights reserved
+# for the Rgenetics project
+# all rights reserved
+# licensed to you under the terms of the LGPL as documented at http://www.gnu.org/copyleft/lesser.html
+
+import sys 
+import shutil 
+import subprocess 
+import os 
+import time 
+import tempfile 
+import optparse
+
+progname = os.path.split(sys.argv[0])[1] 
+myversion = 'V000.2 September 2011' 
+verbose = False 
+debug = False
+
+
+galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> 
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 
+<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
+<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> 
+<title></title> 
+<link rel="stylesheet" href="/static/style/base.css" type="text/css" /> 
+</head> 
+<body> 
+<div class="document"> 
+""" 
+galhtmlattr = """<b><a href="http://rgenetics.org">Galaxy Rgenetics</a> tool output %s run at %s</b><br/>""" 
+galhtmlpostfix = """</div></body></html>\n"""
+
+DGESCRIPT="""
+#### edgeR.Rscript
+#
+# Performs DGE on a count table containing n replicates of two conditions
+#
+# Parameters
+#
+# 1 - Output Dir
+
+# Writen by: S.Lunke and A.Kaspi
+####
+
+
+usage <- function(){
+  print("#### edgeR.R", quote=F)
+  print("", quote=F)
+  print("Performs DGE on a count table containing n replicates of two conditions", quote=F)
+  print("", quote=F)
+  print("USAGE: Rscript edgeR.R <OUT_DIR> <INPUT> <TreatmentName> <ControlName> <Treatcol1,2,3> <Controlcol1,2,3>", quote=F)
+  print("", quote=F)
+  print(" Parameters", quote=F)
+  print("", quote=F)
+  print(" 1 - Output Dir", quote=F)
+  print(" 2 - Input File", quote=F)
+  print(" 3 - Treatment name", quote=F)
+  print(" 4 - Treatment Columns i.e. 3,4,6", quote=F)
+  print(" 5 - Control name", quote=F)
+  print(" 6 - Control Columns i.e. 2,7,8", quote=F)
+  print(" 7 - Output tabular file name", quote=F)
+
+
+  print("", quote=F)
+  print(" Interface writen by: S.Lunke and A.Kaspi", quote=F)
+  print(" Makes extensive use of the edgeR software - Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818", quote=F)
+  print("####", quote=F)
+  q()
+   }
+
+
+## edgeIt runs edgeR
+edgeIt <- function (Input,group,outputfilename) {
+
+
+        ## Error handling
+        if (length(unique(group))!=2){
+                print("Number of conditions identified in experiment does not equal 2")
+                q()
+        }
+
+        #The workhorse  
+        require(edgeR)
+
+        ## Setup DGEList object
+        DGEList <- DGEList(Count_Matrix, group = group)
+
+        #Extract T v C names
+        TName=unique(group)[1]
+        CName=unique(group)[2]
+
+        # Outfile name - we need something more predictable than   
+        # outfile <- paste(Out_Dir,"/",TName,"_Vs_",CName, sep="")
+        # since it needs to be renamed and squirted into the history so added as a paramter
+ 
+        # Filter all tags that have less than one read per million in half + 1 or more libraries. n = ceiling(length(colnames(DGEList))/2)+1
+        n = ceiling(length(colnames(DGEList))/2)+1
+        DGEList <- DGEList[rowSums(1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList)) > 1) >= n, ]
+
+
+        ## Run tagwise dispersion test and DGE analysis
+        prior.n <- ceiling(50/(length(group)-length(2)))
+        DGEList <- calcNormFactors(DGEList)
+        DGEList <- estimateCommonDisp(DGEList)
+        DGEList <- estimateTagwiseDisp(DGEList, prior.n=prior.n, trend=T, grid.length=1000)
+        DE <- exactTest(DGEList, common.disp=F)
+  
+        ## Normalized data RPM
+        normData <- (1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList)))
+        colnames(normData) <- paste( "norm",colnames(normData),sep="_")
+
+
+        #Prepare our output file
+        output <- cbind( 
+                Name=as.character(rownames(DGEList$counts)),
+                DE$table,
+                adj.p.value=p.adjust(DE$table$p.value, method="fdr"),
+                Dispersion=DGEList$tagwise.dispersion,normData,
+                DGEList$counts
+        )
+        soutput = output[order(output$p.val),] # sorted into p value order - for quick toptable
+        #Write output
+        write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
+
+
+        ## Plot MAplot
+        pdf("Smearplot.pdf")
+        #TODO colour siggenes
+        nsig = sum(output$adj.p.value < 0.05)
+        deTags = rownames(topTags(DE,n=nsig)$table)
+        plotSmear(DGEList,de.tags=deTags,main=paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@0.05, N = ',nsig,')',sep=''))
+        grid(col="blue")
+        dev.off()
+
+        ## Plot MDS
+        pdf("MDSplot.pdf")
+        plotMDS.dge(DGEList,xlim=c(-2,1),main=paste("MDS Plot for",TName,'Vs',CName),cex=0.5)
+        grid(col="blue")
+        dev.off()
+
+        #Return our main table
+        output 
+  
+}       #Done
+
+#### MAIN ####
+
+parameters <- commandArgs(trailingOnly=T)
+
+## Error handling
+if (length(parameters) < 6){
+  print("Not enough Input files supplied. Specify at least two input files.", quote=F)
+  print("", quote=F)
+  usage()
+}
+
+Out_Dir         <- as.character(parameters[1])
+Input           <- as.character(parameters[2])
+TreatmentName   <- as.character(parameters[3])
+TreatmentCols   <- as.character(parameters[4])
+ControlName     <- as.character(parameters[5])
+ControlCols     <- as.character(parameters[6])
+outputfilename  <- as.character(parameters[7])
+
+
+#Set our columns 
+TCols           = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 #+1
+CCols           = as.numeric(strsplit(ControlCols,",")[[1]])-1 #+1
+cat('## got TCols=')
+cat(TCols)
+cat('; CCols=')
+cat(CCols)
+cat('\n')
+
+
+group<-strsplit(TreatmentCols,",")[[1]]
+
+## Create output dir if non existent
+  if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
+
+Count_Matrix<-read.table(Input,header=T,row.names=1,sep='\t')                           #Load tab file assume header
+Count_Matrix<-Count_Matrix[,c(TCols,CCols)]                                             #extract columns we care about
+group<-c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
+colnames(Count_Matrix) <- paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
+
+
+results <- edgeIt(Input,group,outputfilename)                                           #Run the main function
+# for the log
+sessionInfo()
+
+"""
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+    
+class DGE:
+    """class is a wrapper for DGE - note hard coded script above so it's all in one place for Galaxy"""
+
+    def __init__(self,myName=None,opts=None):
+        """
+        Rscript edgeR.R results HGHN_mRNA_counts.txt HiGly 2,3 Control 1,4
+Out_Dir         <- as.character(parameters[1])
+Input           <- as.character(parameters[2])
+TreatmentName   <- as.character(parameters[3])
+TreatmentCols   <- as.character(parameters[4])
+ControlName     <- as.character(parameters[5])
+ControlCols     <- as.character(parameters[6])
+outputfilename  <- as.character(parameters[7])
+        """
+        self.thumbformat = 'jpg'
+        self.tlog = os.path.join(opts.output_dir,"DGE_runner.log")
+        self.opts = opts
+        self.myName=myName
+        self.cl = []
+        a = self.cl.append
+        rfname = os.path.join(opts.output_dir,'DGE.R')
+        rscript = open(rfname,'w')
+        rscript.write(DGESCRIPT)
+        rscript.close()
+        a('Rscript')
+        a(rfname)
+        a("%s" % opts.output_dir)
+        a("%s" % opts.input_matrix)
+        a("%s" % opts.treatname)
+        a("%s" % opts.treatcols)
+        a("%s" % opts.ctrlname)
+        a("%s" % opts.ctrlcols)
+        a("%s" % opts.output_tab)        
+    
+    def compressPDF(self,inpdf=None,thumbformat='png'):
+        """need absolute path to pdf
+        """
+        assert os.path.isfile(inpdf), "## Input %s supplied to %s compressPDF not found" % (inpdf,self.myName)
+        sto = open(self.tlog,'a')
+        outpdf = '%s_compressed' % inpdf
+        cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-sOutputFile=%s" % outpdf,inpdf]
+        x = subprocess.Popen(cl,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
+        retval1 = x.wait()
+        if retval1 == 0:
+            os.unlink(inpdf)
+            shutil.move(outpdf,inpdf)
+        outpng = '%s.%s' % (os.path.splitext(inpdf)[0],thumbformat)
+        cl2 = ['convert', inpdf, outpng]
+        x = subprocess.Popen(cl2,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
+        retval2 = x.wait()
+        sto.close()
+        retval = retval1 or retval2
+        return retval
+
+    def runDGE(self):
+        """
+        """
+        sto = open(self.tlog,'w')
+        x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
+        retval = x.wait()
+        sto.close()
+        flist = os.listdir(self.opts.output_dir)
+        flist.sort()
+        html = [galhtmlprefix % progname,]
+        html.append('<h2>Galaxy DGE outputs run at %s<h2></br>Click on the images below to download high quality PDF versions</br>\n' % timenow())
+        if len(flist) > 0:
+            html.append('<table>\n')
+            for fname in flist:
+                dname,e = os.path.splitext(fname)
+                if e.lower() == '.pdf' : # compress and make a thumbnail
+                    thumb = '%s.%s' % (dname,self.thumbformat)
+                    pdff = os.path.join(self.opts.output_dir,fname)
+                    retval = self.compressPDF(inpdf=pdff,thumbformat=self.thumbformat)
+                    if retval == 0:
+                        s= '<tr><td><a href="%s"><img src="%s" title="Click to download a print quality PDF of %s" hspace="10" width="600"></a></td></tr>' \
+                         % (fname,thumb,fname)
+                    else:
+                        dname = '%s (thumbnail image not_found)' % fname
+                        s= '<tr><td><a href="%s">%s</a></td></tr>' % (fname,dname)
+                    html.append(s)
+                else:
+                   html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,fname))
+            html.append('</table>\n')
+        else:
+            html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>')
+        html.append('<h3>R log follows below</h3><hr><pre>\n')
+        rlog = open(self.tlog,'r').readlines()
+        html += rlog
+        html.append('%s CL = %s</br>\n' % (self.myName,' '.join(sys.argv)))
+        html.append('DGE.R CL = %s</br>\n' % (' '.join(self.cl)))
+        html.append('</pre>\n')
+        html.append(galhtmlattr % (progname,timenow()))
+        html.append(galhtmlpostfix)
+        htmlf = file(self.opts.output_html,'w')
+        htmlf.write('\n'.join(html))
+        htmlf.write('\n')
+        htmlf.close()
+  
+
+def main():
+    u = """
+    This is a Galaxy wrapper. It expects to be called by DGE.xml as:
+    <command interpreter="python">rgDGE.py --output_html "$html_file.files_path" --input_matrix "$input1" --treatcols "$Treat_cols" --treatname "$treatment_name" 
+--ctrlcols "$Control_cols"
+      --ctrlname "$control_name" --output_tab "$outtab" --output_html "$html_file" --output_dir "$html_file.files_path" --method "edgeR"
+    </command>
+    """
+    op = optparse.OptionParser()
+    a = op.add_option
+    a('--input_matrix',default=None)
+    a('--output_tab',default=None)
+    a('--output_html',default=None)
+    a('--treatcols',default=None)
+    a('--treatname',default='Treatment')
+    a('--ctrlcols',default=None)
+    a('--ctrlname',default='Control')
+    a('--output_dir',default=None)
+    a('--method',default='edgeR')
+    opts, args = op.parse_args()
+    assert opts.input_matrix and opts.output_html,u
+    assert os.path.isfile(opts.input_matrix),'## DGE runner unable to open supplied input file %s' % opts.input_matrix
+    assert opts.treatcols,'## DGE runner requires a comma separated list of treatment columns eg --treatcols 4,5,6'
+    assert opts.treatcols,'## DGE runner requires a comma separated list of control columns eg --ctlcols 2,3,7'
+    myName=os.path.split(sys.argv[0])[-1]
+    if not os.path.exists(opts.output_dir):
+        os.makedirs(opts.output_dir)
+    m = DGE(myName, opts=opts)
+    retcode = m.runDGE()
+    if retcode:
+        sys.exit(retcode) # indicate failure to job runner
+ 
+ 
+ 
+if __name__ == "__main__":
+    main()
+
+
+