Mercurial > repos > fubar > digital_dge
comparison rgDGE.py @ 0:1959becd0592
Initial upload of files for DGE tool
author | fubar |
---|---|
date | Fri, 09 Sep 2011 01:07:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1959becd0592 |
---|---|
1 # Copyright ross lazarus | |
2 # september 2011 | |
3 # all rights reserved | |
4 # for the Rgenetics project | |
5 # all rights reserved | |
6 # licensed to you under the terms of the LGPL as documented at http://www.gnu.org/copyleft/lesser.html | |
7 | |
8 import sys | |
9 import shutil | |
10 import subprocess | |
11 import os | |
12 import time | |
13 import tempfile | |
14 import optparse | |
15 | |
16 progname = os.path.split(sys.argv[0])[1] | |
17 myversion = 'V000.2 September 2011' | |
18 verbose = False | |
19 debug = False | |
20 | |
21 | |
22 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
23 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
24 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
25 <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
26 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | |
27 <title></title> | |
28 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
29 </head> | |
30 <body> | |
31 <div class="document"> | |
32 """ | |
33 galhtmlattr = """<b><a href="http://rgenetics.org">Galaxy Rgenetics</a> tool output %s run at %s</b><br/>""" | |
34 galhtmlpostfix = """</div></body></html>\n""" | |
35 | |
36 DGESCRIPT=""" | |
37 #### edgeR.Rscript | |
38 # | |
39 # Performs DGE on a count table containing n replicates of two conditions | |
40 # | |
41 # Parameters | |
42 # | |
43 # 1 - Output Dir | |
44 | |
45 # Writen by: S.Lunke and A.Kaspi | |
46 #### | |
47 | |
48 | |
49 usage <- function(){ | |
50 print("#### edgeR.R", quote=F) | |
51 print("", quote=F) | |
52 print("Performs DGE on a count table containing n replicates of two conditions", quote=F) | |
53 print("", quote=F) | |
54 print("USAGE: Rscript edgeR.R <OUT_DIR> <INPUT> <TreatmentName> <ControlName> <Treatcol1,2,3> <Controlcol1,2,3>", quote=F) | |
55 print("", quote=F) | |
56 print(" Parameters", quote=F) | |
57 print("", quote=F) | |
58 print(" 1 - Output Dir", quote=F) | |
59 print(" 2 - Input File", quote=F) | |
60 print(" 3 - Treatment name", quote=F) | |
61 print(" 4 - Treatment Columns i.e. 3,4,6", quote=F) | |
62 print(" 5 - Control name", quote=F) | |
63 print(" 6 - Control Columns i.e. 2,7,8", quote=F) | |
64 print(" 7 - Output tabular file name", quote=F) | |
65 | |
66 | |
67 print("", quote=F) | |
68 print(" Interface writen by: S.Lunke and A.Kaspi", quote=F) | |
69 print(" Makes extensive use of the edgeR software - Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818", quote=F) | |
70 print("####", quote=F) | |
71 q() | |
72 } | |
73 | |
74 | |
75 ## edgeIt runs edgeR | |
76 edgeIt <- function (Input,group,outputfilename) { | |
77 | |
78 | |
79 ## Error handling | |
80 if (length(unique(group))!=2){ | |
81 print("Number of conditions identified in experiment does not equal 2") | |
82 q() | |
83 } | |
84 | |
85 #The workhorse | |
86 require(edgeR) | |
87 | |
88 ## Setup DGEList object | |
89 DGEList <- DGEList(Count_Matrix, group = group) | |
90 | |
91 #Extract T v C names | |
92 TName=unique(group)[1] | |
93 CName=unique(group)[2] | |
94 | |
95 # Outfile name - we need something more predictable than | |
96 # outfile <- paste(Out_Dir,"/",TName,"_Vs_",CName, sep="") | |
97 # since it needs to be renamed and squirted into the history so added as a paramter | |
98 | |
99 # Filter all tags that have less than one read per million in half + 1 or more libraries. n = ceiling(length(colnames(DGEList))/2)+1 | |
100 n = ceiling(length(colnames(DGEList))/2)+1 | |
101 DGEList <- DGEList[rowSums(1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList)) > 1) >= n, ] | |
102 | |
103 | |
104 ## Run tagwise dispersion test and DGE analysis | |
105 prior.n <- ceiling(50/(length(group)-length(2))) | |
106 DGEList <- calcNormFactors(DGEList) | |
107 DGEList <- estimateCommonDisp(DGEList) | |
108 DGEList <- estimateTagwiseDisp(DGEList, prior.n=prior.n, trend=T, grid.length=1000) | |
109 DE <- exactTest(DGEList, common.disp=F) | |
110 | |
111 ## Normalized data RPM | |
112 normData <- (1e+06 * DGEList$counts/expandAsMatrix(DGEList$samples$lib.size, dim(DGEList))) | |
113 colnames(normData) <- paste( "norm",colnames(normData),sep="_") | |
114 | |
115 | |
116 #Prepare our output file | |
117 output <- cbind( | |
118 Name=as.character(rownames(DGEList$counts)), | |
119 DE$table, | |
120 adj.p.value=p.adjust(DE$table$p.value, method="fdr"), | |
121 Dispersion=DGEList$tagwise.dispersion,normData, | |
122 DGEList$counts | |
123 ) | |
124 soutput = output[order(output$p.val),] # sorted into p value order - for quick toptable | |
125 #Write output | |
126 write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F) | |
127 | |
128 | |
129 ## Plot MAplot | |
130 pdf("Smearplot.pdf") | |
131 #TODO colour siggenes | |
132 nsig = sum(output$adj.p.value < 0.05) | |
133 deTags = rownames(topTags(DE,n=nsig)$table) | |
134 plotSmear(DGEList,de.tags=deTags,main=paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@0.05, N = ',nsig,')',sep='')) | |
135 grid(col="blue") | |
136 dev.off() | |
137 | |
138 ## Plot MDS | |
139 pdf("MDSplot.pdf") | |
140 plotMDS.dge(DGEList,xlim=c(-2,1),main=paste("MDS Plot for",TName,'Vs',CName),cex=0.5) | |
141 grid(col="blue") | |
142 dev.off() | |
143 | |
144 #Return our main table | |
145 output | |
146 | |
147 } #Done | |
148 | |
149 #### MAIN #### | |
150 | |
151 parameters <- commandArgs(trailingOnly=T) | |
152 | |
153 ## Error handling | |
154 if (length(parameters) < 6){ | |
155 print("Not enough Input files supplied. Specify at least two input files.", quote=F) | |
156 print("", quote=F) | |
157 usage() | |
158 } | |
159 | |
160 Out_Dir <- as.character(parameters[1]) | |
161 Input <- as.character(parameters[2]) | |
162 TreatmentName <- as.character(parameters[3]) | |
163 TreatmentCols <- as.character(parameters[4]) | |
164 ControlName <- as.character(parameters[5]) | |
165 ControlCols <- as.character(parameters[6]) | |
166 outputfilename <- as.character(parameters[7]) | |
167 | |
168 | |
169 #Set our columns | |
170 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 #+1 | |
171 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 #+1 | |
172 cat('## got TCols=') | |
173 cat(TCols) | |
174 cat('; CCols=') | |
175 cat(CCols) | |
176 cat('\n') | |
177 | |
178 | |
179 group<-strsplit(TreatmentCols,",")[[1]] | |
180 | |
181 ## Create output dir if non existent | |
182 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) | |
183 | |
184 Count_Matrix<-read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header | |
185 Count_Matrix<-Count_Matrix[,c(TCols,CCols)] #extract columns we care about | |
186 group<-c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor | |
187 colnames(Count_Matrix) <- paste(group,colnames(Count_Matrix),sep="_") #Relable columns | |
188 | |
189 | |
190 results <- edgeIt(Input,group,outputfilename) #Run the main function | |
191 # for the log | |
192 sessionInfo() | |
193 | |
194 """ | |
195 | |
196 def timenow(): | |
197 """return current time as a string | |
198 """ | |
199 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) | |
200 | |
201 class DGE: | |
202 """class is a wrapper for DGE - note hard coded script above so it's all in one place for Galaxy""" | |
203 | |
204 def __init__(self,myName=None,opts=None): | |
205 """ | |
206 Rscript edgeR.R results HGHN_mRNA_counts.txt HiGly 2,3 Control 1,4 | |
207 Out_Dir <- as.character(parameters[1]) | |
208 Input <- as.character(parameters[2]) | |
209 TreatmentName <- as.character(parameters[3]) | |
210 TreatmentCols <- as.character(parameters[4]) | |
211 ControlName <- as.character(parameters[5]) | |
212 ControlCols <- as.character(parameters[6]) | |
213 outputfilename <- as.character(parameters[7]) | |
214 """ | |
215 self.thumbformat = 'jpg' | |
216 self.tlog = os.path.join(opts.output_dir,"DGE_runner.log") | |
217 self.opts = opts | |
218 self.myName=myName | |
219 self.cl = [] | |
220 a = self.cl.append | |
221 rfname = os.path.join(opts.output_dir,'DGE.R') | |
222 rscript = open(rfname,'w') | |
223 rscript.write(DGESCRIPT) | |
224 rscript.close() | |
225 a('Rscript') | |
226 a(rfname) | |
227 a("%s" % opts.output_dir) | |
228 a("%s" % opts.input_matrix) | |
229 a("%s" % opts.treatname) | |
230 a("%s" % opts.treatcols) | |
231 a("%s" % opts.ctrlname) | |
232 a("%s" % opts.ctrlcols) | |
233 a("%s" % opts.output_tab) | |
234 | |
235 def compressPDF(self,inpdf=None,thumbformat='png'): | |
236 """need absolute path to pdf | |
237 """ | |
238 assert os.path.isfile(inpdf), "## Input %s supplied to %s compressPDF not found" % (inpdf,self.myName) | |
239 sto = open(self.tlog,'a') | |
240 outpdf = '%s_compressed' % inpdf | |
241 cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-sOutputFile=%s" % outpdf,inpdf] | |
242 x = subprocess.Popen(cl,stdout=sto,stderr=sto,cwd=self.opts.output_dir) | |
243 retval1 = x.wait() | |
244 if retval1 == 0: | |
245 os.unlink(inpdf) | |
246 shutil.move(outpdf,inpdf) | |
247 outpng = '%s.%s' % (os.path.splitext(inpdf)[0],thumbformat) | |
248 cl2 = ['convert', inpdf, outpng] | |
249 x = subprocess.Popen(cl2,stdout=sto,stderr=sto,cwd=self.opts.output_dir) | |
250 retval2 = x.wait() | |
251 sto.close() | |
252 retval = retval1 or retval2 | |
253 return retval | |
254 | |
255 def runDGE(self): | |
256 """ | |
257 """ | |
258 sto = open(self.tlog,'w') | |
259 x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir) | |
260 retval = x.wait() | |
261 sto.close() | |
262 flist = os.listdir(self.opts.output_dir) | |
263 flist.sort() | |
264 html = [galhtmlprefix % progname,] | |
265 html.append('<h2>Galaxy DGE outputs run at %s<h2></br>Click on the images below to download high quality PDF versions</br>\n' % timenow()) | |
266 if len(flist) > 0: | |
267 html.append('<table>\n') | |
268 for fname in flist: | |
269 dname,e = os.path.splitext(fname) | |
270 if e.lower() == '.pdf' : # compress and make a thumbnail | |
271 thumb = '%s.%s' % (dname,self.thumbformat) | |
272 pdff = os.path.join(self.opts.output_dir,fname) | |
273 retval = self.compressPDF(inpdf=pdff,thumbformat=self.thumbformat) | |
274 if retval == 0: | |
275 s= '<tr><td><a href="%s"><img src="%s" title="Click to download a print quality PDF of %s" hspace="10" width="600"></a></td></tr>' \ | |
276 % (fname,thumb,fname) | |
277 else: | |
278 dname = '%s (thumbnail image not_found)' % fname | |
279 s= '<tr><td><a href="%s">%s</a></td></tr>' % (fname,dname) | |
280 html.append(s) | |
281 else: | |
282 html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,fname)) | |
283 html.append('</table>\n') | |
284 else: | |
285 html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>') | |
286 html.append('<h3>R log follows below</h3><hr><pre>\n') | |
287 rlog = open(self.tlog,'r').readlines() | |
288 html += rlog | |
289 html.append('%s CL = %s</br>\n' % (self.myName,' '.join(sys.argv))) | |
290 html.append('DGE.R CL = %s</br>\n' % (' '.join(self.cl))) | |
291 html.append('</pre>\n') | |
292 html.append(galhtmlattr % (progname,timenow())) | |
293 html.append(galhtmlpostfix) | |
294 htmlf = file(self.opts.output_html,'w') | |
295 htmlf.write('\n'.join(html)) | |
296 htmlf.write('\n') | |
297 htmlf.close() | |
298 | |
299 | |
300 def main(): | |
301 u = """ | |
302 This is a Galaxy wrapper. It expects to be called by DGE.xml as: | |
303 <command interpreter="python">rgDGE.py --output_html "$html_file.files_path" --input_matrix "$input1" --treatcols "$Treat_cols" --treatname "$treatment_name" | |
304 --ctrlcols "$Control_cols" | |
305 --ctrlname "$control_name" --output_tab "$outtab" --output_html "$html_file" --output_dir "$html_file.files_path" --method "edgeR" | |
306 </command> | |
307 """ | |
308 op = optparse.OptionParser() | |
309 a = op.add_option | |
310 a('--input_matrix',default=None) | |
311 a('--output_tab',default=None) | |
312 a('--output_html',default=None) | |
313 a('--treatcols',default=None) | |
314 a('--treatname',default='Treatment') | |
315 a('--ctrlcols',default=None) | |
316 a('--ctrlname',default='Control') | |
317 a('--output_dir',default=None) | |
318 a('--method',default='edgeR') | |
319 opts, args = op.parse_args() | |
320 assert opts.input_matrix and opts.output_html,u | |
321 assert os.path.isfile(opts.input_matrix),'## DGE runner unable to open supplied input file %s' % opts.input_matrix | |
322 assert opts.treatcols,'## DGE runner requires a comma separated list of treatment columns eg --treatcols 4,5,6' | |
323 assert opts.treatcols,'## DGE runner requires a comma separated list of control columns eg --ctlcols 2,3,7' | |
324 myName=os.path.split(sys.argv[0])[-1] | |
325 if not os.path.exists(opts.output_dir): | |
326 os.makedirs(opts.output_dir) | |
327 m = DGE(myName, opts=opts) | |
328 retcode = m.runDGE() | |
329 if retcode: | |
330 sys.exit(retcode) # indicate failure to job runner | |
331 | |
332 | |
333 | |
334 if __name__ == "__main__": | |
335 main() | |
336 | |
337 | |
338 |