comparison alignr.py @ 2:a861f40db890

Uploaded
author xuebing
date Tue, 20 Mar 2012 15:15:00 -0400
parents
children
comparison
equal deleted inserted replaced
1:1a177d17b27d 2:a861f40db890
1 '''
2 the scripts takes two files as input, and compute the coverage of
3 features in input 1 across features in input 2. Features in input 2 are
4 divided into bins and coverage is computed for each bin.
5
6 please check the help information by typing:
7
8 python align.py -h
9
10
11 requirement:
12 please install the following tools first:
13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/
14
15 '''
16
17 import os,sys,os.path
18 from optparse import OptionParser
19
20 def lineCount(filename):
21 with open(filename) as f:
22 for i, l in enumerate(f):
23 pass
24 return i + 1
25
26 def combineFilename(f1,f2):
27 '''
28 fuse two file names into one
29 '''
30 return f1.split('/')[-1]+'-'+f2.split('/')[-1]
31
32 def checkFormat(filename1,filename2,input1format):
33 '''
34 check the format of input files
35 '''
36
37 # file1
38 # read the first line, see how many filds
39 ncol1 = 6
40 if input1format == "BED":
41 f = open(filename1)
42 line = f.readline().strip().split('\t')
43 ncol1 = len(line)
44 if ncol1 < 3:
45 print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
46 sys.exit(1)
47 f.close()
48
49 # file2
50 f = open(filename2)
51 line = f.readline().strip().split('\t')
52 ncol2 = len(line)
53 if ncol2 < 3:
54 print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
55 sys.exit(1)
56
57 return ncol1,ncol2
58
59
60 def makeBed(filename,ncol):
61 '''
62 add up to 6 column
63 '''
64 f = open(filename)
65 outfile = filename+'.tmp.bed'
66 outf = open(outfile,'w')
67 if ncol == 3:
68 for line in f:
69 outf.write(line.strip()+'\t.\t0\t+\n')
70 elif ncol == 4:
71 for line in f:
72 outf.write(line.strip()+'\t0\t+\n')
73 if ncol == 5:
74 for line in f:
75 outf.write(line.strip()+'\t+\n')
76 f.close()
77 outf.close()
78 return outfile
79
80 def makeWindow(filename,window):
81
82 outfile = filename+'-window='+str(window)+'.tmp.bed'
83 if not os.path.exists(outfile):
84 f=open(filename)
85 out = open(outfile,'w')
86 lines = f.readlines()
87 if 'track' in lines[0]:
88 del lines[0]
89 for line in lines:
90 flds = line.strip().split('\t')
91
92 #new position
93 center = (int(flds[1]) + int(flds[2]))/2
94 start = center - window
95 end = center + window
96 if start >= 0:
97 flds[1] = str(start)
98 flds[2] = str(end)
99 out.write('\t'.join(flds)+'\n')
100 f.close()
101 out.close()
102 return outfile
103
104 def groupReadsMapped2aRegion(filename,ncol):
105 '''
106 read output from intersectBED
107 find all reads mapped to each region
108 '''
109 try:
110 f=open(filename)
111 #If filename cannot be opened, print an error message and exit
112 except IOError:
113 print "could not open",filename,"Are you sure this file exists?"
114 sys.exit(1)
115 lines = f.readlines()
116
117 allReadsStart = {}
118 allReadsEnd = {}
119 regionStrand = {}
120 regionStart = {}
121 regionEnd = {}
122
123 for line in lines:
124 flds = line.strip().split('\t')
125 key = '_'.join(flds[ncol:(ncol+4)])
126 if not allReadsStart.has_key(key):
127 allReadsStart[key] = list()
128 allReadsEnd[key] = list()
129 #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
130 allReadsStart[key].append(int(flds[1]))
131 allReadsEnd[key].append(int(flds[2]))
132 regionStrand[key] = flds[ncol+5]
133 regionStart[key] = int(flds[ncol+1])
134 regionEnd[key] = int(flds[ncol+2])
135 return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
136
137
138 def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
139 '''
140 each region is divided into nbins
141 compute the number of reads covering each bin for each region
142 '''
143 RegionProfile = {}
144 nRead = {} # num of all reads in the region
145 for region in allReadsStart.keys():
146 RegionProfile[region] = [0]*nbins
147 nRead[region] = len(allReadsStart[region])
148 #print region,nRead[region],allReadsStart[region]
149 for i in range(nRead[region]):
150 RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins)
151 return RegionProfile,nRead
152
153 def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
154 '''
155 each region is divided into nbins,
156 add 1 to each bin covered by the read
157 '''
158 L = regionEnd-regionStart
159 start = int(nbins*(readStart-regionStart)/L)
160 end = int(nbins*(readEnd-regionStart)/L)
161 if start < 0:
162 start = 0
163 if end > nbins:
164 end = nbins
165 if strand == '-':
166 for i in range(start,end):
167 RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1
168 else: # if the 6th column of the input is not strand, will treat as + strand by default
169 for i in range(start,end):
170 RegionCount[i] = RegionCount[i] + 1
171 return RegionCount
172
173 def saveProfile(filename,Profile,nRegion):
174 out = open(filename,'w')
175 for regionType in Profile.keys():
176 #print Profile[regionType]
177 out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')
178
179 def saveSummary(filename,Profile,nbin):
180 out = open(filename+'.summary','w')
181
182 nfeat = len(Profile)
183 summaryprofile = [0]*nbin
184 for regionType in Profile.keys():
185 for i in range(nbin):
186 summaryprofile[i] += Profile[regionType][i]
187 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n')
188 out.close()
189 # calculate standard error
190 out = open(filename+'.standarderror','w')
191 sd = [0.0]*nbin
192 u = [0.0]*nbin
193 for i in range(nbin):
194 u[i] = float(summaryprofile[i])/nfeat
195 for regionType in Profile.keys():
196 sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2
197 sd[i] = sd[i]**0.5 / nfeat
198 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n')
199 out.close()
200
201 def main():
202 usage = "usage: %prog [options] -a inputA -b inputB"
203 parser = OptionParser(usage)
204 parser.add_option("-a", dest="inputA",
205 help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )
206 parser.add_option("-b",dest="inputB",
207 help="(required) input file B, interval file" )
208 parser.add_option("-f",dest="aformat",default="BED",
209 help="Format of input file A. Can be BED (default) or BAM")
210 parser.add_option("-w",type='int',dest="window",
211 help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )
212 parser.add_option("-n", type="int", dest="nbins",default=100,
213 help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")
214 parser.add_option("-s",action="store_true", dest="strandness",
215 help="enforce strandness: require overlapping on the same strand. Default is off")
216 parser.add_option("-p",action="store_true", dest="plot",default=False,
217 help="load existed intersectBed outputfile")
218 parser.add_option("-q", action="store_true", dest="quiet",default=False,
219 help="suppress output on screen")
220 parser.add_option("-o", dest="output_data",
221 help="(optional) output coverage file (txt) name." )
222 parser.add_option("-v", dest="output_plot",
223 help="(optional) output plot (pdf) file name." )
224 parser.add_option("-l", dest="plot_title", default="",
225 help="(optional) output title of the plot." )
226 parser.add_option("--ylim", dest="ylim", default="min,max",
227 help="(optional) ylim of the plot" )
228 parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
229 help="save profile summary only (no data for individual features)")
230 parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
231 help="compute and plot standard deviation for each bin. used when --summary-only is on")
232 parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
233 help="save profile only (no plot)")
234 parser.add_option("--span", type="float", dest="span",default=0.1,
235 help="loess span smooth parameter, 0.1 ~ 1")
236
237 (options, args) = parser.parse_args()
238
239 if options.inputA == None or options.inputB == None:
240 parser.error("Please specify two input files!!")
241
242 if not options.quiet:
243 print "checking input file format..."
244
245 ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)
246
247 if ncol2 < 6:
248 options.inputB = makeBed(options.inputB,ncol2)
249 if not options.quiet:
250 print "fill up 6 columns"
251
252 if options.window > 0:
253 if not options.quiet:
254 print "making windows from "+options.inputB+"..."
255 options.inputB = makeWindow(options.inputB,options.window)
256
257 output = combineFilename(str(options.inputA),str(options.inputB))
258
259 if not options.plot:
260 if options.aformat == "BAM":
261 cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
262 else:
263 cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
264 if options.strandness:
265 cmd = cmd + ' -s'
266 cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
267 if not options.quiet:
268 print "search for overlappings: "+cmd
269 status = os.system(cmd)
270 if status != 0:
271 sys.exit(1)
272
273
274 if not options.quiet:
275 print 'group reads mapped to the same region...'
276
277 allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
278
279 if len(allReadsStart) == 0:
280 if not options.quiet:
281 print 'no overlap found!!'
282 os.system('rm *tmp.*')
283 sys.exit(1)
284
285 if not options.quiet:
286 print 'count number of reads mapped to each bin...'
287
288 RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)
289
290 if options.output_data == None:
291 options.output_data = output+'.txt'
292
293 if options.summary_only:
294 saveSummary(options.output_data,RegionProfile,options.nbins)
295
296 else:
297 saveProfile(options.output_data,RegionProfile,nRead)
298
299 if not options.quiet:
300 print 'results saved to: '+ options.output_data
301
302 if not (options.summary_only or options.profile_only ):
303 # visualize
304
305 if options.window < 1:
306 xlab = 'relative position (bins)'
307 else:
308 xlab = 'relative position (bp)'
309
310 if options.output_plot == None:
311 options.output_plot = output+'.pdf'
312
313 title = options.plot_title+'\n n = '+str(len(RegionProfile))
314
315 rscript = open("tmp.r","w")
316 rscript.write("x <- read.table('"+options.output_data+"')\n")
317 rscript.write("pdf('"+options.output_plot+"')\n")
318 rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
319 rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
320
321 if options.window == 0:
322 rscript.write("xticks <- seq("+str(options.nbins)+")\n")
323 else:
324 rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
325
326 if options.ylim != 'min,max':
327 rscript.write("ylim=c("+options.ylim+")\n")
328 else:
329 rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
330 rscript.write("par(cex=1.5)\n")
331 #smooth
332 if options.span >= 0.1:
333 rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
334 rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
335 rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
336 rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
337 rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
338 #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
339 #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
340 #rscript.write("lines(xticks,avg,lwd=2)\n")
341 #rscript.write("lines(xticks,avg-err,col='green')\n")
342 #rscript.write("lines(xticks,avg+err,col='green')\n")
343 rscript.write("dev.off()\n")
344 rscript.close()
345
346 os.system("R --vanilla < tmp.r")
347
348 # remove intermediate output
349 os.system('rm *tmp.*')
350
351
352 if __name__ == "__main__":
353 main()