Mercurial > repos > xuebing > sharplab_interval_analysis
comparison alignr.py @ 2:a861f40db890
Uploaded
author | xuebing |
---|---|
date | Tue, 20 Mar 2012 15:15:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:1a177d17b27d | 2:a861f40db890 |
---|---|
1 ''' | |
2 the scripts takes two files as input, and compute the coverage of | |
3 features in input 1 across features in input 2. Features in input 2 are | |
4 divided into bins and coverage is computed for each bin. | |
5 | |
6 please check the help information by typing: | |
7 | |
8 python align.py -h | |
9 | |
10 | |
11 requirement: | |
12 please install the following tools first: | |
13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ | |
14 | |
15 ''' | |
16 | |
17 import os,sys,os.path | |
18 from optparse import OptionParser | |
19 | |
20 def lineCount(filename): | |
21 with open(filename) as f: | |
22 for i, l in enumerate(f): | |
23 pass | |
24 return i + 1 | |
25 | |
26 def combineFilename(f1,f2): | |
27 ''' | |
28 fuse two file names into one | |
29 ''' | |
30 return f1.split('/')[-1]+'-'+f2.split('/')[-1] | |
31 | |
32 def checkFormat(filename1,filename2,input1format): | |
33 ''' | |
34 check the format of input files | |
35 ''' | |
36 | |
37 # file1 | |
38 # read the first line, see how many filds | |
39 ncol1 = 6 | |
40 if input1format == "BED": | |
41 f = open(filename1) | |
42 line = f.readline().strip().split('\t') | |
43 ncol1 = len(line) | |
44 if ncol1 < 3: | |
45 print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." | |
46 sys.exit(1) | |
47 f.close() | |
48 | |
49 # file2 | |
50 f = open(filename2) | |
51 line = f.readline().strip().split('\t') | |
52 ncol2 = len(line) | |
53 if ncol2 < 3: | |
54 print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." | |
55 sys.exit(1) | |
56 | |
57 return ncol1,ncol2 | |
58 | |
59 | |
60 def makeBed(filename,ncol): | |
61 ''' | |
62 add up to 6 column | |
63 ''' | |
64 f = open(filename) | |
65 outfile = filename+'.tmp.bed' | |
66 outf = open(outfile,'w') | |
67 if ncol == 3: | |
68 for line in f: | |
69 outf.write(line.strip()+'\t.\t0\t+\n') | |
70 elif ncol == 4: | |
71 for line in f: | |
72 outf.write(line.strip()+'\t0\t+\n') | |
73 if ncol == 5: | |
74 for line in f: | |
75 outf.write(line.strip()+'\t+\n') | |
76 f.close() | |
77 outf.close() | |
78 return outfile | |
79 | |
80 def makeWindow(filename,window): | |
81 | |
82 outfile = filename+'-window='+str(window)+'.tmp.bed' | |
83 if not os.path.exists(outfile): | |
84 f=open(filename) | |
85 out = open(outfile,'w') | |
86 lines = f.readlines() | |
87 if 'track' in lines[0]: | |
88 del lines[0] | |
89 for line in lines: | |
90 flds = line.strip().split('\t') | |
91 | |
92 #new position | |
93 center = (int(flds[1]) + int(flds[2]))/2 | |
94 start = center - window | |
95 end = center + window | |
96 if start >= 0: | |
97 flds[1] = str(start) | |
98 flds[2] = str(end) | |
99 out.write('\t'.join(flds)+'\n') | |
100 f.close() | |
101 out.close() | |
102 return outfile | |
103 | |
104 def groupReadsMapped2aRegion(filename,ncol): | |
105 ''' | |
106 read output from intersectBED | |
107 find all reads mapped to each region | |
108 ''' | |
109 try: | |
110 f=open(filename) | |
111 #If filename cannot be opened, print an error message and exit | |
112 except IOError: | |
113 print "could not open",filename,"Are you sure this file exists?" | |
114 sys.exit(1) | |
115 lines = f.readlines() | |
116 | |
117 allReadsStart = {} | |
118 allReadsEnd = {} | |
119 regionStrand = {} | |
120 regionStart = {} | |
121 regionEnd = {} | |
122 | |
123 for line in lines: | |
124 flds = line.strip().split('\t') | |
125 key = '_'.join(flds[ncol:(ncol+4)]) | |
126 if not allReadsStart.has_key(key): | |
127 allReadsStart[key] = list() | |
128 allReadsEnd[key] = list() | |
129 #print flds[ncol+0],flds[ncol+1],flds[ncol+2] | |
130 allReadsStart[key].append(int(flds[1])) | |
131 allReadsEnd[key].append(int(flds[2])) | |
132 regionStrand[key] = flds[ncol+5] | |
133 regionStart[key] = int(flds[ncol+1]) | |
134 regionEnd[key] = int(flds[ncol+2]) | |
135 return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) | |
136 | |
137 | |
138 def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): | |
139 ''' | |
140 each region is divided into nbins | |
141 compute the number of reads covering each bin for each region | |
142 ''' | |
143 RegionProfile = {} | |
144 nRead = {} # num of all reads in the region | |
145 for region in allReadsStart.keys(): | |
146 RegionProfile[region] = [0]*nbins | |
147 nRead[region] = len(allReadsStart[region]) | |
148 #print region,nRead[region],allReadsStart[region] | |
149 for i in range(nRead[region]): | |
150 RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) | |
151 return RegionProfile,nRead | |
152 | |
153 def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): | |
154 ''' | |
155 each region is divided into nbins, | |
156 add 1 to each bin covered by the read | |
157 ''' | |
158 L = regionEnd-regionStart | |
159 start = int(nbins*(readStart-regionStart)/L) | |
160 end = int(nbins*(readEnd-regionStart)/L) | |
161 if start < 0: | |
162 start = 0 | |
163 if end > nbins: | |
164 end = nbins | |
165 if strand == '-': | |
166 for i in range(start,end): | |
167 RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 | |
168 else: # if the 6th column of the input is not strand, will treat as + strand by default | |
169 for i in range(start,end): | |
170 RegionCount[i] = RegionCount[i] + 1 | |
171 return RegionCount | |
172 | |
173 def saveProfile(filename,Profile,nRegion): | |
174 out = open(filename,'w') | |
175 for regionType in Profile.keys(): | |
176 #print Profile[regionType] | |
177 out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') | |
178 | |
179 def saveSummary(filename,Profile,nbin): | |
180 out = open(filename+'.summary','w') | |
181 | |
182 nfeat = len(Profile) | |
183 summaryprofile = [0]*nbin | |
184 for regionType in Profile.keys(): | |
185 for i in range(nbin): | |
186 summaryprofile[i] += Profile[regionType][i] | |
187 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') | |
188 out.close() | |
189 # calculate standard error | |
190 out = open(filename+'.standarderror','w') | |
191 sd = [0.0]*nbin | |
192 u = [0.0]*nbin | |
193 for i in range(nbin): | |
194 u[i] = float(summaryprofile[i])/nfeat | |
195 for regionType in Profile.keys(): | |
196 sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 | |
197 sd[i] = sd[i]**0.5 / nfeat | |
198 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') | |
199 out.close() | |
200 | |
201 def main(): | |
202 usage = "usage: %prog [options] -a inputA -b inputB" | |
203 parser = OptionParser(usage) | |
204 parser.add_option("-a", dest="inputA", | |
205 help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) | |
206 parser.add_option("-b",dest="inputB", | |
207 help="(required) input file B, interval file" ) | |
208 parser.add_option("-f",dest="aformat",default="BED", | |
209 help="Format of input file A. Can be BED (default) or BAM") | |
210 parser.add_option("-w",type='int',dest="window", | |
211 help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) | |
212 parser.add_option("-n", type="int", dest="nbins",default=100, | |
213 help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") | |
214 parser.add_option("-s",action="store_true", dest="strandness", | |
215 help="enforce strandness: require overlapping on the same strand. Default is off") | |
216 parser.add_option("-p",action="store_true", dest="plot",default=False, | |
217 help="load existed intersectBed outputfile") | |
218 parser.add_option("-q", action="store_true", dest="quiet",default=False, | |
219 help="suppress output on screen") | |
220 parser.add_option("-o", dest="output_data", | |
221 help="(optional) output coverage file (txt) name." ) | |
222 parser.add_option("-v", dest="output_plot", | |
223 help="(optional) output plot (pdf) file name." ) | |
224 parser.add_option("-l", dest="plot_title", default="", | |
225 help="(optional) output title of the plot." ) | |
226 parser.add_option("--ylim", dest="ylim", default="min,max", | |
227 help="(optional) ylim of the plot" ) | |
228 parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, | |
229 help="save profile summary only (no data for individual features)") | |
230 parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, | |
231 help="compute and plot standard deviation for each bin. used when --summary-only is on") | |
232 parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, | |
233 help="save profile only (no plot)") | |
234 parser.add_option("--span", type="float", dest="span",default=0.1, | |
235 help="loess span smooth parameter, 0.1 ~ 1") | |
236 | |
237 (options, args) = parser.parse_args() | |
238 | |
239 if options.inputA == None or options.inputB == None: | |
240 parser.error("Please specify two input files!!") | |
241 | |
242 if not options.quiet: | |
243 print "checking input file format..." | |
244 | |
245 ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) | |
246 | |
247 if ncol2 < 6: | |
248 options.inputB = makeBed(options.inputB,ncol2) | |
249 if not options.quiet: | |
250 print "fill up 6 columns" | |
251 | |
252 if options.window > 0: | |
253 if not options.quiet: | |
254 print "making windows from "+options.inputB+"..." | |
255 options.inputB = makeWindow(options.inputB,options.window) | |
256 | |
257 output = combineFilename(str(options.inputA),str(options.inputB)) | |
258 | |
259 if not options.plot: | |
260 if options.aformat == "BAM": | |
261 cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' | |
262 else: | |
263 cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) | |
264 if options.strandness: | |
265 cmd = cmd + ' -s' | |
266 cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' | |
267 if not options.quiet: | |
268 print "search for overlappings: "+cmd | |
269 status = os.system(cmd) | |
270 if status != 0: | |
271 sys.exit(1) | |
272 | |
273 | |
274 if not options.quiet: | |
275 print 'group reads mapped to the same region...' | |
276 | |
277 allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) | |
278 | |
279 if len(allReadsStart) == 0: | |
280 if not options.quiet: | |
281 print 'no overlap found!!' | |
282 os.system('rm *tmp.*') | |
283 sys.exit(1) | |
284 | |
285 if not options.quiet: | |
286 print 'count number of reads mapped to each bin...' | |
287 | |
288 RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) | |
289 | |
290 if options.output_data == None: | |
291 options.output_data = output+'.txt' | |
292 | |
293 if options.summary_only: | |
294 saveSummary(options.output_data,RegionProfile,options.nbins) | |
295 | |
296 else: | |
297 saveProfile(options.output_data,RegionProfile,nRead) | |
298 | |
299 if not options.quiet: | |
300 print 'results saved to: '+ options.output_data | |
301 | |
302 if not (options.summary_only or options.profile_only ): | |
303 # visualize | |
304 | |
305 if options.window < 1: | |
306 xlab = 'relative position (bins)' | |
307 else: | |
308 xlab = 'relative position (bp)' | |
309 | |
310 if options.output_plot == None: | |
311 options.output_plot = output+'.pdf' | |
312 | |
313 title = options.plot_title+'\n n = '+str(len(RegionProfile)) | |
314 | |
315 rscript = open("tmp.r","w") | |
316 rscript.write("x <- read.table('"+options.output_data+"')\n") | |
317 rscript.write("pdf('"+options.output_plot+"')\n") | |
318 rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") | |
319 rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") | |
320 | |
321 if options.window == 0: | |
322 rscript.write("xticks <- seq("+str(options.nbins)+")\n") | |
323 else: | |
324 rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") | |
325 | |
326 if options.ylim != 'min,max': | |
327 rscript.write("ylim=c("+options.ylim+")\n") | |
328 else: | |
329 rscript.write("ylim=c(min(avg-err),max(avg+err))\n") | |
330 rscript.write("par(cex=1.5)\n") | |
331 #smooth | |
332 if options.span >= 0.1: | |
333 rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") | |
334 rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") | |
335 rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") | |
336 rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") | |
337 rscript.write("lines(xticks,avg,type='l',lwd=1)\n") | |
338 #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") | |
339 #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") | |
340 #rscript.write("lines(xticks,avg,lwd=2)\n") | |
341 #rscript.write("lines(xticks,avg-err,col='green')\n") | |
342 #rscript.write("lines(xticks,avg+err,col='green')\n") | |
343 rscript.write("dev.off()\n") | |
344 rscript.close() | |
345 | |
346 os.system("R --vanilla < tmp.r") | |
347 | |
348 # remove intermediate output | |
349 os.system('rm *tmp.*') | |
350 | |
351 | |
352 if __name__ == "__main__": | |
353 main() |