7
|
1 '''
|
|
2 the scripts takes two files as input, and compute the coverage of
|
|
3 features in input 1 across features in input 2. Features in input 2 are
|
|
4 divided into bins and coverage is computed for each bin.
|
|
5
|
|
6 please check the help information by typing:
|
|
7
|
|
8 python align.py -h
|
|
9
|
|
10
|
|
11 requirement:
|
|
12 please install the following tools first:
|
|
13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/
|
|
14
|
|
15 '''
|
|
16
|
|
17 import os,sys,os.path
|
|
18 from optparse import OptionParser
|
|
19
|
|
20 def lineCount(filename):
|
|
21 with open(filename) as f:
|
|
22 for i, l in enumerate(f):
|
|
23 pass
|
|
24 return i + 1
|
|
25
|
|
26 def combineFilename(f1,f2):
|
|
27 '''
|
|
28 fuse two file names into one
|
|
29 '''
|
|
30 return f1.split('/')[-1]+'-'+f2.split('/')[-1]
|
|
31
|
|
32 def checkFormat(filename1,filename2,input1format):
|
|
33 '''
|
|
34 check the format of input files
|
|
35 '''
|
|
36
|
|
37 # file1
|
|
38 # read the first line, see how many filds
|
|
39 ncol1 = 6
|
|
40 if input1format == "BED":
|
|
41 f = open(filename1)
|
|
42 line = f.readline().strip().split('\t')
|
|
43 ncol1 = len(line)
|
|
44 if ncol1 < 3:
|
|
45 print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
|
|
46 sys.exit(1)
|
|
47 f.close()
|
|
48
|
|
49 # file2
|
|
50 f = open(filename2)
|
|
51 line = f.readline().strip().split('\t')
|
|
52 ncol2 = len(line)
|
|
53 if ncol2 < 3:
|
|
54 print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
|
|
55 sys.exit(1)
|
|
56
|
|
57 return ncol1,ncol2
|
|
58
|
|
59
|
|
60 def makeBed(filename,ncol):
|
|
61 '''
|
|
62 add up to 6 column
|
|
63 '''
|
|
64 f = open(filename)
|
|
65 outfile = filename+'.tmp.bed'
|
|
66 outf = open(outfile,'w')
|
|
67 if ncol == 3:
|
|
68 for line in f:
|
|
69 outf.write(line.strip()+'\t.\t0\t+\n')
|
|
70 elif ncol == 4:
|
|
71 for line in f:
|
|
72 outf.write(line.strip()+'\t0\t+\n')
|
|
73 if ncol == 5:
|
|
74 for line in f:
|
|
75 outf.write(line.strip()+'\t+\n')
|
|
76 f.close()
|
|
77 outf.close()
|
|
78 return outfile
|
|
79
|
|
80 def makeWindow(filename,window):
|
|
81
|
|
82 outfile = filename+'-window='+str(window)+'.tmp.bed'
|
|
83 if not os.path.exists(outfile):
|
|
84 f=open(filename)
|
|
85 out = open(outfile,'w')
|
|
86 lines = f.readlines()
|
|
87 if 'track' in lines[0]:
|
|
88 del lines[0]
|
|
89 for line in lines:
|
|
90 flds = line.strip().split('\t')
|
|
91
|
|
92 #new position
|
|
93 center = (int(flds[1]) + int(flds[2]))/2
|
|
94 start = center - window
|
|
95 end = center + window
|
|
96 if start >= 0:
|
|
97 flds[1] = str(start)
|
|
98 flds[2] = str(end)
|
|
99 out.write('\t'.join(flds)+'\n')
|
|
100 f.close()
|
|
101 out.close()
|
|
102 return outfile
|
|
103
|
|
104 def groupReadsMapped2aRegion(filename,ncol):
|
|
105 '''
|
|
106 read output from intersectBED
|
|
107 find all reads mapped to each region
|
|
108 '''
|
|
109 try:
|
|
110 f=open(filename)
|
|
111 #If filename cannot be opened, print an error message and exit
|
|
112 except IOError:
|
|
113 print "could not open",filename,"Are you sure this file exists?"
|
|
114 sys.exit(1)
|
|
115 lines = f.readlines()
|
|
116
|
|
117 allReadsStart = {}
|
|
118 allReadsEnd = {}
|
|
119 regionStrand = {}
|
|
120 regionStart = {}
|
|
121 regionEnd = {}
|
|
122
|
|
123 for line in lines:
|
|
124 flds = line.strip().split('\t')
|
|
125 key = '_'.join(flds[ncol:(ncol+4)])
|
|
126 if not allReadsStart.has_key(key):
|
|
127 allReadsStart[key] = list()
|
|
128 allReadsEnd[key] = list()
|
|
129 #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
|
|
130 allReadsStart[key].append(int(flds[1]))
|
|
131 allReadsEnd[key].append(int(flds[2]))
|
|
132 regionStrand[key] = flds[ncol+5]
|
|
133 regionStart[key] = int(flds[ncol+1])
|
|
134 regionEnd[key] = int(flds[ncol+2])
|
|
135 return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
|
|
136
|
|
137
|
|
138 def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
|
|
139 '''
|
|
140 each region is divided into nbins
|
|
141 compute the number of reads covering each bin for each region
|
|
142 '''
|
|
143 RegionProfile = {}
|
|
144 nRead = {} # num of all reads in the region
|
|
145 for region in allReadsStart.keys():
|
|
146 RegionProfile[region] = [0]*nbins
|
|
147 nRead[region] = len(allReadsStart[region])
|
|
148 #print region,nRead[region],allReadsStart[region]
|
|
149 for i in range(nRead[region]):
|
|
150 RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins)
|
|
151 return RegionProfile,nRead
|
|
152
|
|
153 def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
|
|
154 '''
|
|
155 each region is divided into nbins,
|
|
156 add 1 to each bin covered by the read
|
|
157 '''
|
|
158 L = regionEnd-regionStart
|
|
159 start = int(nbins*(readStart-regionStart)/L)
|
|
160 end = int(nbins*(readEnd-regionStart)/L)
|
|
161 if start < 0:
|
|
162 start = 0
|
|
163 if end > nbins:
|
|
164 end = nbins
|
|
165 if strand == '-':
|
|
166 for i in range(start,end):
|
|
167 RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1
|
|
168 else: # if the 6th column of the input is not strand, will treat as + strand by default
|
|
169 for i in range(start,end):
|
|
170 RegionCount[i] = RegionCount[i] + 1
|
|
171 return RegionCount
|
|
172
|
|
173 def saveProfile(filename,Profile,nRegion):
|
|
174 out = open(filename,'w')
|
|
175 for regionType in Profile.keys():
|
|
176 #print Profile[regionType]
|
|
177 out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')
|
|
178
|
|
179 def saveSummary(filename,Profile,nbin):
|
|
180 out = open(filename+'.summary','w')
|
|
181
|
|
182 nfeat = len(Profile)
|
|
183 summaryprofile = [0]*nbin
|
|
184 for regionType in Profile.keys():
|
|
185 for i in range(nbin):
|
|
186 summaryprofile[i] += Profile[regionType][i]
|
|
187 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n')
|
|
188 out.close()
|
|
189 # calculate standard error
|
|
190 out = open(filename+'.standarderror','w')
|
|
191 sd = [0.0]*nbin
|
|
192 u = [0.0]*nbin
|
|
193 for i in range(nbin):
|
|
194 u[i] = float(summaryprofile[i])/nfeat
|
|
195 for regionType in Profile.keys():
|
|
196 sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2
|
|
197 sd[i] = sd[i]**0.5 / nfeat
|
|
198 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n')
|
|
199 out.close()
|
|
200
|
|
201 def main():
|
|
202 usage = "usage: %prog [options] -a inputA -b inputB"
|
|
203 parser = OptionParser(usage)
|
|
204 parser.add_option("-a", dest="inputA",
|
|
205 help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )
|
|
206 parser.add_option("-b",dest="inputB",
|
|
207 help="(required) input file B, interval file" )
|
|
208 parser.add_option("-f",dest="aformat",default="BED",
|
|
209 help="Format of input file A. Can be BED (default) or BAM")
|
|
210 parser.add_option("-w",type='int',dest="window",
|
|
211 help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )
|
|
212 parser.add_option("-n", type="int", dest="nbins",default=100,
|
|
213 help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")
|
|
214 parser.add_option("-s",action="store_true", dest="strandness",
|
|
215 help="enforce strandness: require overlapping on the same strand. Default is off")
|
|
216 parser.add_option("-p",action="store_true", dest="plot",default=False,
|
|
217 help="load existed intersectBed outputfile")
|
|
218 parser.add_option("-q", action="store_true", dest="quiet",default=False,
|
|
219 help="suppress output on screen")
|
|
220 parser.add_option("-o", dest="output_data",
|
|
221 help="(optional) output coverage file (txt) name." )
|
|
222 parser.add_option("-v", dest="output_plot",
|
|
223 help="(optional) output plot (pdf) file name." )
|
|
224 parser.add_option("-l", dest="plot_title", default="",
|
|
225 help="(optional) output title of the plot." )
|
|
226 parser.add_option("--ylim", dest="ylim", default="min,max",
|
|
227 help="(optional) ylim of the plot" )
|
|
228 parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
|
|
229 help="save profile summary only (no data for individual features)")
|
|
230 parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
|
|
231 help="compute and plot standard deviation for each bin. used when --summary-only is on")
|
|
232 parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
|
|
233 help="save profile only (no plot)")
|
|
234 parser.add_option("--span", type="float", dest="span",default=0.1,
|
|
235 help="loess span smooth parameter, 0.1 ~ 1")
|
|
236
|
|
237 (options, args) = parser.parse_args()
|
|
238
|
|
239 if options.inputA == None or options.inputB == None:
|
|
240 parser.error("Please specify two input files!!")
|
|
241
|
|
242 if not options.quiet:
|
|
243 print "checking input file format..."
|
|
244
|
|
245 ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)
|
|
246
|
|
247 if ncol2 < 6:
|
|
248 options.inputB = makeBed(options.inputB,ncol2)
|
|
249 if not options.quiet:
|
|
250 print "fill up 6 columns"
|
|
251
|
|
252 if options.window > 0:
|
|
253 if not options.quiet:
|
|
254 print "making windows from "+options.inputB+"..."
|
|
255 options.inputB = makeWindow(options.inputB,options.window)
|
|
256
|
|
257 output = combineFilename(str(options.inputA),str(options.inputB))
|
|
258
|
|
259 if not options.plot:
|
|
260 if options.aformat == "BAM":
|
|
261 cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
|
|
262 else:
|
|
263 cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
|
|
264 if options.strandness:
|
|
265 cmd = cmd + ' -s'
|
|
266 cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
|
|
267 if not options.quiet:
|
|
268 print "search for overlappings: "+cmd
|
|
269 status = os.system(cmd)
|
|
270 if status != 0:
|
|
271 sys.exit(1)
|
|
272
|
|
273
|
|
274 if not options.quiet:
|
|
275 print 'group reads mapped to the same region...'
|
|
276
|
|
277 allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
|
|
278
|
|
279 if len(allReadsStart) == 0:
|
|
280 if not options.quiet:
|
|
281 print 'no overlap found!!'
|
|
282 os.system('rm *tmp.*')
|
|
283 sys.exit(1)
|
|
284
|
|
285 if not options.quiet:
|
|
286 print 'count number of reads mapped to each bin...'
|
|
287
|
|
288 RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)
|
|
289
|
|
290 if options.output_data == None:
|
|
291 options.output_data = output+'.txt'
|
|
292
|
|
293 if options.summary_only:
|
|
294 saveSummary(options.output_data,RegionProfile,options.nbins)
|
|
295
|
|
296 else:
|
|
297 saveProfile(options.output_data,RegionProfile,nRead)
|
|
298
|
|
299 if not options.quiet:
|
|
300 print 'results saved to: '+ options.output_data
|
|
301
|
|
302 if not (options.summary_only or options.profile_only ):
|
|
303 # visualize
|
|
304
|
|
305 if options.window < 1:
|
|
306 xlab = 'relative position (bins)'
|
|
307 else:
|
|
308 xlab = 'relative position (bp)'
|
|
309
|
|
310 if options.output_plot == None:
|
|
311 options.output_plot = output+'.pdf'
|
|
312
|
|
313 title = options.plot_title+'\n n = '+str(len(RegionProfile))
|
|
314
|
|
315 rscript = open("tmp.r","w")
|
|
316 rscript.write("x <- read.table('"+options.output_data+"')\n")
|
|
317 rscript.write("pdf('"+options.output_plot+"')\n")
|
|
318 rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
|
|
319 rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
|
|
320
|
|
321 if options.window == 0:
|
|
322 rscript.write("xticks <- seq("+str(options.nbins)+")\n")
|
|
323 else:
|
|
324 rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
|
|
325
|
|
326 if options.ylim != 'min,max':
|
|
327 rscript.write("ylim=c("+options.ylim+")\n")
|
|
328 else:
|
|
329 rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
|
|
330 rscript.write("par(cex=1.5)\n")
|
|
331 #smooth
|
|
332 if options.span >= 0.1:
|
|
333 rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
|
|
334 rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
|
|
335 rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
|
|
336 rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
|
|
337 rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
|
|
338 #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
|
|
339 #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
|
|
340 #rscript.write("lines(xticks,avg,lwd=2)\n")
|
|
341 #rscript.write("lines(xticks,avg-err,col='green')\n")
|
|
342 #rscript.write("lines(xticks,avg+err,col='green')\n")
|
|
343 rscript.write("dev.off()\n")
|
|
344 rscript.close()
|
|
345
|
|
346 os.system("R --vanilla < tmp.r")
|
|
347
|
|
348 # remove intermediate output
|
|
349 os.system('rm *tmp.*')
|
|
350
|
|
351
|
|
352 if __name__ == "__main__":
|
|
353 main()
|