2
+ − 1 '''
+ − 2 the scripts takes two files as input, and compute the coverage of
+ − 3 features in input 1 across features in input 2. Features in input 2 are
+ − 4 divided into bins and coverage is computed for each bin.
+ − 5
+ − 6 please check the help information by typing:
+ − 7
+ − 8 python align.py -h
+ − 9
+ − 10
+ − 11 requirement:
+ − 12 please install the following tools first:
+ − 13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/
+ − 14
+ − 15 '''
+ − 16
+ − 17 import os,sys,os.path
+ − 18 from optparse import OptionParser
+ − 19
+ − 20 def lineCount(filename):
+ − 21 with open(filename) as f:
+ − 22 for i, l in enumerate(f):
+ − 23 pass
+ − 24 return i + 1
+ − 25
+ − 26 def combineFilename(f1,f2):
+ − 27 '''
+ − 28 fuse two file names into one
+ − 29 '''
+ − 30 return f1.split('/')[-1]+'-'+f2.split('/')[-1]
+ − 31
+ − 32 def checkFormat(filename1,filename2,input1format):
+ − 33 '''
+ − 34 check the format of input files
+ − 35 '''
+ − 36
+ − 37 # file1
+ − 38 # read the first line, see how many filds
+ − 39 ncol1 = 6
+ − 40 if input1format == "BED":
+ − 41 f = open(filename1)
+ − 42 line = f.readline().strip().split('\t')
+ − 43 ncol1 = len(line)
+ − 44 if ncol1 < 3:
+ − 45 print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+ − 46 sys.exit(1)
+ − 47 f.close()
+ − 48
+ − 49 # file2
+ − 50 f = open(filename2)
+ − 51 line = f.readline().strip().split('\t')
+ − 52 ncol2 = len(line)
+ − 53 if ncol2 < 3:
+ − 54 print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+ − 55 sys.exit(1)
+ − 56
+ − 57 return ncol1,ncol2
+ − 58
+ − 59
+ − 60 def makeBed(filename,ncol):
+ − 61 '''
+ − 62 add up to 6 column
+ − 63 '''
+ − 64 f = open(filename)
+ − 65 outfile = filename+'.tmp.bed'
+ − 66 outf = open(outfile,'w')
+ − 67 if ncol == 3:
+ − 68 for line in f:
+ − 69 outf.write(line.strip()+'\t.\t0\t+\n')
+ − 70 elif ncol == 4:
+ − 71 for line in f:
+ − 72 outf.write(line.strip()+'\t0\t+\n')
+ − 73 if ncol == 5:
+ − 74 for line in f:
+ − 75 outf.write(line.strip()+'\t+\n')
+ − 76 f.close()
+ − 77 outf.close()
+ − 78 return outfile
+ − 79
+ − 80 def makeWindow(filename,window):
+ − 81
+ − 82 outfile = filename+'-window='+str(window)+'.tmp.bed'
+ − 83 if not os.path.exists(outfile):
+ − 84 f=open(filename)
+ − 85 out = open(outfile,'w')
+ − 86 lines = f.readlines()
+ − 87 if 'track' in lines[0]:
+ − 88 del lines[0]
+ − 89 for line in lines:
+ − 90 flds = line.strip().split('\t')
+ − 91
+ − 92 #new position
+ − 93 center = (int(flds[1]) + int(flds[2]))/2
+ − 94 start = center - window
+ − 95 end = center + window
+ − 96 if start >= 0:
+ − 97 flds[1] = str(start)
+ − 98 flds[2] = str(end)
+ − 99 out.write('\t'.join(flds)+'\n')
+ − 100 f.close()
+ − 101 out.close()
+ − 102 return outfile
+ − 103
+ − 104 def groupReadsMapped2aRegion(filename,ncol):
+ − 105 '''
+ − 106 read output from intersectBED
+ − 107 find all reads mapped to each region
+ − 108 '''
+ − 109 try:
+ − 110 f=open(filename)
+ − 111 #If filename cannot be opened, print an error message and exit
+ − 112 except IOError:
+ − 113 print "could not open",filename,"Are you sure this file exists?"
+ − 114 sys.exit(1)
+ − 115 lines = f.readlines()
+ − 116
+ − 117 allReadsStart = {}
+ − 118 allReadsEnd = {}
+ − 119 regionStrand = {}
+ − 120 regionStart = {}
+ − 121 regionEnd = {}
+ − 122
+ − 123 for line in lines:
+ − 124 flds = line.strip().split('\t')
+ − 125 key = '_'.join(flds[ncol:(ncol+4)])
+ − 126 if not allReadsStart.has_key(key):
+ − 127 allReadsStart[key] = list()
+ − 128 allReadsEnd[key] = list()
+ − 129 #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
+ − 130 allReadsStart[key].append(int(flds[1]))
+ − 131 allReadsEnd[key].append(int(flds[2]))
+ − 132 regionStrand[key] = flds[ncol+5]
+ − 133 regionStart[key] = int(flds[ncol+1])
+ − 134 regionEnd[key] = int(flds[ncol+2])
+ − 135 return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
+ − 136
+ − 137
+ − 138 def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
+ − 139 '''
+ − 140 each region is divided into nbins
+ − 141 compute the number of reads covering each bin for each region
+ − 142 '''
+ − 143 RegionProfile = {}
+ − 144 nRead = {} # num of all reads in the region
+ − 145 for region in allReadsStart.keys():
+ − 146 RegionProfile[region] = [0]*nbins
+ − 147 nRead[region] = len(allReadsStart[region])
+ − 148 #print region,nRead[region],allReadsStart[region]
+ − 149 for i in range(nRead[region]):
+ − 150 RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins)
+ − 151 return RegionProfile,nRead
+ − 152
+ − 153 def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
+ − 154 '''
+ − 155 each region is divided into nbins,
+ − 156 add 1 to each bin covered by the read
+ − 157 '''
+ − 158 L = regionEnd-regionStart
+ − 159 start = int(nbins*(readStart-regionStart)/L)
+ − 160 end = int(nbins*(readEnd-regionStart)/L)
+ − 161 if start < 0:
+ − 162 start = 0
+ − 163 if end > nbins:
+ − 164 end = nbins
+ − 165 if strand == '-':
+ − 166 for i in range(start,end):
+ − 167 RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1
+ − 168 else: # if the 6th column of the input is not strand, will treat as + strand by default
+ − 169 for i in range(start,end):
+ − 170 RegionCount[i] = RegionCount[i] + 1
+ − 171 return RegionCount
+ − 172
+ − 173 def saveProfile(filename,Profile,nRegion):
+ − 174 out = open(filename,'w')
+ − 175 for regionType in Profile.keys():
+ − 176 #print Profile[regionType]
+ − 177 out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')
+ − 178
+ − 179 def saveSummary(filename,Profile,nbin):
+ − 180 out = open(filename+'.summary','w')
+ − 181
+ − 182 nfeat = len(Profile)
+ − 183 summaryprofile = [0]*nbin
+ − 184 for regionType in Profile.keys():
+ − 185 for i in range(nbin):
+ − 186 summaryprofile[i] += Profile[regionType][i]
+ − 187 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n')
+ − 188 out.close()
+ − 189 # calculate standard error
+ − 190 out = open(filename+'.standarderror','w')
+ − 191 sd = [0.0]*nbin
+ − 192 u = [0.0]*nbin
+ − 193 for i in range(nbin):
+ − 194 u[i] = float(summaryprofile[i])/nfeat
+ − 195 for regionType in Profile.keys():
+ − 196 sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2
+ − 197 sd[i] = sd[i]**0.5 / nfeat
+ − 198 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n')
+ − 199 out.close()
+ − 200
+ − 201 def main():
+ − 202 usage = "usage: %prog [options] -a inputA -b inputB"
+ − 203 parser = OptionParser(usage)
+ − 204 parser.add_option("-a", dest="inputA",
+ − 205 help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )
+ − 206 parser.add_option("-b",dest="inputB",
+ − 207 help="(required) input file B, interval file" )
+ − 208 parser.add_option("-f",dest="aformat",default="BED",
+ − 209 help="Format of input file A. Can be BED (default) or BAM")
+ − 210 parser.add_option("-w",type='int',dest="window",
+ − 211 help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )
+ − 212 parser.add_option("-n", type="int", dest="nbins",default=100,
+ − 213 help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")
+ − 214 parser.add_option("-s",action="store_true", dest="strandness",
+ − 215 help="enforce strandness: require overlapping on the same strand. Default is off")
+ − 216 parser.add_option("-p",action="store_true", dest="plot",default=False,
+ − 217 help="load existed intersectBed outputfile")
+ − 218 parser.add_option("-q", action="store_true", dest="quiet",default=False,
+ − 219 help="suppress output on screen")
+ − 220 parser.add_option("-o", dest="output_data",
+ − 221 help="(optional) output coverage file (txt) name." )
+ − 222 parser.add_option("-v", dest="output_plot",
+ − 223 help="(optional) output plot (pdf) file name." )
+ − 224 parser.add_option("-l", dest="plot_title", default="",
+ − 225 help="(optional) output title of the plot." )
+ − 226 parser.add_option("--ylim", dest="ylim", default="min,max",
+ − 227 help="(optional) ylim of the plot" )
+ − 228 parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
+ − 229 help="save profile summary only (no data for individual features)")
+ − 230 parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
+ − 231 help="compute and plot standard deviation for each bin. used when --summary-only is on")
+ − 232 parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
+ − 233 help="save profile only (no plot)")
+ − 234 parser.add_option("--span", type="float", dest="span",default=0.1,
+ − 235 help="loess span smooth parameter, 0.1 ~ 1")
+ − 236
+ − 237 (options, args) = parser.parse_args()
+ − 238
+ − 239 if options.inputA == None or options.inputB == None:
+ − 240 parser.error("Please specify two input files!!")
+ − 241
+ − 242 if not options.quiet:
+ − 243 print "checking input file format..."
+ − 244
+ − 245 ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)
+ − 246
+ − 247 if ncol2 < 6:
+ − 248 options.inputB = makeBed(options.inputB,ncol2)
+ − 249 if not options.quiet:
+ − 250 print "fill up 6 columns"
+ − 251
+ − 252 if options.window > 0:
+ − 253 if not options.quiet:
+ − 254 print "making windows from "+options.inputB+"..."
+ − 255 options.inputB = makeWindow(options.inputB,options.window)
+ − 256
+ − 257 output = combineFilename(str(options.inputA),str(options.inputB))
+ − 258
+ − 259 if not options.plot:
+ − 260 if options.aformat == "BAM":
+ − 261 cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
+ − 262 else:
+ − 263 cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
+ − 264 if options.strandness:
+ − 265 cmd = cmd + ' -s'
+ − 266 cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
+ − 267 if not options.quiet:
+ − 268 print "search for overlappings: "+cmd
+ − 269 status = os.system(cmd)
+ − 270 if status != 0:
+ − 271 sys.exit(1)
+ − 272
+ − 273
+ − 274 if not options.quiet:
+ − 275 print 'group reads mapped to the same region...'
+ − 276
+ − 277 allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
+ − 278
+ − 279 if len(allReadsStart) == 0:
+ − 280 if not options.quiet:
+ − 281 print 'no overlap found!!'
+ − 282 os.system('rm *tmp.*')
+ − 283 sys.exit(1)
+ − 284
+ − 285 if not options.quiet:
+ − 286 print 'count number of reads mapped to each bin...'
+ − 287
+ − 288 RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)
+ − 289
+ − 290 if options.output_data == None:
+ − 291 options.output_data = output+'.txt'
+ − 292
+ − 293 if options.summary_only:
+ − 294 saveSummary(options.output_data,RegionProfile,options.nbins)
+ − 295
+ − 296 else:
+ − 297 saveProfile(options.output_data,RegionProfile,nRead)
+ − 298
+ − 299 if not options.quiet:
+ − 300 print 'results saved to: '+ options.output_data
+ − 301
+ − 302 if not (options.summary_only or options.profile_only ):
+ − 303 # visualize
+ − 304
+ − 305 if options.window < 1:
+ − 306 xlab = 'relative position (bins)'
+ − 307 else:
+ − 308 xlab = 'relative position (bp)'
+ − 309
+ − 310 if options.output_plot == None:
+ − 311 options.output_plot = output+'.pdf'
+ − 312
+ − 313 title = options.plot_title+'\n n = '+str(len(RegionProfile))
+ − 314
+ − 315 rscript = open("tmp.r","w")
+ − 316 rscript.write("x <- read.table('"+options.output_data+"')\n")
+ − 317 rscript.write("pdf('"+options.output_plot+"')\n")
+ − 318 rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
+ − 319 rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
+ − 320
+ − 321 if options.window == 0:
+ − 322 rscript.write("xticks <- seq("+str(options.nbins)+")\n")
+ − 323 else:
+ − 324 rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
+ − 325
+ − 326 if options.ylim != 'min,max':
+ − 327 rscript.write("ylim=c("+options.ylim+")\n")
+ − 328 else:
+ − 329 rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+ − 330 rscript.write("par(cex=1.5)\n")
+ − 331 #smooth
+ − 332 if options.span >= 0.1:
+ − 333 rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
+ − 334 rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
+ − 335 rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
+ − 336 rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+ − 337 rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+ − 338 #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
+ − 339 #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
+ − 340 #rscript.write("lines(xticks,avg,lwd=2)\n")
+ − 341 #rscript.write("lines(xticks,avg-err,col='green')\n")
+ − 342 #rscript.write("lines(xticks,avg+err,col='green')\n")
+ − 343 rscript.write("dev.off()\n")
+ − 344 rscript.close()
+ − 345
+ − 346 os.system("R --vanilla < tmp.r")
+ − 347
+ − 348 # remove intermediate output
+ − 349 os.system('rm *tmp.*')
+ − 350
+ − 351
+ − 352 if __name__ == "__main__":
+ − 353 main()