Mercurial > repos > xuebing > sharplab_interval_analysis
comparison alignr.py @ 2:a861f40db890
Uploaded
| author | xuebing |
|---|---|
| date | Tue, 20 Mar 2012 15:15:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1a177d17b27d | 2:a861f40db890 |
|---|---|
| 1 ''' | |
| 2 the scripts takes two files as input, and compute the coverage of | |
| 3 features in input 1 across features in input 2. Features in input 2 are | |
| 4 divided into bins and coverage is computed for each bin. | |
| 5 | |
| 6 please check the help information by typing: | |
| 7 | |
| 8 python align.py -h | |
| 9 | |
| 10 | |
| 11 requirement: | |
| 12 please install the following tools first: | |
| 13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ | |
| 14 | |
| 15 ''' | |
| 16 | |
| 17 import os,sys,os.path | |
| 18 from optparse import OptionParser | |
| 19 | |
| 20 def lineCount(filename): | |
| 21 with open(filename) as f: | |
| 22 for i, l in enumerate(f): | |
| 23 pass | |
| 24 return i + 1 | |
| 25 | |
| 26 def combineFilename(f1,f2): | |
| 27 ''' | |
| 28 fuse two file names into one | |
| 29 ''' | |
| 30 return f1.split('/')[-1]+'-'+f2.split('/')[-1] | |
| 31 | |
| 32 def checkFormat(filename1,filename2,input1format): | |
| 33 ''' | |
| 34 check the format of input files | |
| 35 ''' | |
| 36 | |
| 37 # file1 | |
| 38 # read the first line, see how many filds | |
| 39 ncol1 = 6 | |
| 40 if input1format == "BED": | |
| 41 f = open(filename1) | |
| 42 line = f.readline().strip().split('\t') | |
| 43 ncol1 = len(line) | |
| 44 if ncol1 < 3: | |
| 45 print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." | |
| 46 sys.exit(1) | |
| 47 f.close() | |
| 48 | |
| 49 # file2 | |
| 50 f = open(filename2) | |
| 51 line = f.readline().strip().split('\t') | |
| 52 ncol2 = len(line) | |
| 53 if ncol2 < 3: | |
| 54 print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." | |
| 55 sys.exit(1) | |
| 56 | |
| 57 return ncol1,ncol2 | |
| 58 | |
| 59 | |
| 60 def makeBed(filename,ncol): | |
| 61 ''' | |
| 62 add up to 6 column | |
| 63 ''' | |
| 64 f = open(filename) | |
| 65 outfile = filename+'.tmp.bed' | |
| 66 outf = open(outfile,'w') | |
| 67 if ncol == 3: | |
| 68 for line in f: | |
| 69 outf.write(line.strip()+'\t.\t0\t+\n') | |
| 70 elif ncol == 4: | |
| 71 for line in f: | |
| 72 outf.write(line.strip()+'\t0\t+\n') | |
| 73 if ncol == 5: | |
| 74 for line in f: | |
| 75 outf.write(line.strip()+'\t+\n') | |
| 76 f.close() | |
| 77 outf.close() | |
| 78 return outfile | |
| 79 | |
| 80 def makeWindow(filename,window): | |
| 81 | |
| 82 outfile = filename+'-window='+str(window)+'.tmp.bed' | |
| 83 if not os.path.exists(outfile): | |
| 84 f=open(filename) | |
| 85 out = open(outfile,'w') | |
| 86 lines = f.readlines() | |
| 87 if 'track' in lines[0]: | |
| 88 del lines[0] | |
| 89 for line in lines: | |
| 90 flds = line.strip().split('\t') | |
| 91 | |
| 92 #new position | |
| 93 center = (int(flds[1]) + int(flds[2]))/2 | |
| 94 start = center - window | |
| 95 end = center + window | |
| 96 if start >= 0: | |
| 97 flds[1] = str(start) | |
| 98 flds[2] = str(end) | |
| 99 out.write('\t'.join(flds)+'\n') | |
| 100 f.close() | |
| 101 out.close() | |
| 102 return outfile | |
| 103 | |
| 104 def groupReadsMapped2aRegion(filename,ncol): | |
| 105 ''' | |
| 106 read output from intersectBED | |
| 107 find all reads mapped to each region | |
| 108 ''' | |
| 109 try: | |
| 110 f=open(filename) | |
| 111 #If filename cannot be opened, print an error message and exit | |
| 112 except IOError: | |
| 113 print "could not open",filename,"Are you sure this file exists?" | |
| 114 sys.exit(1) | |
| 115 lines = f.readlines() | |
| 116 | |
| 117 allReadsStart = {} | |
| 118 allReadsEnd = {} | |
| 119 regionStrand = {} | |
| 120 regionStart = {} | |
| 121 regionEnd = {} | |
| 122 | |
| 123 for line in lines: | |
| 124 flds = line.strip().split('\t') | |
| 125 key = '_'.join(flds[ncol:(ncol+4)]) | |
| 126 if not allReadsStart.has_key(key): | |
| 127 allReadsStart[key] = list() | |
| 128 allReadsEnd[key] = list() | |
| 129 #print flds[ncol+0],flds[ncol+1],flds[ncol+2] | |
| 130 allReadsStart[key].append(int(flds[1])) | |
| 131 allReadsEnd[key].append(int(flds[2])) | |
| 132 regionStrand[key] = flds[ncol+5] | |
| 133 regionStart[key] = int(flds[ncol+1]) | |
| 134 regionEnd[key] = int(flds[ncol+2]) | |
| 135 return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) | |
| 136 | |
| 137 | |
| 138 def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): | |
| 139 ''' | |
| 140 each region is divided into nbins | |
| 141 compute the number of reads covering each bin for each region | |
| 142 ''' | |
| 143 RegionProfile = {} | |
| 144 nRead = {} # num of all reads in the region | |
| 145 for region in allReadsStart.keys(): | |
| 146 RegionProfile[region] = [0]*nbins | |
| 147 nRead[region] = len(allReadsStart[region]) | |
| 148 #print region,nRead[region],allReadsStart[region] | |
| 149 for i in range(nRead[region]): | |
| 150 RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) | |
| 151 return RegionProfile,nRead | |
| 152 | |
| 153 def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): | |
| 154 ''' | |
| 155 each region is divided into nbins, | |
| 156 add 1 to each bin covered by the read | |
| 157 ''' | |
| 158 L = regionEnd-regionStart | |
| 159 start = int(nbins*(readStart-regionStart)/L) | |
| 160 end = int(nbins*(readEnd-regionStart)/L) | |
| 161 if start < 0: | |
| 162 start = 0 | |
| 163 if end > nbins: | |
| 164 end = nbins | |
| 165 if strand == '-': | |
| 166 for i in range(start,end): | |
| 167 RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 | |
| 168 else: # if the 6th column of the input is not strand, will treat as + strand by default | |
| 169 for i in range(start,end): | |
| 170 RegionCount[i] = RegionCount[i] + 1 | |
| 171 return RegionCount | |
| 172 | |
| 173 def saveProfile(filename,Profile,nRegion): | |
| 174 out = open(filename,'w') | |
| 175 for regionType in Profile.keys(): | |
| 176 #print Profile[regionType] | |
| 177 out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') | |
| 178 | |
| 179 def saveSummary(filename,Profile,nbin): | |
| 180 out = open(filename+'.summary','w') | |
| 181 | |
| 182 nfeat = len(Profile) | |
| 183 summaryprofile = [0]*nbin | |
| 184 for regionType in Profile.keys(): | |
| 185 for i in range(nbin): | |
| 186 summaryprofile[i] += Profile[regionType][i] | |
| 187 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') | |
| 188 out.close() | |
| 189 # calculate standard error | |
| 190 out = open(filename+'.standarderror','w') | |
| 191 sd = [0.0]*nbin | |
| 192 u = [0.0]*nbin | |
| 193 for i in range(nbin): | |
| 194 u[i] = float(summaryprofile[i])/nfeat | |
| 195 for regionType in Profile.keys(): | |
| 196 sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 | |
| 197 sd[i] = sd[i]**0.5 / nfeat | |
| 198 out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') | |
| 199 out.close() | |
| 200 | |
| 201 def main(): | |
| 202 usage = "usage: %prog [options] -a inputA -b inputB" | |
| 203 parser = OptionParser(usage) | |
| 204 parser.add_option("-a", dest="inputA", | |
| 205 help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) | |
| 206 parser.add_option("-b",dest="inputB", | |
| 207 help="(required) input file B, interval file" ) | |
| 208 parser.add_option("-f",dest="aformat",default="BED", | |
| 209 help="Format of input file A. Can be BED (default) or BAM") | |
| 210 parser.add_option("-w",type='int',dest="window", | |
| 211 help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) | |
| 212 parser.add_option("-n", type="int", dest="nbins",default=100, | |
| 213 help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") | |
| 214 parser.add_option("-s",action="store_true", dest="strandness", | |
| 215 help="enforce strandness: require overlapping on the same strand. Default is off") | |
| 216 parser.add_option("-p",action="store_true", dest="plot",default=False, | |
| 217 help="load existed intersectBed outputfile") | |
| 218 parser.add_option("-q", action="store_true", dest="quiet",default=False, | |
| 219 help="suppress output on screen") | |
| 220 parser.add_option("-o", dest="output_data", | |
| 221 help="(optional) output coverage file (txt) name." ) | |
| 222 parser.add_option("-v", dest="output_plot", | |
| 223 help="(optional) output plot (pdf) file name." ) | |
| 224 parser.add_option("-l", dest="plot_title", default="", | |
| 225 help="(optional) output title of the plot." ) | |
| 226 parser.add_option("--ylim", dest="ylim", default="min,max", | |
| 227 help="(optional) ylim of the plot" ) | |
| 228 parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, | |
| 229 help="save profile summary only (no data for individual features)") | |
| 230 parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, | |
| 231 help="compute and plot standard deviation for each bin. used when --summary-only is on") | |
| 232 parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, | |
| 233 help="save profile only (no plot)") | |
| 234 parser.add_option("--span", type="float", dest="span",default=0.1, | |
| 235 help="loess span smooth parameter, 0.1 ~ 1") | |
| 236 | |
| 237 (options, args) = parser.parse_args() | |
| 238 | |
| 239 if options.inputA == None or options.inputB == None: | |
| 240 parser.error("Please specify two input files!!") | |
| 241 | |
| 242 if not options.quiet: | |
| 243 print "checking input file format..." | |
| 244 | |
| 245 ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) | |
| 246 | |
| 247 if ncol2 < 6: | |
| 248 options.inputB = makeBed(options.inputB,ncol2) | |
| 249 if not options.quiet: | |
| 250 print "fill up 6 columns" | |
| 251 | |
| 252 if options.window > 0: | |
| 253 if not options.quiet: | |
| 254 print "making windows from "+options.inputB+"..." | |
| 255 options.inputB = makeWindow(options.inputB,options.window) | |
| 256 | |
| 257 output = combineFilename(str(options.inputA),str(options.inputB)) | |
| 258 | |
| 259 if not options.plot: | |
| 260 if options.aformat == "BAM": | |
| 261 cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' | |
| 262 else: | |
| 263 cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) | |
| 264 if options.strandness: | |
| 265 cmd = cmd + ' -s' | |
| 266 cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' | |
| 267 if not options.quiet: | |
| 268 print "search for overlappings: "+cmd | |
| 269 status = os.system(cmd) | |
| 270 if status != 0: | |
| 271 sys.exit(1) | |
| 272 | |
| 273 | |
| 274 if not options.quiet: | |
| 275 print 'group reads mapped to the same region...' | |
| 276 | |
| 277 allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) | |
| 278 | |
| 279 if len(allReadsStart) == 0: | |
| 280 if not options.quiet: | |
| 281 print 'no overlap found!!' | |
| 282 os.system('rm *tmp.*') | |
| 283 sys.exit(1) | |
| 284 | |
| 285 if not options.quiet: | |
| 286 print 'count number of reads mapped to each bin...' | |
| 287 | |
| 288 RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) | |
| 289 | |
| 290 if options.output_data == None: | |
| 291 options.output_data = output+'.txt' | |
| 292 | |
| 293 if options.summary_only: | |
| 294 saveSummary(options.output_data,RegionProfile,options.nbins) | |
| 295 | |
| 296 else: | |
| 297 saveProfile(options.output_data,RegionProfile,nRead) | |
| 298 | |
| 299 if not options.quiet: | |
| 300 print 'results saved to: '+ options.output_data | |
| 301 | |
| 302 if not (options.summary_only or options.profile_only ): | |
| 303 # visualize | |
| 304 | |
| 305 if options.window < 1: | |
| 306 xlab = 'relative position (bins)' | |
| 307 else: | |
| 308 xlab = 'relative position (bp)' | |
| 309 | |
| 310 if options.output_plot == None: | |
| 311 options.output_plot = output+'.pdf' | |
| 312 | |
| 313 title = options.plot_title+'\n n = '+str(len(RegionProfile)) | |
| 314 | |
| 315 rscript = open("tmp.r","w") | |
| 316 rscript.write("x <- read.table('"+options.output_data+"')\n") | |
| 317 rscript.write("pdf('"+options.output_plot+"')\n") | |
| 318 rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") | |
| 319 rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") | |
| 320 | |
| 321 if options.window == 0: | |
| 322 rscript.write("xticks <- seq("+str(options.nbins)+")\n") | |
| 323 else: | |
| 324 rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") | |
| 325 | |
| 326 if options.ylim != 'min,max': | |
| 327 rscript.write("ylim=c("+options.ylim+")\n") | |
| 328 else: | |
| 329 rscript.write("ylim=c(min(avg-err),max(avg+err))\n") | |
| 330 rscript.write("par(cex=1.5)\n") | |
| 331 #smooth | |
| 332 if options.span >= 0.1: | |
| 333 rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") | |
| 334 rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") | |
| 335 rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") | |
| 336 rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") | |
| 337 rscript.write("lines(xticks,avg,type='l',lwd=1)\n") | |
| 338 #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") | |
| 339 #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") | |
| 340 #rscript.write("lines(xticks,avg,lwd=2)\n") | |
| 341 #rscript.write("lines(xticks,avg-err,col='green')\n") | |
| 342 #rscript.write("lines(xticks,avg+err,col='green')\n") | |
| 343 rscript.write("dev.off()\n") | |
| 344 rscript.close() | |
| 345 | |
| 346 os.system("R --vanilla < tmp.r") | |
| 347 | |
| 348 # remove intermediate output | |
| 349 os.system('rm *tmp.*') | |
| 350 | |
| 351 | |
| 352 if __name__ == "__main__": | |
| 353 main() |
