Mercurial > repos > xuebing > sharplab_interval_analysis
view alignr.py @ 2:a861f40db890
Uploaded
author | xuebing |
---|---|
date | Tue, 20 Mar 2012 15:15:00 -0400 |
parents | |
children |
line wrap: on
line source
''' the scripts takes two files as input, and compute the coverage of features in input 1 across features in input 2. Features in input 2 are divided into bins and coverage is computed for each bin. please check the help information by typing: python align.py -h requirement: please install the following tools first: bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ ''' import os,sys,os.path from optparse import OptionParser def lineCount(filename): with open(filename) as f: for i, l in enumerate(f): pass return i + 1 def combineFilename(f1,f2): ''' fuse two file names into one ''' return f1.split('/')[-1]+'-'+f2.split('/')[-1] def checkFormat(filename1,filename2,input1format): ''' check the format of input files ''' # file1 # read the first line, see how many filds ncol1 = 6 if input1format == "BED": f = open(filename1) line = f.readline().strip().split('\t') ncol1 = len(line) if ncol1 < 3: print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." sys.exit(1) f.close() # file2 f = open(filename2) line = f.readline().strip().split('\t') ncol2 = len(line) if ncol2 < 3: print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." sys.exit(1) return ncol1,ncol2 def makeBed(filename,ncol): ''' add up to 6 column ''' f = open(filename) outfile = filename+'.tmp.bed' outf = open(outfile,'w') if ncol == 3: for line in f: outf.write(line.strip()+'\t.\t0\t+\n') elif ncol == 4: for line in f: outf.write(line.strip()+'\t0\t+\n') if ncol == 5: for line in f: outf.write(line.strip()+'\t+\n') f.close() outf.close() return outfile def makeWindow(filename,window): outfile = filename+'-window='+str(window)+'.tmp.bed' if not os.path.exists(outfile): f=open(filename) out = open(outfile,'w') lines = f.readlines() if 'track' in lines[0]: del lines[0] for line in lines: flds = line.strip().split('\t') #new position center = (int(flds[1]) + int(flds[2]))/2 start = center - window end = center + window if start >= 0: flds[1] = str(start) flds[2] = str(end) out.write('\t'.join(flds)+'\n') f.close() out.close() return outfile def groupReadsMapped2aRegion(filename,ncol): ''' read output from intersectBED find all reads mapped to each region ''' try: f=open(filename) #If filename cannot be opened, print an error message and exit except IOError: print "could not open",filename,"Are you sure this file exists?" sys.exit(1) lines = f.readlines() allReadsStart = {} allReadsEnd = {} regionStrand = {} regionStart = {} regionEnd = {} for line in lines: flds = line.strip().split('\t') key = '_'.join(flds[ncol:(ncol+4)]) if not allReadsStart.has_key(key): allReadsStart[key] = list() allReadsEnd[key] = list() #print flds[ncol+0],flds[ncol+1],flds[ncol+2] allReadsStart[key].append(int(flds[1])) allReadsEnd[key].append(int(flds[2])) regionStrand[key] = flds[ncol+5] regionStart[key] = int(flds[ncol+1]) regionEnd[key] = int(flds[ncol+2]) return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): ''' each region is divided into nbins compute the number of reads covering each bin for each region ''' RegionProfile = {} nRead = {} # num of all reads in the region for region in allReadsStart.keys(): RegionProfile[region] = [0]*nbins nRead[region] = len(allReadsStart[region]) #print region,nRead[region],allReadsStart[region] for i in range(nRead[region]): RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) return RegionProfile,nRead def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): ''' each region is divided into nbins, add 1 to each bin covered by the read ''' L = regionEnd-regionStart start = int(nbins*(readStart-regionStart)/L) end = int(nbins*(readEnd-regionStart)/L) if start < 0: start = 0 if end > nbins: end = nbins if strand == '-': for i in range(start,end): RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 else: # if the 6th column of the input is not strand, will treat as + strand by default for i in range(start,end): RegionCount[i] = RegionCount[i] + 1 return RegionCount def saveProfile(filename,Profile,nRegion): out = open(filename,'w') for regionType in Profile.keys(): #print Profile[regionType] out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') def saveSummary(filename,Profile,nbin): out = open(filename+'.summary','w') nfeat = len(Profile) summaryprofile = [0]*nbin for regionType in Profile.keys(): for i in range(nbin): summaryprofile[i] += Profile[regionType][i] out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') out.close() # calculate standard error out = open(filename+'.standarderror','w') sd = [0.0]*nbin u = [0.0]*nbin for i in range(nbin): u[i] = float(summaryprofile[i])/nfeat for regionType in Profile.keys(): sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 sd[i] = sd[i]**0.5 / nfeat out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') out.close() def main(): usage = "usage: %prog [options] -a inputA -b inputB" parser = OptionParser(usage) parser.add_option("-a", dest="inputA", help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) parser.add_option("-b",dest="inputB", help="(required) input file B, interval file" ) parser.add_option("-f",dest="aformat",default="BED", help="Format of input file A. Can be BED (default) or BAM") parser.add_option("-w",type='int',dest="window", help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) parser.add_option("-n", type="int", dest="nbins",default=100, help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") parser.add_option("-s",action="store_true", dest="strandness", help="enforce strandness: require overlapping on the same strand. Default is off") parser.add_option("-p",action="store_true", dest="plot",default=False, help="load existed intersectBed outputfile") parser.add_option("-q", action="store_true", dest="quiet",default=False, help="suppress output on screen") parser.add_option("-o", dest="output_data", help="(optional) output coverage file (txt) name." ) parser.add_option("-v", dest="output_plot", help="(optional) output plot (pdf) file name." ) parser.add_option("-l", dest="plot_title", default="", help="(optional) output title of the plot." ) parser.add_option("--ylim", dest="ylim", default="min,max", help="(optional) ylim of the plot" ) parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, help="save profile summary only (no data for individual features)") parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, help="compute and plot standard deviation for each bin. used when --summary-only is on") parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, help="save profile only (no plot)") parser.add_option("--span", type="float", dest="span",default=0.1, help="loess span smooth parameter, 0.1 ~ 1") (options, args) = parser.parse_args() if options.inputA == None or options.inputB == None: parser.error("Please specify two input files!!") if not options.quiet: print "checking input file format..." ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) if ncol2 < 6: options.inputB = makeBed(options.inputB,ncol2) if not options.quiet: print "fill up 6 columns" if options.window > 0: if not options.quiet: print "making windows from "+options.inputB+"..." options.inputB = makeWindow(options.inputB,options.window) output = combineFilename(str(options.inputA),str(options.inputB)) if not options.plot: if options.aformat == "BAM": cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' else: cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) if options.strandness: cmd = cmd + ' -s' cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' if not options.quiet: print "search for overlappings: "+cmd status = os.system(cmd) if status != 0: sys.exit(1) if not options.quiet: print 'group reads mapped to the same region...' allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) if len(allReadsStart) == 0: if not options.quiet: print 'no overlap found!!' os.system('rm *tmp.*') sys.exit(1) if not options.quiet: print 'count number of reads mapped to each bin...' RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) if options.output_data == None: options.output_data = output+'.txt' if options.summary_only: saveSummary(options.output_data,RegionProfile,options.nbins) else: saveProfile(options.output_data,RegionProfile,nRead) if not options.quiet: print 'results saved to: '+ options.output_data if not (options.summary_only or options.profile_only ): # visualize if options.window < 1: xlab = 'relative position (bins)' else: xlab = 'relative position (bp)' if options.output_plot == None: options.output_plot = output+'.pdf' title = options.plot_title+'\n n = '+str(len(RegionProfile)) rscript = open("tmp.r","w") rscript.write("x <- read.table('"+options.output_data+"')\n") rscript.write("pdf('"+options.output_plot+"')\n") rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") if options.window == 0: rscript.write("xticks <- seq("+str(options.nbins)+")\n") else: rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") if options.ylim != 'min,max': rscript.write("ylim=c("+options.ylim+")\n") else: rscript.write("ylim=c(min(avg-err),max(avg+err))\n") rscript.write("par(cex=1.5)\n") #smooth if options.span >= 0.1: rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") rscript.write("lines(xticks,avg,type='l',lwd=1)\n") #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") #rscript.write("lines(xticks,avg,lwd=2)\n") #rscript.write("lines(xticks,avg-err,col='green')\n") #rscript.write("lines(xticks,avg+err,col='green')\n") rscript.write("dev.off()\n") rscript.close() os.system("R --vanilla < tmp.r") # remove intermediate output os.system('rm *tmp.*') if __name__ == "__main__": main()