Mercurial > repos > xuebing > sharplabtool
diff alignr.py @ 14:76e1b1b21cce default tip
Deleted selected files
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 19:05:10 -0400 |
parents | 292186c14b08 |
children |
line wrap: on
line diff
--- a/alignr.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,353 +0,0 @@ -''' -the scripts takes two files as input, and compute the coverage of -features in input 1 across features in input 2. Features in input 2 are -divided into bins and coverage is computed for each bin. - -please check the help information by typing: - - python align.py -h - - -requirement: - please install the following tools first: - bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ - -''' - -import os,sys,os.path -from optparse import OptionParser - -def lineCount(filename): - with open(filename) as f: - for i, l in enumerate(f): - pass - return i + 1 - -def combineFilename(f1,f2): - ''' - fuse two file names into one - ''' - return f1.split('/')[-1]+'-'+f2.split('/')[-1] - -def checkFormat(filename1,filename2,input1format): - ''' - check the format of input files - ''' - - # file1 - # read the first line, see how many filds - ncol1 = 6 - if input1format == "BED": - f = open(filename1) - line = f.readline().strip().split('\t') - ncol1 = len(line) - if ncol1 < 3: - print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." - sys.exit(1) - f.close() - - # file2 - f = open(filename2) - line = f.readline().strip().split('\t') - ncol2 = len(line) - if ncol2 < 3: - print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." - sys.exit(1) - - return ncol1,ncol2 - - -def makeBed(filename,ncol): - ''' - add up to 6 column - ''' - f = open(filename) - outfile = filename+'.tmp.bed' - outf = open(outfile,'w') - if ncol == 3: - for line in f: - outf.write(line.strip()+'\t.\t0\t+\n') - elif ncol == 4: - for line in f: - outf.write(line.strip()+'\t0\t+\n') - if ncol == 5: - for line in f: - outf.write(line.strip()+'\t+\n') - f.close() - outf.close() - return outfile - -def makeWindow(filename,window): - - outfile = filename+'-window='+str(window)+'.tmp.bed' - if not os.path.exists(outfile): - f=open(filename) - out = open(outfile,'w') - lines = f.readlines() - if 'track' in lines[0]: - del lines[0] - for line in lines: - flds = line.strip().split('\t') - - #new position - center = (int(flds[1]) + int(flds[2]))/2 - start = center - window - end = center + window - if start >= 0: - flds[1] = str(start) - flds[2] = str(end) - out.write('\t'.join(flds)+'\n') - f.close() - out.close() - return outfile - -def groupReadsMapped2aRegion(filename,ncol): - ''' - read output from intersectBED - find all reads mapped to each region - ''' - try: - f=open(filename) - #If filename cannot be opened, print an error message and exit - except IOError: - print "could not open",filename,"Are you sure this file exists?" - sys.exit(1) - lines = f.readlines() - - allReadsStart = {} - allReadsEnd = {} - regionStrand = {} - regionStart = {} - regionEnd = {} - - for line in lines: - flds = line.strip().split('\t') - key = '_'.join(flds[ncol:(ncol+4)]) - if not allReadsStart.has_key(key): - allReadsStart[key] = list() - allReadsEnd[key] = list() - #print flds[ncol+0],flds[ncol+1],flds[ncol+2] - allReadsStart[key].append(int(flds[1])) - allReadsEnd[key].append(int(flds[2])) - regionStrand[key] = flds[ncol+5] - regionStart[key] = int(flds[ncol+1]) - regionEnd[key] = int(flds[ncol+2]) - return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) - - -def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): - ''' - each region is divided into nbins - compute the number of reads covering each bin for each region - ''' - RegionProfile = {} - nRead = {} # num of all reads in the region - for region in allReadsStart.keys(): - RegionProfile[region] = [0]*nbins - nRead[region] = len(allReadsStart[region]) - #print region,nRead[region],allReadsStart[region] - for i in range(nRead[region]): - RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) - return RegionProfile,nRead - -def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): - ''' - each region is divided into nbins, - add 1 to each bin covered by the read - ''' - L = regionEnd-regionStart - start = int(nbins*(readStart-regionStart)/L) - end = int(nbins*(readEnd-regionStart)/L) - if start < 0: - start = 0 - if end > nbins: - end = nbins - if strand == '-': - for i in range(start,end): - RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 - else: # if the 6th column of the input is not strand, will treat as + strand by default - for i in range(start,end): - RegionCount[i] = RegionCount[i] + 1 - return RegionCount - -def saveProfile(filename,Profile,nRegion): - out = open(filename,'w') - for regionType in Profile.keys(): - #print Profile[regionType] - out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') - -def saveSummary(filename,Profile,nbin): - out = open(filename+'.summary','w') - - nfeat = len(Profile) - summaryprofile = [0]*nbin - for regionType in Profile.keys(): - for i in range(nbin): - summaryprofile[i] += Profile[regionType][i] - out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') - out.close() - # calculate standard error - out = open(filename+'.standarderror','w') - sd = [0.0]*nbin - u = [0.0]*nbin - for i in range(nbin): - u[i] = float(summaryprofile[i])/nfeat - for regionType in Profile.keys(): - sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 - sd[i] = sd[i]**0.5 / nfeat - out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') - out.close() - -def main(): - usage = "usage: %prog [options] -a inputA -b inputB" - parser = OptionParser(usage) - parser.add_option("-a", dest="inputA", - help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) - parser.add_option("-b",dest="inputB", - help="(required) input file B, interval file" ) - parser.add_option("-f",dest="aformat",default="BED", - help="Format of input file A. Can be BED (default) or BAM") - parser.add_option("-w",type='int',dest="window", - help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) - parser.add_option("-n", type="int", dest="nbins",default=100, - help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") - parser.add_option("-s",action="store_true", dest="strandness", - help="enforce strandness: require overlapping on the same strand. Default is off") - parser.add_option("-p",action="store_true", dest="plot",default=False, - help="load existed intersectBed outputfile") - parser.add_option("-q", action="store_true", dest="quiet",default=False, - help="suppress output on screen") - parser.add_option("-o", dest="output_data", - help="(optional) output coverage file (txt) name." ) - parser.add_option("-v", dest="output_plot", - help="(optional) output plot (pdf) file name." ) - parser.add_option("-l", dest="plot_title", default="", - help="(optional) output title of the plot." ) - parser.add_option("--ylim", dest="ylim", default="min,max", - help="(optional) ylim of the plot" ) - parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, - help="save profile summary only (no data for individual features)") - parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, - help="compute and plot standard deviation for each bin. used when --summary-only is on") - parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, - help="save profile only (no plot)") - parser.add_option("--span", type="float", dest="span",default=0.1, - help="loess span smooth parameter, 0.1 ~ 1") - - (options, args) = parser.parse_args() - - if options.inputA == None or options.inputB == None: - parser.error("Please specify two input files!!") - - if not options.quiet: - print "checking input file format..." - - ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) - - if ncol2 < 6: - options.inputB = makeBed(options.inputB,ncol2) - if not options.quiet: - print "fill up 6 columns" - - if options.window > 0: - if not options.quiet: - print "making windows from "+options.inputB+"..." - options.inputB = makeWindow(options.inputB,options.window) - - output = combineFilename(str(options.inputA),str(options.inputB)) - - if not options.plot: - if options.aformat == "BAM": - cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' - else: - cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) - if options.strandness: - cmd = cmd + ' -s' - cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' - if not options.quiet: - print "search for overlappings: "+cmd - status = os.system(cmd) - if status != 0: - sys.exit(1) - - - if not options.quiet: - print 'group reads mapped to the same region...' - - allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) - - if len(allReadsStart) == 0: - if not options.quiet: - print 'no overlap found!!' - os.system('rm *tmp.*') - sys.exit(1) - - if not options.quiet: - print 'count number of reads mapped to each bin...' - - RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) - - if options.output_data == None: - options.output_data = output+'.txt' - - if options.summary_only: - saveSummary(options.output_data,RegionProfile,options.nbins) - - else: - saveProfile(options.output_data,RegionProfile,nRead) - - if not options.quiet: - print 'results saved to: '+ options.output_data - - if not (options.summary_only or options.profile_only ): - # visualize - - if options.window < 1: - xlab = 'relative position (bins)' - else: - xlab = 'relative position (bp)' - - if options.output_plot == None: - options.output_plot = output+'.pdf' - - title = options.plot_title+'\n n = '+str(len(RegionProfile)) - - rscript = open("tmp.r","w") - rscript.write("x <- read.table('"+options.output_data+"')\n") - rscript.write("pdf('"+options.output_plot+"')\n") - rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") - rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") - - if options.window == 0: - rscript.write("xticks <- seq("+str(options.nbins)+")\n") - else: - rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") - - if options.ylim != 'min,max': - rscript.write("ylim=c("+options.ylim+")\n") - else: - rscript.write("ylim=c(min(avg-err),max(avg+err))\n") - rscript.write("par(cex=1.5)\n") - #smooth - if options.span >= 0.1: - rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") - rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") - rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") - rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") - rscript.write("lines(xticks,avg,type='l',lwd=1)\n") - #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") - #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") - #rscript.write("lines(xticks,avg,lwd=2)\n") - #rscript.write("lines(xticks,avg-err,col='green')\n") - #rscript.write("lines(xticks,avg+err,col='green')\n") - rscript.write("dev.off()\n") - rscript.close() - - os.system("R --vanilla < tmp.r") - - # remove intermediate output - os.system('rm *tmp.*') - - -if __name__ == "__main__": - main()