diff interval/metaintv3.py @ 18:9bbb37e8683f

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:24:32 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interval/metaintv3.py	Sat Mar 31 08:24:32 2012 -0400
@@ -0,0 +1,109 @@
+'''
+Get binned bigWig scores over intervals, with optional extension of the flanks.
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    '''
+    Average x into n consecutive bins and return the n bin means.
+
+    x: 1-D sequence of scores (converted to a float numpy array)
+    n: number of bins; coerced to int so counts parsed as floats are accepted
+
+    Returns a float array of length n. An empty x gives all zeros.
+    Bins do not overlap when len(x) >= n; when len(x) < n neighbouring
+    bins reuse an element so that no bin is ever empty.
+    '''
+    n = int(n)
+    x = numpy.asarray(x,dtype=float)
+    y = numpy.zeros(n,dtype=float)
+    size = len(x)
+    if size == 0:
+        return y
+    for k in range(n):
+        # integer arithmetic keeps the bin edges exact (no float rounding)
+        i = (k*size)//n
+        # every bin holds at least one element, otherwise mean() would be NaN
+        j = max(((k+1)*size)//n,i+1)
+        y[k] = x[i:j].mean()
+    return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+    '''
+    Average binned bigWig scores over all intervals in a file.
+
+    bwfile: path to the bigWig file
+    intvfile: path to a tab-delimited interval file
+        (chrom,start,end,name,score,strand)
+    nbin: number of bins each interval is reduced to
+
+    Returns (avg,err): the per-bin mean across intervals and its
+    standard error. Intervals for which the bigWig file returns no data
+    are reported on stdout and skipped. If no interval could be scored,
+    both arrays are filled with NaN.
+    '''
+    nbin = int(nbin)
+    avg = numpy.zeros(nbin)
+    sqr = numpy.zeros(nbin)
+    N = 0
+    # 'with' closes both files even if reading fails part-way through
+    with open(bwfile,'rb') as fbw:
+        bw = BigWigFile(file=fbw)
+        with open(intvfile) as fin:
+            for line in fin:
+                # skip blank lines instead of failing on the missing columns
+                if not line.strip():
+                    continue
+                #chrom,start,end,name,score,strand
+                flds = line.strip().split('\t')
+                #get the score at base resolution as an array
+                scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+                # identity test: '== None' on an array compares element-wise
+                if scores is None:
+                    sys.stdout.write('not found:\t ' + line + '\n')
+                    continue
+                N = N + 1
+                # reverse if on minus strand; a missing strand column is
+                # treated as plus strand
+                if len(flds) > 5 and flds[5] == '-':
+                    scores = scores[::-1]
+                # no score = 0
+                scores = numpy.nan_to_num(scores)
+                # bin the data
+                binned = binning(scores,nbin)
+                avg = avg + binned
+                sqr = sqr + binned**2
+    if N == 0:
+        # nothing was scored: mean and error are undefined
+        undefined = numpy.zeros(nbin) + numpy.nan
+        return undefined,undefined.copy()
+    # compute avg and standard error of the mean
+    avg = avg / N
+    # rounding can push the variance slightly below zero; clamp before sqrt
+    var = numpy.maximum(sqr/N-avg**2,0)
+    err = (var**0.5)/(N**0.5)
+    return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+    '''
+    Binned scores over the left flank, center and right flank of intervals.
+
+    nbins: n1,n2,n3  number of bins for the left, center and right region
+    exts: l1,l2,l3,l4  the regions passed to resize() are
+        left:   start-l1 .. start+l2
+        center: start+l2 .. end-l3
+        right:  end-l3   .. end+l4
+
+    Each region is written by resize() to intvfile+'.tmp' and scored with
+    getBinnedScore(). Returns (avg,err) with the three regions concatenated
+    in left, center, right order.
+    '''
+    tmpfile = intvfile+'.tmp'
+    regions = [
+        ('start-'+str(exts[0]),'start+'+str(exts[1])),
+        ('start+'+str(exts[1]),'end-'+str(exts[2])),
+        ('end-'+str(exts[2]),'end+'+str(exts[3])),
+    ]
+    avgs = []
+    errs = []
+    try:
+        for k,(left,right) in enumerate(regions):
+            # make the region, then compute its binned average
+            resize(intvfile,tmpfile,left,right,'stranded')
+            avg,err = getBinnedScore(bwfile,tmpfile,nbins[k])
+            avgs.append(avg)
+            errs.append(err)
+    finally:
+        # do not leave the scratch file behind, even on failure
+        if os.path.exists(tmpfile):
+            os.remove(tmpfile)
+    return numpy.concatenate(avgs),numpy.concatenate(errs)
+
+# Command line: bwfile intvfile exts nbins outfile outplot
+#   exts and nbins are comma-separated lists (4 and 3 values respectively)
+print(sys.argv)
+# sys.argv[0] is the script itself; the six arguments follow it
+if len(sys.argv) != 7:
+    sys.exit('usage: metaintv3.py bwfile intvfile exts nbins outfile outplot')
+bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv[1:]
+# bin counts must be integers; exts are parsed exactly as before so that
+# resize() receives the same strings
+avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,dtype=int,sep=','),numpy.fromstring(exts,sep=','))
+# one line of averages followed by one line of errors
+with open(outfile,'w') as out:
+    numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+    out.write('\n')
+    numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+    out.write('\n')
+
+# plot: average as a line, avg +/- err as a shaded band
+rlines = [
+    "options(warn=-1)\n",
+    "x <- read.table('"+outfile+"')\n",
+    "pdf('"+outplot+"')\n",
+    "avg <- x[1,]\n",
+    "err <- x[2,]\n",
+    "print(x)\n",
+    "ylim=c(min(avg-err),max(avg+err))\n",
+    "xticks <- seq(ncol(x))\n",
+    "plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n",
+    "polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n",
+    "lines(xticks,avg,type='l',lwd=1)\n",
+    "dev.off()\n",
+]
+with open("tmp.r","w") as rscript:
+    rscript.write(''.join(rlines))
+os.system("R --vanilla < tmp.r")
+# remove the generated R script without spawning a shell
+os.remove("tmp.r")
+
+