annotate mytools/metaintv_ext.py @ 7:f0dc65e7f6c0

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:59:07 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
1 '''
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
2 get binned score of intervals,allow extension
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
3 '''
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
4
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
5 import os,sys,numpy
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
6 import string, random
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
8 from resize import *
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
9
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
10 from bx.bbi.bigwig_file import BigWigFile
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
11
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def binning(x,n):
    '''
    Reduce a 1-D sequence of scores to n bin averages.

    x: sequence of numbers (numpy array, list or tuple)
    n: number of bins to produce

    Returns a float numpy array of length n. If x is empty (or n is 0)
    the zero-filled array of length n is returned unchanged.
    '''
    y = numpy.zeros(n,dtype=float)
    # coerce so that plain lists/tuples support slicing followed by .mean()
    x = numpy.asarray(x)
    if len(x) == 0 or n == 0:
        return y
    # fractional number of input positions covered by one bin
    step = float(len(x))/n
    for k in range(n):
        i = int(step*k)
        # the +1 makes neighbouring bins share one position; it also
        # guarantees a non-empty slice when n is larger than len(x)
        j = int(step*(k+1)) + 1
        y[k] = x[i:j].mean()
    return y
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
24
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def getBinnedScore(bwfile,intvfile,nbin):
    '''
    get binned average and standard error over a set of intervals

    bwfile:   path of the bigWig file holding the scores
    intvfile: path of a tab-delimited interval file whose columns are
              chrom,start,end,name,score,strand
    nbin:     number of bins every interval is reduced to

    Returns (avg,err,N): per-bin mean, per-bin standard error of the mean,
    and the number of intervals that contributed. If no interval
    contributed, N is 0 and the divisions below are divisions by zero.
    '''
    avg = numpy.zeros(nbin)
    sqr = numpy.zeros(nbin)
    N = 0
    # bigWig is a binary format, so open it in binary mode; both handles
    # are closed when the with-blocks exit, even on error
    with open(bwfile,'rb') as fbw:
        bw = BigWigFile(file=fbw)
        with open(intvfile) as fin:
            for line in fin:
                # tolerate empty lines (e.g. a trailing newline)
                if not line.strip():
                    continue
                #chrom,start,end,name,score,strand
                flds = line.strip().split('\t')
                #get the score at base resolution as an array
                scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
                # identity test: '== None' on an array compares element-wise
                if scores is None:
                    print('not found:\t%s %s' % (N,line))
                    continue
                N = N + 1
                # reverse if on minus strand; lines without a strand
                # column are left in file orientation
                if len(flds) > 5 and flds[5] == '-':
                    scores = scores[::-1]
                # no score = 0
                scores = numpy.nan_to_num(scores)
                # bin the data
                binned = binning(scores,nbin)
                avg = avg + binned
                sqr = sqr + binned**2
    # compute avg and std
    avg = avg / N
    # E[x^2]-E[x]^2 can come out slightly negative through rounding;
    # clamp at 0 so the square root does not produce NaN
    var = numpy.maximum(sqr/N-avg**2,0)
    err = (var**0.5)/(N**0.5)
    return avg,err,N
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
59
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def getExtendedBinScore(bwfile,intvfile,nbins,exts):
    '''
    Binned average/error profile over left flank, center and right flank.

    nbins: n1,n2,n3     number of bins for left flank, center, right flank
    exts: l1,l2,l3,l4   the regions passed to resize() are
                        left   = start-l1 .. start+l2
                        center = start+l2 .. end-l3
                        right  = end-l3   .. end+l4
    The left flank is skipped unless l1>0 or l2>0, the right flank unless
    l3>0 or l4>0; the center is always computed.

    Returns (avg,err): the per-bin averages and errors of the computed
    regions concatenated left to right.
    '''
    import tempfile
    avg = []
    err = []
    # unique scratch file in the working directory, reused for each region
    fd,tmpfile = tempfile.mkstemp(prefix='metaintv_',dir='.')
    os.close(fd)
    try:
        if exts[0]>0 or exts[1]>0:
            print('make left extension')
            resize(intvfile,tmpfile,'start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
            print('compute binned average')
            avg,err,N = getBinnedScore(bwfile,tmpfile,nbins[0])
            print('regions used: %s' % N)
        print('make center region')
        resize(intvfile,tmpfile,'start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
        print('compute binned average')
        avg1,err1,N = getBinnedScore(bwfile,tmpfile,nbins[1])
        print('regions used: %s' % N)
        avg = numpy.concatenate((avg,avg1))
        err = numpy.concatenate((err,err1))
        if exts[2]>0 or exts[3]>0:
            print('make right region')
            resize(intvfile,tmpfile,'end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
            print('compute binned average')
            avg2,err2,N = getBinnedScore(bwfile,tmpfile,nbins[2])
            print('regions used: %s' % N)
            avg = numpy.concatenate((avg,avg2))
            err = numpy.concatenate((err,err2))
    finally:
        # remove the scratch file even when a step above raised
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
    return avg,err
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
91
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
# ---- command line driver -------------------------------------------------
# arguments: bwfile intvfile exts nbins outfile outplot
#   exts  = l1,l2,l3,l4  (comma separated integers)
#   nbins = n1,n2,n3     (comma separated integers)
# Writes two lines (averages, errors) to outfile and a PDF plot to outplot.
import subprocess
import tempfile

if len(sys.argv) != 7:
    sys.exit('usage: %s bwfile intvfile exts nbins outfile outplot' % sys.argv[0])
prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv
nbins = numpy.fromstring(nbins,dtype=int,sep=',')
exts = numpy.fromstring(exts,dtype=int,sep=',')
avg, err = getExtendedBinScore(bwfile,intvfile,nbins,exts)
print('save data')
# line 1: averages, line 2: errors, values separated by single spaces
with open(outfile,'w') as out:
    numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
    out.write('\n')
    numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
    out.write('\n')

print('plot')
# Bin positions of the interval start and end on the x axis.
# getExtendedBinScore only emits a flank when one of its two extensions is
# positive, so a skipped flank must contribute no bins here either.
has_left = exts[0]>0 or exts[1]>0
has_right = exts[2]>0 or exts[3]>0
left_span = exts[0]+exts[1]
right_span = exts[2]+exts[3]
start = 0
left_bins = 0
if has_left:
    left_bins = nbins[0]
    # guard the division: the two extensions may cancel out
    if left_span != 0:
        start = exts[0]*nbins[0]//left_span
end = left_bins+nbins[1]
if has_right and right_span != 0:
    end = end + exts[2]*nbins[2]//right_span
# unique script name so that runs sharing a directory do not collide
fd,rfile = tempfile.mkstemp(prefix='metaintv_',suffix='.r',dir='.')
try:
    rscript = os.fdopen(fd,"w")
    try:
        rscript.write("options(warn=-1)\n")
        rscript.write("x <- read.table('"+outfile+"')\n")
        rscript.write("pdf('"+outplot+"')\n")
        rscript.write("avg <- x[1,]\n")
        rscript.write("err <- x[2,]\n")
        rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
        rscript.write("xticks <- seq(ncol(x))\n")
        # empty frame first; the error band and the mean line are drawn on top
        rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim,xaxt='n')\n")
        rscript.write("axis(1, at=c(min(xticks),"+str(start)+","+str(end)+",max(xticks)),labels=c(-"+str(exts[0])+",0,0,"+str(exts[3])+"), las=2)\n")
        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
        # thin baseline over the whole range, thick bar over the interval itself
        rscript.write("lines(c(min(xticks),max(xticks)),c(0,0),lwd=2)\n")
        rscript.write("lines(c("+str(start)+","+str(end)+"),c(0,0),lwd=10)\n")
        rscript.write("dev.off()\n")
    finally:
        rscript.close()
    # feed the script to R on stdin, without going through a shell
    with open(rfile) as rin:
        subprocess.call(["R","--vanilla","--slave"],stdin=rin)
finally:
    os.remove(rfile)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
128