annotate intervalOverlap.py @ 21:783157ce2817

Uploaded
author xuebing
date Sat, 31 Mar 2012 11:49:22 -0400
parents 16ba480adf96
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
20
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
1 '''
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
2 find overlap and test signifiance
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
3 '''
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
4
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
5 import os,sys
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
6
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
7 def lineCount(filename):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
8 i = 0
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
9 with open(filename) as f:
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
10 for i, l in enumerate(f):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
11 pass
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
12 return i + 1
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
13
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
14 def intersect(fileA,fileB,outfile,fraction,reciprocal):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
15 # return fileA intervals that overlap with interval in fileB
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
16 cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
17 #print cmd
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
18 os.system(cmd)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
19
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
20 def parseIntersect(filename):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
21 # get number of overlapped A and B
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
22 nA = 0
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
23 nB = 0
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
24 return nA,nb
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
25
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
26 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
27 # shuffle fileA N times, return the distribution of overlaps
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
28 nOverlap = []
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
29 for i in range(N):
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
30 # shuffle fileA using shuffleBed
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
31 #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
32 # using random_interval.py
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
33 cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
34 os.system(cmd)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
35 intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
36 nOverlap.append(lineCount('tmp'))
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
37 os.system('rm tmp')
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
38 os.system('rm fileA.shuffled')
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
39 return nOverlap
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
40
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
41 def main():
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
42 fileA = sys.argv[1]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
43 fileB = sys.argv[2]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
44 outfile = sys.argv[3]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
45 outplot = sys.argv[4]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
46 N = int(sys.argv[5]) # times to shuffle
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
47 genomefile = sys.argv[6]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
48 fraction = sys.argv[7]
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
49 if len(sys.argv) == 9:
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
50 reciprocal = sys.argv[8] # can only be '-r'
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
51 else:
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
52 reciprocal = ''
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
53
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
54 print sys.argv
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
55
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
56 # intersect on real data
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
57 intersect(fileA,fileB,outfile,fraction,reciprocal)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
58 # number of overlaps
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
59 nOverlapReal = lineCount(outfile)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
60
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
61 print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
62
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
63 # shuffle fileA to estimate background
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
64 nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
65
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
66 # plot histogram
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
67 rscript = open('tmp.r','w')
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
68 rscript.write("x0 <- "+str(nOverlapReal)+"\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
69 rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
70 rscript.write("library(MASS)\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
71 rscript.write("\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
72 rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
73 rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
74 rscript.write("pdf('"+outplot+"')\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
75 rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
76 rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
77 rscript.write("points(x0,0,col='red')\n")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
78 rscript.write("dev.off()")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
79 rscript.close()
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
80 os.system("R --vanilla < tmp.r")
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
81 os.system('rm tmp.r')
16ba480adf96 Uploaded
xuebing
parents:
diff changeset
82 main()