annotate intersectSig.py @ 11:b7f1d9f8f3bc

Uploaded
author xuebing
date Sat, 10 Mar 2012 07:59:27 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
1 '''
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
2 find overlap and test significance
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
3 '''
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
4
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
5 import os,sys
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
6
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
def lineCount(filename):
    '''
    Return the number of lines in ``filename``.

    A zero-byte file yields 0 without being opened. A final line that
    lacks a trailing newline still counts as one line.
    '''
    if os.stat(filename).st_size == 0:
        return 0
    with open(filename) as f:
        # Stream the file; nothing is printed, so callers' stdout stays clean.
        return sum(1 for _ in f)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
15
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
def intersect(fileA,fileB,outfile,fraction,reciprocal):
    '''
    Write the fileA intervals that overlap an interval in fileB to ``outfile``.

    Runs ``intersectBed -a fileA -b fileB -u -wa -f fraction [reciprocal]``
    with its standard output redirected to ``outfile``.

    fraction   -- value passed to intersectBed's ``-f`` option
    reciprocal -- extra flag(s) appended to the command; an empty string
                  adds nothing

    The exit status of intersectBed is not checked. OSError is raised if
    the program cannot be started at all.
    '''
    import subprocess

    # Pass an argument list instead of a shell string so that paths
    # containing spaces or shell metacharacters are handled verbatim.
    cmd = ['intersectBed', '-a', fileA, '-b', fileB,
           '-u', '-wa', '-f', str(fraction)]
    # split() turns '' into no arguments and '-r' into a single one.
    cmd.extend(reciprocal.split())
    with open(outfile, 'w') as out:
        subprocess.call(cmd, stdout=out)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
21
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N,
            randomIntervalScript='/Users/xuebing/galaxy-dist/tools/mytools/random_interval.py'):
    '''
    Shuffle fileA N times and return the distribution of overlaps with fileB.

    Each round runs ``randomIntervalScript`` on fileA, intersects the
    shuffled intervals with fileB via ``intersect`` and records the number
    of lines in the result via ``lineCount``.

    fraction, reciprocal -- forwarded unchanged to ``intersect``
    N                    -- number of shuffling rounds
    randomIntervalScript -- path of the shuffling script; defaults to the
                            location this tool was originally written for

    Returns a list of N overlap counts (empty when N is 0). OSError is
    raised if the ``python`` executable cannot be started.
    '''
    import shutil
    import subprocess
    import tempfile

    nOverlap = []
    # A private scratch directory keeps concurrent runs from clobbering
    # each other's intermediate files.
    workdir = tempfile.mkdtemp()
    shuffled = os.path.join(workdir, 'fileA.shuffled')
    overlaps = os.path.join(workdir, 'tmp')
    try:
        for _ in range(N):
            subprocess.call(['python', randomIntervalScript,
                             fileA, shuffled, 'across', genomefile])
            intersect(shuffled, fileB, overlaps, fraction, reciprocal)
            nOverlap.append(lineCount(overlaps))
            # Remove both files every round so a failed round can never
            # pick up the previous round's data.
            for path in (overlaps, shuffled):
                if os.path.exists(path):
                    os.remove(path)
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
    return nOverlap
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
36
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
def main():
    '''
    Command-line entry point.

    Arguments (from sys.argv):
        fileA fileB outfile outplot outshuffle N genomefile fraction [reciprocal]

    Steps:
      1. count the lines of fileA and fileB;
      2. intersect fileA with fileB into ``outfile`` and count the overlaps;
      3. shuffle fileA N times to get a null distribution of overlap
         counts, written tab-separated to ``outshuffle``;
      4. generate an R script and feed it to ``R --vanilla`` to draw a Venn
         diagram and a histogram of the null distribution into ``outplot``.

    Exits with a usage message when fewer than 8 arguments are given.
    OSError is raised if R cannot be started.
    '''
    import subprocess
    import tempfile

    if len(sys.argv) < 9:
        sys.exit('usage: intersectSig.py fileA fileB outfile outplot '
                 'outshuffle N genomefile fraction [-r]')

    fileA, fileB, outfile, outplot, outshuffle = sys.argv[1:6]
    N = int(sys.argv[6]) # times to shuffle
    genomefile = sys.argv[7]
    fraction = sys.argv[8]
    # The optional ninth argument can only be '-r'.
    reciprocal = sys.argv[9] if len(sys.argv) == 10 else ''

    # number of lines in input
    nA = lineCount(fileA)
    nB = lineCount(fileB)

    # intersect on real data, then count the overlaps
    intersect(fileA,fileB,outfile,fraction,reciprocal)
    nOverlapReal = lineCount(outfile)

    # shuffle fileA to estimate background
    nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
    with open(outshuffle,'w') as out:
        out.write("\t".join(map(str,nOverlapNull)))

    # plot histogram; a uniquely named script file avoids collisions
    # between concurrent runs sharing a working directory
    fd, scriptPath = tempfile.mkstemp(suffix='.r')
    try:
        with os.fdopen(fd,'w') as rscript:
            rscript.write(_buildRScript(nA,nB,nOverlapReal,nOverlapNull,outplot))
        with open(scriptPath) as rscript:
            subprocess.call(['R','--vanilla'],stdin=rscript)
    finally:
        os.remove(scriptPath)


def _buildRScript(nA,nB,nOverlapReal,nOverlapNull,outplot):
    # Return the text of the R script that draws the Venn diagram and the
    # histogram of shuffled overlap counts into the PDF file outplot.

    # Escape backslashes and single quotes so the path stays a valid
    # single-quoted R string literal.
    plotPath = outplot.replace('\\','\\\\').replace("'","\\'")
    lines = [
        "options(warn=-1)",
        "x0 <- "+str(nOverlapReal),
        "x <- c("+','.join(map(str,nOverlapNull))+")",
        "library(MASS)",
        # NOTE(review): the denominator is length(x); the usual empirical
        # p-value estimate divides by length(x)+1 -- confirm this is intended.
        "pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))",
        "title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')",
        "pdf('"+plotPath+"')",
        "library(grid)",
        "library(VennDiagram)",
        # Set A is 1..nA; set B is numbered so that it shares exactly
        # nOverlapReal elements with A.
        "venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')",
        "grid.draw(venn)",
        "h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)",
        "plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)",
        "points(x0,0,col='red')",
        "dev.off()",
    ]
    return "\n".join(lines)+"\n"
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse lineCount) has no side effects.
if __name__ == '__main__':
    main()