| 2 | 1 ''' | 
|  | 2 find overlap and test signifiance | 
|  | 3 ''' | 
|  | 4 | 
|  | 5 import os,sys | 
|  | 6 | 
|  | 7 def lineCount(filename): | 
|  | 8     if os.stat(filename).st_size == 0: | 
|  | 9         return 0 | 
|  | 10     with open(filename) as f: | 
|  | 11         for i, l in enumerate(f): | 
|  | 12             pass | 
|  | 13             print i | 
|  | 14     return i+1 | 
|  | 15 | 
|  | 16 def intersect(fileA,fileB,outfile,fraction,reciprocal): | 
|  | 17     # return fileA intervals that overlap with interval in fileB | 
|  | 18     cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile | 
|  | 19     #print cmd | 
|  | 20     os.system(cmd) | 
|  | 21 | 
|  | 22 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): | 
|  | 23     # shuffle fileA N times, return the distribution of overlaps | 
|  | 24     nOverlap = [] | 
|  | 25     for i in range(N): | 
|  | 26         # shuffle fileA using shuffleBed | 
|  | 27         #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' | 
|  | 28         # using random_interval.py | 
|  | 29         cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/bed_shuffle.py '+fileA+' fileA.shuffled across '+genomefile | 
|  | 30         os.system(cmd) | 
|  | 31         intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) | 
|  | 32         nOverlap.append(lineCount('tmp')) | 
|  | 33     os.system('rm tmp') | 
|  | 34     os.system('rm fileA.shuffled') | 
|  | 35     return nOverlap | 
|  | 36 | 
|  | 37 def main(): | 
|  | 38     fileA = sys.argv[1] | 
|  | 39     fileB = sys.argv[2] | 
|  | 40     outfile = sys.argv[3] | 
|  | 41     outplot = sys.argv[4] | 
|  | 42     outshuffle = sys.argv[5] | 
|  | 43     N = int(sys.argv[6]) # times to shuffle | 
|  | 44     genomefile = sys.argv[7] | 
|  | 45     fraction = sys.argv[8] | 
|  | 46     if len(sys.argv) == 10: | 
|  | 47         reciprocal = sys.argv[9] # can only be '-r' | 
|  | 48     else: | 
|  | 49         reciprocal = '' | 
|  | 50 | 
|  | 51     #print sys.argv | 
|  | 52 | 
|  | 53     # number of lines in input | 
|  | 54     nA = lineCount(fileA) | 
|  | 55     nB = lineCount(fileB) | 
|  | 56 | 
|  | 57     # intersect on real data | 
|  | 58     intersect(fileA,fileB,outfile,fraction,reciprocal) | 
|  | 59     # number of overlaps | 
|  | 60     nOverlapReal = lineCount(outfile) | 
|  | 61 | 
|  | 62     #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal | 
|  | 63 | 
|  | 64     # shuffle fileA to estimate background | 
|  | 65     nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) | 
|  | 66     out = open(outshuffle,'w') | 
|  | 67     out.write("\t".join(map(str,nOverlapNull))) | 
|  | 68     out.close() | 
|  | 69 | 
|  | 70     # plot histogram | 
|  | 71     rscript = open('tmp.r','w') | 
|  | 72     rscript.write("options(warn=-1)\n") | 
|  | 73     rscript.write("x0 <- "+str(nOverlapReal)+"\n") | 
|  | 74     rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") | 
|  | 75     rscript.write("library(MASS)\n") | 
|  | 76     rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") | 
|  | 77     rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") | 
|  | 78     rscript.write("pdf('"+outplot+"')\n") | 
|  | 79     rscript.write("library(grid)\n") | 
|  | 80     rscript.write("library(VennDiagram)\n") | 
|  | 81     rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n") | 
|  | 82     rscript.write("grid.draw(venn)\n") | 
|  | 83     rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") | 
|  | 84     rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") | 
|  | 85     rscript.write("points(x0,0,col='red')\n") | 
|  | 86     rscript.write("dev.off()\n") | 
|  | 87     rscript.close() | 
|  | 88     os.system("R --vanilla < tmp.r") | 
|  | 89     os.system('rm tmp.r') | 
|  | 90 main() |