| 18 | 1 ''' | 
|  | 2 find overlap and test signifiance | 
|  | 3 ''' | 
|  | 4 | 
|  | 5 import os,sys | 
|  | 6 | 
|  | 7 def lineCount(filename): | 
|  | 8     i = 0 | 
|  | 9     with open(filename) as f: | 
|  | 10         for i, l in enumerate(f): | 
|  | 11             pass | 
|  | 12     return i + 1 | 
|  | 13 | 
|  | 14 def intersect(fileA,fileB,outfile,fraction,reciprocal): | 
|  | 15     # return fileA intervals that overlap with interval in fileB | 
|  | 16     cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile | 
|  | 17     #print cmd | 
|  | 18     os.system(cmd) | 
|  | 19 | 
|  | 20 def parseIntersect(filename): | 
|  | 21     # get number of overlapped A and B | 
|  | 22     nA = 0 | 
|  | 23     nB = 0 | 
|  | 24     return nA,nb | 
|  | 25 | 
|  | 26 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): | 
|  | 27     # shuffle fileA N times, return the distribution of overlaps | 
|  | 28     nOverlap = [] | 
|  | 29     for i in range(N): | 
|  | 30         # shuffle fileA using shuffleBed | 
|  | 31         #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' | 
|  | 32         # using random_interval.py | 
|  | 33         cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile | 
|  | 34         os.system(cmd) | 
|  | 35         intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) | 
|  | 36         nOverlap.append(lineCount('tmp')) | 
|  | 37     os.system('rm tmp') | 
|  | 38     os.system('rm fileA.shuffled') | 
|  | 39     return nOverlap | 
|  | 40 | 
|  | 41 def main(): | 
|  | 42     fileA = sys.argv[1] | 
|  | 43     fileB = sys.argv[2] | 
|  | 44     outfile = sys.argv[3] | 
|  | 45     outplot = sys.argv[4] | 
|  | 46     N = int(sys.argv[5]) # times to shuffle | 
|  | 47     genomefile = sys.argv[6] | 
|  | 48     fraction = sys.argv[7] | 
|  | 49     if len(sys.argv) == 9: | 
|  | 50         reciprocal = sys.argv[8] # can only be '-r' | 
|  | 51     else: | 
|  | 52         reciprocal = '' | 
|  | 53 | 
|  | 54     print sys.argv | 
|  | 55 | 
|  | 56     # intersect on real data | 
|  | 57     intersect(fileA,fileB,outfile,fraction,reciprocal) | 
|  | 58     # number of overlaps | 
|  | 59     nOverlapReal = lineCount(outfile) | 
|  | 60 | 
|  | 61     print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal | 
|  | 62 | 
|  | 63     # shuffle fileA to estimate background | 
|  | 64     nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) | 
|  | 65 | 
|  | 66     # plot histogram | 
|  | 67     rscript = open('tmp.r','w') | 
|  | 68     rscript.write("x0 <- "+str(nOverlapReal)+"\n") | 
|  | 69     rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") | 
|  | 70     rscript.write("library(MASS)\n") | 
|  | 71     rscript.write("\n") | 
|  | 72     rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") | 
|  | 73     rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") | 
|  | 74     rscript.write("pdf('"+outplot+"')\n") | 
|  | 75     rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") | 
|  | 76     rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") | 
|  | 77     rscript.write("points(x0,0,col='red')\n") | 
|  | 78     rscript.write("dev.off()") | 
|  | 79     rscript.close() | 
|  | 80     os.system("R --vanilla < tmp.r") | 
|  | 81     os.system('rm tmp.r') | 
|  | 82 main() |