Mercurial > repos > xuebing > sharplabtool
comparison intervalOverlap.py @ 11:b7f1d9f8f3bc
Uploaded
| author | xuebing |
|---|---|
| date | Sat, 10 Mar 2012 07:59:27 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:1558594a3c2e | 11:b7f1d9f8f3bc |
|---|---|
| 1 ''' | |
| 2 find overlap and test signifiance | |
| 3 ''' | |
| 4 | |
| 5 import os,sys | |
| 6 | |
| 7 def lineCount(filename): | |
| 8 i = 0 | |
| 9 with open(filename) as f: | |
| 10 for i, l in enumerate(f): | |
| 11 pass | |
| 12 return i + 1 | |
| 13 | |
| 14 def intersect(fileA,fileB,outfile,fraction,reciprocal): | |
| 15 # return fileA intervals that overlap with interval in fileB | |
| 16 cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile | |
| 17 #print cmd | |
| 18 os.system(cmd) | |
| 19 | |
| 20 def parseIntersect(filename): | |
| 21 # get number of overlapped A and B | |
| 22 nA = 0 | |
| 23 nB = 0 | |
| 24 return nA,nb | |
| 25 | |
| 26 def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): | |
| 27 # shuffle fileA N times, return the distribution of overlaps | |
| 28 nOverlap = [] | |
| 29 for i in range(N): | |
| 30 # shuffle fileA using shuffleBed | |
| 31 #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' | |
| 32 # using random_interval.py | |
| 33 cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile | |
| 34 os.system(cmd) | |
| 35 intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) | |
| 36 nOverlap.append(lineCount('tmp')) | |
| 37 os.system('rm tmp') | |
| 38 os.system('rm fileA.shuffled') | |
| 39 return nOverlap | |
| 40 | |
| 41 def main(): | |
| 42 fileA = sys.argv[1] | |
| 43 fileB = sys.argv[2] | |
| 44 outfile = sys.argv[3] | |
| 45 outplot = sys.argv[4] | |
| 46 N = int(sys.argv[5]) # times to shuffle | |
| 47 genomefile = sys.argv[6] | |
| 48 fraction = sys.argv[7] | |
| 49 if len(sys.argv) == 9: | |
| 50 reciprocal = sys.argv[8] # can only be '-r' | |
| 51 else: | |
| 52 reciprocal = '' | |
| 53 | |
| 54 print sys.argv | |
| 55 | |
| 56 # intersect on real data | |
| 57 intersect(fileA,fileB,outfile,fraction,reciprocal) | |
| 58 # number of overlaps | |
| 59 nOverlapReal = lineCount(outfile) | |
| 60 | |
| 61 print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal | |
| 62 | |
| 63 # shuffle fileA to estimate background | |
| 64 nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) | |
| 65 | |
| 66 # plot histogram | |
| 67 rscript = open('tmp.r','w') | |
| 68 rscript.write("x0 <- "+str(nOverlapReal)+"\n") | |
| 69 rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") | |
| 70 rscript.write("library(MASS)\n") | |
| 71 rscript.write("\n") | |
| 72 rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") | |
| 73 rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") | |
| 74 rscript.write("pdf('"+outplot+"')\n") | |
| 75 rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") | |
| 76 rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") | |
| 77 rscript.write("points(x0,0,col='red')\n") | |
| 78 rscript.write("dev.off()") | |
| 79 rscript.close() | |
| 80 os.system("R --vanilla < tmp.r") | |
| 81 os.system('rm tmp.r') | |
| 82 main() |
