Mercurial > repos > xuebing > sharplab_interval_analysis
diff intervalOverlap.py @ 20:16ba480adf96
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 08:31:22 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/intervalOverlap.py Sat Mar 31 08:31:22 2012 -0400 @@ -0,0 +1,82 @@ +''' +find overlap and test signifiance +''' + +import os,sys + +def lineCount(filename): + i = 0 + with open(filename) as f: + for i, l in enumerate(f): + pass + return i + 1 + +def intersect(fileA,fileB,outfile,fraction,reciprocal): + # return fileA intervals that overlap with interval in fileB + cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile + #print cmd + os.system(cmd) + +def parseIntersect(filename): + # get number of overlapped A and B + nA = 0 + nB = 0 + return nA,nb + +def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): + # shuffle fileA N times, return the distribution of overlaps + nOverlap = [] + for i in range(N): + # shuffle fileA using shuffleBed + #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' + # using random_interval.py + cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile + os.system(cmd) + intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) + nOverlap.append(lineCount('tmp')) + os.system('rm tmp') + os.system('rm fileA.shuffled') + return nOverlap + +def main(): + fileA = sys.argv[1] + fileB = sys.argv[2] + outfile = sys.argv[3] + outplot = sys.argv[4] + N = int(sys.argv[5]) # times to shuffle + genomefile = sys.argv[6] + fraction = sys.argv[7] + if len(sys.argv) == 9: + reciprocal = sys.argv[8] # can only be '-r' + else: + reciprocal = '' + + print sys.argv + + # intersect on real data + intersect(fileA,fileB,outfile,fraction,reciprocal) + # number of overlaps + nOverlapReal = lineCount(outfile) + + print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal + + # shuffle fileA to estimate background + nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) + + # plot histogram + rscript = open('tmp.r','w') + rscript.write("x0 <- "+str(nOverlapReal)+"\n") + rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") + rscript.write("library(MASS)\n") + rscript.write("\n") + rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") + rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") + rscript.write("pdf('"+outplot+"')\n") + rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") + rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") + rscript.write("points(x0,0,col='red')\n") + rscript.write("dev.off()") + rscript.close() + os.system("R --vanilla < tmp.r") + os.system('rm tmp.r') +main()