diff intersectSig.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/intersectSig.py	Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,90 @@
+'''
+find overlap and test signifiance
+'''
+
+import os,sys
+
+def lineCount(filename):
+    if os.stat(filename).st_size == 0:
+        return 0
+    with open(filename) as f:
+        for i, l in enumerate(f):
+            pass
+            print i
+    return i+1
+
+def intersect(fileA,fileB,outfile,fraction,reciprocal):
+    # return fileA intervals that overlap with interval in fileB
+    cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile
+    #print cmd
+    os.system(cmd)
+    
+def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
+    # shuffle fileA N times, return the distribution of overlaps
+    nOverlap = []
+    for i in range(N):
+        # shuffle fileA using shuffleBed
+        #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
+        # using random_interval.py
+        cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
+        os.system(cmd)
+        intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
+        nOverlap.append(lineCount('tmp'))
+    os.system('rm tmp')
+    os.system('rm fileA.shuffled')
+    return nOverlap
+
+def main():
+    fileA = sys.argv[1]
+    fileB = sys.argv[2]
+    outfile = sys.argv[3]
+    outplot = sys.argv[4]
+    outshuffle = sys.argv[5]
+    N = int(sys.argv[6]) # times to shuffle
+    genomefile = sys.argv[7]
+    fraction = sys.argv[8]
+    if len(sys.argv) == 10:
+        reciprocal = sys.argv[9] # can only be '-r'
+    else:
+        reciprocal = ''
+
+    #print sys.argv
+
+    # number of lines in input
+    nA = lineCount(fileA)
+    nB = lineCount(fileB)    
+
+    # intersect on real data
+    intersect(fileA,fileB,outfile,fraction,reciprocal)
+    # number of overlaps
+    nOverlapReal = lineCount(outfile)
+
+    #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
+    
+    # shuffle fileA to estimate background
+    nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
+    out = open(outshuffle,'w')
+    out.write("\t".join(map(str,nOverlapNull)))
+    out.close()
+
+    # plot histogram
+    rscript = open('tmp.r','w')
+    rscript.write("options(warn=-1)\n")
+    rscript.write("x0 <- "+str(nOverlapReal)+"\n")
+    rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
+    rscript.write("library(MASS)\n")
+    rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
+    rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
+    rscript.write("pdf('"+outplot+"')\n")
+    rscript.write("library(grid)\n")
+    rscript.write("library(VennDiagram)\n")
+    rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n")
+    rscript.write("grid.draw(venn)\n")
+    rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("points(x0,0,col='red')\n")
+    rscript.write("dev.off()\n")
+    rscript.close()
+    os.system("R --vanilla < tmp.r")    
+    os.system('rm tmp.r')
+main()