diff intervalOverlap.py @ 14:76e1b1b21cce default tip

Deleted selected files
author xuebing
date Tue, 13 Mar 2012 19:05:10 -0400
parents 292186c14b08
children
line wrap: on
line diff
--- a/intervalOverlap.py	Sat Mar 10 08:17:36 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,82 +0,0 @@
-'''
-find overlap and test signifiance
-'''
-
-import os,sys
-
-def lineCount(filename):
-    i = 0
-    with open(filename) as f:
-        for i, l in enumerate(f):
-            pass
-    return i + 1
-
-def intersect(fileA,fileB,outfile,fraction,reciprocal):
-    # return fileA intervals that overlap with interval in fileB
-    cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile
-    #print cmd
-    os.system(cmd)
-
-def parseIntersect(filename):
-    # get number of overlapped A and B
-    nA = 0
-    nB = 0
-    return nA,nb
-    
-def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
-    # shuffle fileA N times, return the distribution of overlaps
-    nOverlap = []
-    for i in range(N):
-        # shuffle fileA using shuffleBed
-        #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
-        # using random_interval.py
-        cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
-        os.system(cmd)
-        intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
-        nOverlap.append(lineCount('tmp'))
-    os.system('rm tmp')
-    os.system('rm fileA.shuffled')
-    return nOverlap
-
-def main():
-    fileA = sys.argv[1]
-    fileB = sys.argv[2]
-    outfile = sys.argv[3]
-    outplot = sys.argv[4]
-    N = int(sys.argv[5]) # times to shuffle
-    genomefile = sys.argv[6]
-    fraction = sys.argv[7]
-    if len(sys.argv) == 9:
-        reciprocal = sys.argv[8] # can only be '-r'
-    else:
-        reciprocal = ''
-
-    print sys.argv
-    
-    # intersect on real data
-    intersect(fileA,fileB,outfile,fraction,reciprocal)
-    # number of overlaps
-    nOverlapReal = lineCount(outfile)
-
-    print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
-    
-    # shuffle fileA to estimate background
-    nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
-
-    # plot histogram
-    rscript = open('tmp.r','w')
-    rscript.write("x0 <- "+str(nOverlapReal)+"\n")
-    rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
-    rscript.write("library(MASS)\n")
-    rscript.write("\n")
-    rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
-    rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
-    rscript.write("pdf('"+outplot+"')\n")
-    rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
-    rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
-    rscript.write("points(x0,0,col='red')\n")
-    rscript.write("dev.off()")
-    rscript.close()
-    os.system("R --vanilla < tmp.r")    
-    os.system('rm tmp.r')
-main()