diff shuffleBed.py @ 11:b7f1d9f8f3bc

Uploaded
author xuebing
date Sat, 10 Mar 2012 07:59:27 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/shuffleBed.py	Sat Mar 10 07:59:27 2012 -0500
@@ -0,0 +1,107 @@
+'''
+simulate a random interval set that mimics the size and strand of a reference set 
+'''
+
+def inferSizeFromRefBed(filename,header):
+    '''
+    read reference interval set, get chrom size information
+    '''
+    chrSize = {}
+    f = open(filename)
+    if header:
+        header = f.readline()
+    for line in f:
+        flds = line.strip().split('\t')
+        if not chrSize.has_key(flds[0]):
+            chrSize[flds[0]] = int(flds[2])
+        elif chrSize[flds[0]] < int(flds[2]):
+            chrSize[flds[0]] = int(flds[2])
+    f.close()
+    return chrSize 
+
+def getChrSize(filename):
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split('\t')
+        if len(flds) >1:
+            chrSize[flds[0]] = int(flds[1])
+    f.close()
+    return chrSize
+    
+def makeWeightedChrom(chrSize):
+    '''
+    make a list of chr_id, the freq is proportional to its length
+    '''
+     
+    genome_len = 0
+    
+    for chrom in chrSize:
+        chrom_len = chrSize[chrom]
+        genome_len += chrom_len
+
+    weighted_chrom = []
+    for chrom in chrSize:
+        weight = int(round(1000*float(chrSize[chrom])/genome_len))
+        weighted_chrom += [chrom]*weight
+
+    return weighted_chrom            
+
+def randomIntervalWithinChrom(infile,outfile,chrSize,header):
+    '''
+    '''
+    fin = open(infile)
+    if header:
+        header = fin.readline()
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+    fin.close()
+    fout.close()   
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
+    '''
+    '''
+    fin = open(infile)
+    if header:
+        header = fin.readline()
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        # find a random chrom
+        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+        # random start in the chrom
+        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+    fin.close()
+    fout.close()            
+
+import sys,random
+def main():
+    # python random_interval.py test100.bed testout.bed across header human.hg18.genome 
+
+    reference_interval_file = sys.argv[1]
+    output_file = sys.argv[2]
+    across_or_within_chrom = sys.argv[3] # within or across 
+    if sys.argv[4] == 'header':
+        header = True 
+    else:
+        header = False
+    if len(sys.argv) == 6:
+        chrom_size_file = sys.argv[5]
+        chrSize = getChrSize(chrom_size_file)
+    else:
+        chrSize = inferSizeFromRefBed(reference_interval_file,header) 
+    if across_or_within_chrom == 'within':            
+        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
+    else:
+        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)   
+main()