diff random_interval.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_interval.py	Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,96 @@
+'''
+simulate a random interval set that mimics the size and strand of a reference set 
+'''
+
+def inferSizeFromRefBed(filename):
+    '''
+    read reference interval set, get chrom size information
+    '''
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split('\t')
+        if not chrSize.has_key(flds[0]):
+            chrSize[flds[0]] = int(flds[2])
+        elif chrSize[flds[0]] < int(flds[2]):
+            chrSize[flds[0]] = int(flds[2])
+    f.close()
+    return chrSize 
+
+def getChrSize(filename):
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split()
+        if len(flds) >1:
+            chrSize[flds[0]] = int(flds[1])
+    f.close()
+    return chrSize
+    
+def makeWeightedChrom(chrSize):
+    '''
+    make a list of chr_id, the freq is proportional to its length
+    '''
+     
+    genome_len = 0
+    
+    for chrom in chrSize:
+        chrom_len = chrSize[chrom]
+        genome_len += chrom_len
+
+    weighted_chrom = []
+    for chrom in chrSize:
+        weight = int(round(1000*float(chrSize[chrom])/genome_len))
+        weighted_chrom += [chrom]*weight
+
+    return weighted_chrom            
+
+def randomIntervalWithinChrom(infile,outfile,chrSize):
+    '''
+    '''
+    fin = open(infile)
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+        flds[2] = str(int(flds[1])+interval_size)
+        fout.write('\t'.join(flds)+'\n')
+    fin.close()
+    fout.close()   
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
+    '''
+    '''
+    fin = open(infile)
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        # find a random chrom
+        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+        flds[2] = str(int(flds[1])+interval_size)
+        fout.write('\t'.join(flds)+'\n')
+    fin.close()
+    fout.close()            
+
+import sys,random
+def main():
+    # python random_interval.py test100.bed testout.bed across human.hg18.genome 
+
+    reference_interval_file = sys.argv[1]
+    output_file = sys.argv[2]
+    across_or_within_chrom = sys.argv[3] # within or across 
+    chrom_size_file = sys.argv[4]
+    chrSize = getChrSize(chrom_size_file)
+    print chrSize.keys()
+    if across_or_within_chrom == 'within':            
+        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
+    else:
+        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))   
+main()