20
|
1 '''
|
|
2 simulate a random interval set that mimics the size and strand of a reference set
|
|
3 '''
|
|
4
|
|
5 def inferSizeFromRefBed(filename,header):
|
|
6 '''
|
|
7 read reference interval set, get chrom size information
|
|
8 '''
|
|
9 chrSize = {}
|
|
10 f = open(filename)
|
|
11 if header:
|
|
12 header = f.readline()
|
|
13 for line in f:
|
|
14 flds = line.strip().split('\t')
|
|
15 if not chrSize.has_key(flds[0]):
|
|
16 chrSize[flds[0]] = int(flds[2])
|
|
17 elif chrSize[flds[0]] < int(flds[2]):
|
|
18 chrSize[flds[0]] = int(flds[2])
|
|
19 f.close()
|
|
20 return chrSize
|
|
21
|
|
22 def getChrSize(filename):
|
|
23 chrSize = {}
|
|
24 f = open(filename)
|
|
25 for line in f:
|
|
26 flds = line.strip().split('\t')
|
|
27 if len(flds) >1:
|
|
28 chrSize[flds[0]] = int(flds[1])
|
|
29 f.close()
|
|
30 return chrSize
|
|
31
|
|
32 def makeWeightedChrom(chrSize):
|
|
33 '''
|
|
34 make a list of chr_id, the freq is proportional to its length
|
|
35 '''
|
|
36
|
|
37 genome_len = 0
|
|
38
|
|
39 for chrom in chrSize:
|
|
40 chrom_len = chrSize[chrom]
|
|
41 genome_len += chrom_len
|
|
42
|
|
43 weighted_chrom = []
|
|
44 for chrom in chrSize:
|
|
45 weight = int(round(1000*float(chrSize[chrom])/genome_len))
|
|
46 weighted_chrom += [chrom]*weight
|
|
47
|
|
48 return weighted_chrom
|
|
49
|
|
50 def randomIntervalWithinChrom(infile,outfile,chrSize,header):
|
|
51 '''
|
|
52 '''
|
|
53 fin = open(infile)
|
|
54 if header:
|
|
55 header = fin.readline()
|
|
56 fout = open(outfile,'w')
|
|
57 n = 0
|
|
58 for line in fin:
|
|
59 n = n + 1
|
|
60 flds = line.strip().split('\t')
|
|
61 interval_size = int(flds[2]) - int(flds[1])
|
|
62 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
|
|
63 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
|
|
64 fin.close()
|
|
65 fout.close()
|
|
66
|
|
67 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
|
|
68 '''
|
|
69 '''
|
|
70 fin = open(infile)
|
|
71 if header:
|
|
72 header = fin.readline()
|
|
73 fout = open(outfile,'w')
|
|
74 n = 0
|
|
75 for line in fin:
|
|
76 n = n + 1
|
|
77 flds = line.strip().split('\t')
|
|
78 interval_size = int(flds[2]) - int(flds[1])
|
|
79 # find a random chrom
|
|
80 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
|
|
81 # random start in the chrom
|
|
82 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
|
|
83 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
|
|
84 fin.close()
|
|
85 fout.close()
|
|
86
|
|
87 import sys,random
|
|
88 def main():
|
|
89 # python random_interval.py test100.bed testout.bed across header human.hg18.genome
|
|
90
|
|
91 reference_interval_file = sys.argv[1]
|
|
92 output_file = sys.argv[2]
|
|
93 across_or_within_chrom = sys.argv[3] # within or across
|
|
94 if sys.argv[4] == 'header':
|
|
95 header = True
|
|
96 else:
|
|
97 header = False
|
|
98 if len(sys.argv) == 6:
|
|
99 chrom_size_file = sys.argv[5]
|
|
100 chrSize = getChrSize(chrom_size_file)
|
|
101 else:
|
|
102 chrSize = inferSizeFromRefBed(reference_interval_file,header)
|
|
103 if across_or_within_chrom == 'within':
|
|
104 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
|
|
105 else:
|
|
106 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)
|
|
107 main()
|