comparison shuffleBed.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
comparison
equal deleted inserted replaced
19:d325683ec368 20:16ba480adf96
1 '''
2 simulate a random interval set that mimics the size and strand of a reference set
3 '''
4
5 def inferSizeFromRefBed(filename,header):
6 '''
7 read reference interval set, get chrom size information
8 '''
9 chrSize = {}
10 f = open(filename)
11 if header:
12 header = f.readline()
13 for line in f:
14 flds = line.strip().split('\t')
15 if not chrSize.has_key(flds[0]):
16 chrSize[flds[0]] = int(flds[2])
17 elif chrSize[flds[0]] < int(flds[2]):
18 chrSize[flds[0]] = int(flds[2])
19 f.close()
20 return chrSize
21
22 def getChrSize(filename):
23 chrSize = {}
24 f = open(filename)
25 for line in f:
26 flds = line.strip().split('\t')
27 if len(flds) >1:
28 chrSize[flds[0]] = int(flds[1])
29 f.close()
30 return chrSize
31
32 def makeWeightedChrom(chrSize):
33 '''
34 make a list of chr_id, the freq is proportional to its length
35 '''
36
37 genome_len = 0
38
39 for chrom in chrSize:
40 chrom_len = chrSize[chrom]
41 genome_len += chrom_len
42
43 weighted_chrom = []
44 for chrom in chrSize:
45 weight = int(round(1000*float(chrSize[chrom])/genome_len))
46 weighted_chrom += [chrom]*weight
47
48 return weighted_chrom
49
50 def randomIntervalWithinChrom(infile,outfile,chrSize,header):
51 '''
52 '''
53 fin = open(infile)
54 if header:
55 header = fin.readline()
56 fout = open(outfile,'w')
57 n = 0
58 for line in fin:
59 n = n + 1
60 flds = line.strip().split('\t')
61 interval_size = int(flds[2]) - int(flds[1])
62 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
63 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
64 fin.close()
65 fout.close()
66
67 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
68 '''
69 '''
70 fin = open(infile)
71 if header:
72 header = fin.readline()
73 fout = open(outfile,'w')
74 n = 0
75 for line in fin:
76 n = n + 1
77 flds = line.strip().split('\t')
78 interval_size = int(flds[2]) - int(flds[1])
79 # find a random chrom
80 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
81 # random start in the chrom
82 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
83 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
84 fin.close()
85 fout.close()
86
87 import sys,random
88 def main():
89 # python random_interval.py test100.bed testout.bed across header human.hg18.genome
90
91 reference_interval_file = sys.argv[1]
92 output_file = sys.argv[2]
93 across_or_within_chrom = sys.argv[3] # within or across
94 if sys.argv[4] == 'header':
95 header = True
96 else:
97 header = False
98 if len(sys.argv) == 6:
99 chrom_size_file = sys.argv[5]
100 chrSize = getChrSize(chrom_size_file)
101 else:
102 chrSize = inferSizeFromRefBed(reference_interval_file,header)
103 if across_or_within_chrom == 'within':
104 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
105 else:
106 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)
107 main()