Mercurial > repos > xuebing > sharplab_interval_analysis
comparison shuffleBed.py @ 20:16ba480adf96
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 08:31:22 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
19:d325683ec368 | 20:16ba480adf96 |
---|---|
1 ''' | |
2 simulate a random interval set that mimics the size and strand of a reference set | |
3 ''' | |
4 | |
5 def inferSizeFromRefBed(filename,header): | |
6 ''' | |
7 read reference interval set, get chrom size information | |
8 ''' | |
9 chrSize = {} | |
10 f = open(filename) | |
11 if header: | |
12 header = f.readline() | |
13 for line in f: | |
14 flds = line.strip().split('\t') | |
15 if not chrSize.has_key(flds[0]): | |
16 chrSize[flds[0]] = int(flds[2]) | |
17 elif chrSize[flds[0]] < int(flds[2]): | |
18 chrSize[flds[0]] = int(flds[2]) | |
19 f.close() | |
20 return chrSize | |
21 | |
22 def getChrSize(filename): | |
23 chrSize = {} | |
24 f = open(filename) | |
25 for line in f: | |
26 flds = line.strip().split('\t') | |
27 if len(flds) >1: | |
28 chrSize[flds[0]] = int(flds[1]) | |
29 f.close() | |
30 return chrSize | |
31 | |
32 def makeWeightedChrom(chrSize): | |
33 ''' | |
34 make a list of chr_id, the freq is proportional to its length | |
35 ''' | |
36 | |
37 genome_len = 0 | |
38 | |
39 for chrom in chrSize: | |
40 chrom_len = chrSize[chrom] | |
41 genome_len += chrom_len | |
42 | |
43 weighted_chrom = [] | |
44 for chrom in chrSize: | |
45 weight = int(round(1000*float(chrSize[chrom])/genome_len)) | |
46 weighted_chrom += [chrom]*weight | |
47 | |
48 return weighted_chrom | |
49 | |
50 def randomIntervalWithinChrom(infile,outfile,chrSize,header): | |
51 ''' | |
52 ''' | |
53 fin = open(infile) | |
54 if header: | |
55 header = fin.readline() | |
56 fout = open(outfile,'w') | |
57 n = 0 | |
58 for line in fin: | |
59 n = n + 1 | |
60 flds = line.strip().split('\t') | |
61 interval_size = int(flds[2]) - int(flds[1]) | |
62 rstart = random.randint(0,chrSize[flds[0]]-interval_size) | |
63 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') | |
64 fin.close() | |
65 fout.close() | |
66 | |
67 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): | |
68 ''' | |
69 ''' | |
70 fin = open(infile) | |
71 if header: | |
72 header = fin.readline() | |
73 fout = open(outfile,'w') | |
74 n = 0 | |
75 for line in fin: | |
76 n = n + 1 | |
77 flds = line.strip().split('\t') | |
78 interval_size = int(flds[2]) - int(flds[1]) | |
79 # find a random chrom | |
80 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] | |
81 # random start in the chrom | |
82 rstart = random.randint(0,chrSize[flds[0]]-interval_size) | |
83 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') | |
84 fin.close() | |
85 fout.close() | |
86 | |
87 import sys,random | |
88 def main(): | |
89 # python random_interval.py test100.bed testout.bed across header human.hg18.genome | |
90 | |
91 reference_interval_file = sys.argv[1] | |
92 output_file = sys.argv[2] | |
93 across_or_within_chrom = sys.argv[3] # within or across | |
94 if sys.argv[4] == 'header': | |
95 header = True | |
96 else: | |
97 header = False | |
98 if len(sys.argv) == 6: | |
99 chrom_size_file = sys.argv[5] | |
100 chrSize = getChrSize(chrom_size_file) | |
101 else: | |
102 chrSize = inferSizeFromRefBed(reference_interval_file,header) | |
103 if across_or_within_chrom == 'within': | |
104 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) | |
105 else: | |
106 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) | |
107 main() |