Mercurial > repos > xuebing > sharplabtool
comparison tools/encode/random_intervals_no_bits.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 #Dan Blankenberg | |
3 #%prog bounding_region_file mask_intervals_file intervals_to_mimic_file out_file mask_chr mask_start mask_end interval_chr interval_start interval_end interval_strand use_mask allow_strand_overlaps | |
4 import sys, random | |
5 from copy import deepcopy | |
6 from galaxy import eggs | |
7 import pkg_resources | |
8 pkg_resources.require( "bx-python" ) | |
9 import bx.intervals.io | |
10 import bx.intervals.intersection | |
11 import psyco_full | |
12 | |
13 assert sys.version_info[:2] >= ( 2, 4 ) | |
14 | |
15 max_iters = 5 | |
16 | |
17 def stop_err( msg ): | |
18 sys.stderr.write( msg ) | |
19 sys.exit() | |
20 | |
21 #Try to add a random region | |
22 def add_random_region( mimic_region, bound, exist_regions, plus_mask, minus_mask, overlaps ): | |
23 region_length, region_strand = mimic_region | |
24 plus_count = plus_mask.count_range() | |
25 minus_count = minus_mask.count_range() | |
26 gaps = [] | |
27 | |
28 if region_strand == "-": | |
29 gaps = minus_mask.get_gaps( region_length ) | |
30 else: | |
31 gaps = plus_mask.get_gaps( region_length ) | |
32 | |
33 while True: | |
34 try: | |
35 gap_length, gap_start, gap_end = gaps.pop( random.randint( 0, len( gaps ) - 1 ) ) | |
36 except: | |
37 break | |
38 try: | |
39 start = random.randint( bound.start + gap_start, bound.start + gap_end - region_length - 1 ) | |
40 except ValueError, ve: | |
41 stop_err( "Exception thrown generating random start value: %s" %str( ve ) ) | |
42 | |
43 end = start + region_length | |
44 try_plus_mask = plus_mask.copy() | |
45 try_minus_mask = minus_mask.copy() | |
46 | |
47 if region_strand == "-": | |
48 try_minus_mask.set_range( start - bound.start, end - bound.start ) | |
49 else: | |
50 try_plus_mask.set_range( start - bound.start, end - bound.start ) | |
51 | |
52 rand_region = bx.intervals.io.GenomicInterval( None, [bound.chrom, start, end, region_strand], 0, 1, 2, 3, "+", fix_strand=True ) | |
53 | |
54 if try_plus_mask.count_range() == plus_count + region_length or try_minus_mask.count_range() == minus_count + region_length: | |
55 if overlaps in ["strand", "all"]: #overlaps allowed across strands | |
56 exist_regions.append( rand_region ) | |
57 if overlaps == "strand": | |
58 return exist_regions, True, try_plus_mask, try_minus_mask | |
59 else: #overlaps allowed everywhere | |
60 return exist_regions, True, plus_mask, minus_mask | |
61 else: #no overlapping anywhere | |
62 exist_regions.append( rand_region ) | |
63 if region_strand == "-": | |
64 return exist_regions, True, try_minus_mask.copy(), try_minus_mask | |
65 else: | |
66 return exist_regions, True, try_plus_mask, try_plus_mask.copy() | |
67 return exist_regions, False, plus_mask, minus_mask | |
68 | |
69 def main(): | |
70 includes_strand = False | |
71 region_uid = sys.argv[1] | |
72 mask_fname = sys.argv[2] | |
73 intervals_fname = sys.argv[3] | |
74 out_fname = sys.argv[4] | |
75 try: | |
76 mask_chr = int( sys.argv[5] ) - 1 | |
77 except: | |
78 stop_err( "'%s' is an invalid chrom column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[5] ) ) | |
79 try: | |
80 mask_start = int( sys.argv[6] ) - 1 | |
81 except: | |
82 stop_err( "'%s' is an invalid start column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[6] ) ) | |
83 try: | |
84 mask_end = int( sys.argv[7] ) - 1 | |
85 except: | |
86 stop_err( "'%s' is an invalid end column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[7] ) ) | |
87 try: | |
88 interval_chr = int( sys.argv[8] ) - 1 | |
89 except: | |
90 stop_err( "'%s' is an invalid chrom column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[8] ) ) | |
91 try: | |
92 interval_start = int( sys.argv[9] ) - 1 | |
93 except: | |
94 stop_err( "'%s' is an invalid start column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[9] ) ) | |
95 try: | |
96 interval_end = int( sys.argv[10] ) - 1 | |
97 except: | |
98 stop_err( "'%s' is an invalid end column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[10] ) ) | |
99 try: | |
100 interval_strand = int( sys.argv[11] ) - 1 | |
101 includes_strand = True | |
102 except: | |
103 interval_strand = -1 | |
104 if includes_strand: | |
105 use_mask = sys.argv[12] | |
106 overlaps = sys.argv[13] | |
107 else: | |
108 use_mask = sys.argv[11] | |
109 overlaps = sys.argv[12] | |
110 available_regions = {} | |
111 loc_file = "%s/regions.loc" % sys.argv[-1] | |
112 | |
113 for i, line in enumerate( file( loc_file ) ): | |
114 line = line.rstrip( '\r\n' ) | |
115 if line and not line.startswith( '#' ): | |
116 fields = line.split( '\t' ) | |
117 #read each line, if not enough fields, go to next line | |
118 try: | |
119 build = fields[0] | |
120 uid = fields[1] | |
121 description = fields[2] | |
122 filepath = fields[3] | |
123 available_regions[uid] = filepath | |
124 except: | |
125 continue | |
126 | |
127 if region_uid not in available_regions: | |
128 stop_err( "Region '%s' is invalid." % region_uid ) | |
129 region_fname = available_regions[region_uid].strip() | |
130 | |
131 #set up bounding regions to hold random intervals | |
132 bounds = [] | |
133 for bound in bx.intervals.io.NiceReaderWrapper( open( region_fname, 'r' ), chrom_col=0, start_col=1, end_col=2, fix_strand=True, return_header=False, return_comments=False ): | |
134 bounds.append( bound ) | |
135 #set up length and number of regions to mimic | |
136 regions = [ [] for i in range( len( bounds ) ) ] | |
137 | |
138 for region in bx.intervals.io.NiceReaderWrapper( open( intervals_fname, 'r' ), chrom_col=interval_chr, start_col=interval_start, end_col=interval_end, strand_col=interval_strand, fix_strand=True, return_header=False, return_comments=False ): | |
139 #loop through bounds, find first proper bounds then add | |
140 #if an interval crosses bounds, it will be added to the first bound | |
141 for i in range( len( bounds ) ): | |
142 if bounds[i].chrom != region.chrom: | |
143 continue | |
144 intersecter = bx.intervals.intersection.Intersecter() | |
145 intersecter.add_interval( bounds[i] ) | |
146 if len( intersecter.find( region.start, region.end ) ) > 0: | |
147 regions[i].append( ( region.end - region.start, region.strand ) ) #add region to proper bound and go to next region | |
148 break | |
149 for region in regions: | |
150 region.sort() | |
151 region.reverse() | |
152 | |
153 #read mask file | |
154 mask = [] | |
155 if use_mask != "no_mask": | |
156 for region in bx.intervals.io.NiceReaderWrapper( open( mask_fname, 'r' ), chrom_col=mask_chr, start_col=mask_start, end_col=mask_end, fix_strand=True, return_header=False, return_comments=False ): | |
157 mask.append( region ) | |
158 | |
159 try: | |
160 out_file = open ( out_fname, "w" ) | |
161 except: | |
162 stop_err( "Error opening output file '%s'." % out_fname ) | |
163 | |
164 i = 0 | |
165 i_iters = 0 | |
166 region_count = 0 | |
167 best_regions = [] | |
168 num_fail = 0 | |
169 while i < len( bounds ): | |
170 i_iters += 1 | |
171 #order regions to mimic | |
172 regions_to_mimic = regions[i][0:] | |
173 if len( regions_to_mimic ) < 1: #if no regions to mimic, skip | |
174 i += 1 | |
175 i_iters = 0 | |
176 continue | |
177 #set up region mask | |
178 plus_mask = Region( bounds[i].end - bounds[i].start ) | |
179 for region in mask: | |
180 if region.chrom != bounds[i].chrom: continue | |
181 mask_start = region.start - bounds[i].start | |
182 mask_end = region.end - bounds[i].start | |
183 if mask_start >= 0 and mask_end > 0: | |
184 plus_mask.set_range( mask_start, mask_end ) | |
185 minus_mask = plus_mask.copy() | |
186 random_regions = [] | |
187 num_added = 0 | |
188 for j in range( len( regions[i] ) ): | |
189 random_regions, added, plus_mask, minus_mask = add_random_region( regions_to_mimic[j], bounds[i], random_regions, plus_mask, minus_mask, overlaps ) | |
190 if added: | |
191 num_added += 1 | |
192 if num_added == len( regions_to_mimic ) or i_iters >= max_iters: | |
193 if len( best_regions ) > len( random_regions ): | |
194 random_regions = best_regions.copy() | |
195 num_fail += ( len( regions_to_mimic ) - len( random_regions ) ) | |
196 i_iters = 0 | |
197 best_regions = [] | |
198 for region in random_regions: | |
199 print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s" % ( region.chrom, region.start, region.end, "region_" + str( region_count ), "0", region.strand ) | |
200 region_count += 1 | |
201 else: | |
202 i -= 1 | |
203 if len( best_regions ) < len( random_regions ): | |
204 best_regions = random_regions[:] | |
205 i+=1 | |
206 | |
207 out_file.close() | |
208 if num_fail: | |
209 print "After %i iterations, %i regions could not be added." % (max_iters, num_fail) | |
210 if use_mask == "use_mask": | |
211 print "The mask you have provided may be too restrictive." | |
212 | |
213 class Region( list ): | |
214 """ | |
215 A list for on/off regions | |
216 """ | |
217 def __init__( self, size=0 ): | |
218 for i in range( size ): | |
219 self.append( False ) | |
220 def copy( self ): | |
221 return deepcopy( self ) | |
222 def set_range( self, start=0, end=None ): | |
223 if start < 0: | |
224 start = 0 | |
225 if ( not end and end != 0 ) or end > len( self ): | |
226 end = len( self ) | |
227 for i in range( start, end ): | |
228 self[i]=True | |
229 def count_range( self, start=0, end=None ): | |
230 if start < 0: | |
231 start = 0 | |
232 if ( not end and end != 0 ) or end > len( self ): | |
233 end = len( self ) | |
234 return self[start:end].count( True ) | |
235 def get_gaps( self, min_size = 0 ): | |
236 gaps = [] | |
237 start = end = 0 | |
238 while True: | |
239 try: | |
240 start = self[end:].index( False ) + end | |
241 except: | |
242 break | |
243 try: | |
244 end = self[start:].index( True ) + start | |
245 except: | |
246 end = len( self ) | |
247 if end > start and end - start >= min_size: | |
248 gaps.append( ( end - start, start, end ) ) | |
249 gaps.sort() | |
250 gaps.reverse() | |
251 return gaps | |
252 | |
253 if __name__ == "__main__": main() |