Mercurial > repos > xuebing > sharplabtool
comparison tools/encode/random_intervals_no_bits.py @ 0:9071e359b9a3
Uploaded
| author | xuebing | 
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:9071e359b9a3 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Dan Blankenberg | |
| 3 #%prog bounding_region_file mask_intervals_file intervals_to_mimic_file out_file mask_chr mask_start mask_end interval_chr interval_start interval_end interval_strand use_mask allow_strand_overlaps | |
| 4 import sys, random | |
| 5 from copy import deepcopy | |
| 6 from galaxy import eggs | |
| 7 import pkg_resources | |
| 8 pkg_resources.require( "bx-python" ) | |
| 9 import bx.intervals.io | |
| 10 import bx.intervals.intersection | |
| 11 import psyco_full | |
| 12 | |
| 13 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 14 | |
| 15 max_iters = 5 | |
| 16 | |
| 17 def stop_err( msg ): | |
| 18 sys.stderr.write( msg ) | |
| 19 sys.exit() | |
| 20 | |
| 21 #Try to add a random region | |
| 22 def add_random_region( mimic_region, bound, exist_regions, plus_mask, minus_mask, overlaps ): | |
| 23 region_length, region_strand = mimic_region | |
| 24 plus_count = plus_mask.count_range() | |
| 25 minus_count = minus_mask.count_range() | |
| 26 gaps = [] | |
| 27 | |
| 28 if region_strand == "-": | |
| 29 gaps = minus_mask.get_gaps( region_length ) | |
| 30 else: | |
| 31 gaps = plus_mask.get_gaps( region_length ) | |
| 32 | |
| 33 while True: | |
| 34 try: | |
| 35 gap_length, gap_start, gap_end = gaps.pop( random.randint( 0, len( gaps ) - 1 ) ) | |
| 36 except: | |
| 37 break | |
| 38 try: | |
| 39 start = random.randint( bound.start + gap_start, bound.start + gap_end - region_length - 1 ) | |
| 40 except ValueError, ve: | |
| 41 stop_err( "Exception thrown generating random start value: %s" %str( ve ) ) | |
| 42 | |
| 43 end = start + region_length | |
| 44 try_plus_mask = plus_mask.copy() | |
| 45 try_minus_mask = minus_mask.copy() | |
| 46 | |
| 47 if region_strand == "-": | |
| 48 try_minus_mask.set_range( start - bound.start, end - bound.start ) | |
| 49 else: | |
| 50 try_plus_mask.set_range( start - bound.start, end - bound.start ) | |
| 51 | |
| 52 rand_region = bx.intervals.io.GenomicInterval( None, [bound.chrom, start, end, region_strand], 0, 1, 2, 3, "+", fix_strand=True ) | |
| 53 | |
| 54 if try_plus_mask.count_range() == plus_count + region_length or try_minus_mask.count_range() == minus_count + region_length: | |
| 55 if overlaps in ["strand", "all"]: #overlaps allowed across strands | |
| 56 exist_regions.append( rand_region ) | |
| 57 if overlaps == "strand": | |
| 58 return exist_regions, True, try_plus_mask, try_minus_mask | |
| 59 else: #overlaps allowed everywhere | |
| 60 return exist_regions, True, plus_mask, minus_mask | |
| 61 else: #no overlapping anywhere | |
| 62 exist_regions.append( rand_region ) | |
| 63 if region_strand == "-": | |
| 64 return exist_regions, True, try_minus_mask.copy(), try_minus_mask | |
| 65 else: | |
| 66 return exist_regions, True, try_plus_mask, try_plus_mask.copy() | |
| 67 return exist_regions, False, plus_mask, minus_mask | |
| 68 | |
| 69 def main(): | |
| 70 includes_strand = False | |
| 71 region_uid = sys.argv[1] | |
| 72 mask_fname = sys.argv[2] | |
| 73 intervals_fname = sys.argv[3] | |
| 74 out_fname = sys.argv[4] | |
| 75 try: | |
| 76 mask_chr = int( sys.argv[5] ) - 1 | |
| 77 except: | |
| 78 stop_err( "'%s' is an invalid chrom column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[5] ) ) | |
| 79 try: | |
| 80 mask_start = int( sys.argv[6] ) - 1 | |
| 81 except: | |
| 82 stop_err( "'%s' is an invalid start column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[6] ) ) | |
| 83 try: | |
| 84 mask_end = int( sys.argv[7] ) - 1 | |
| 85 except: | |
| 86 stop_err( "'%s' is an invalid end column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[7] ) ) | |
| 87 try: | |
| 88 interval_chr = int( sys.argv[8] ) - 1 | |
| 89 except: | |
| 90 stop_err( "'%s' is an invalid chrom column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[8] ) ) | |
| 91 try: | |
| 92 interval_start = int( sys.argv[9] ) - 1 | |
| 93 except: | |
| 94 stop_err( "'%s' is an invalid start column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[9] ) ) | |
| 95 try: | |
| 96 interval_end = int( sys.argv[10] ) - 1 | |
| 97 except: | |
| 98 stop_err( "'%s' is an invalid end column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[10] ) ) | |
| 99 try: | |
| 100 interval_strand = int( sys.argv[11] ) - 1 | |
| 101 includes_strand = True | |
| 102 except: | |
| 103 interval_strand = -1 | |
| 104 if includes_strand: | |
| 105 use_mask = sys.argv[12] | |
| 106 overlaps = sys.argv[13] | |
| 107 else: | |
| 108 use_mask = sys.argv[11] | |
| 109 overlaps = sys.argv[12] | |
| 110 available_regions = {} | |
| 111 loc_file = "%s/regions.loc" % sys.argv[-1] | |
| 112 | |
| 113 for i, line in enumerate( file( loc_file ) ): | |
| 114 line = line.rstrip( '\r\n' ) | |
| 115 if line and not line.startswith( '#' ): | |
| 116 fields = line.split( '\t' ) | |
| 117 #read each line, if not enough fields, go to next line | |
| 118 try: | |
| 119 build = fields[0] | |
| 120 uid = fields[1] | |
| 121 description = fields[2] | |
| 122 filepath = fields[3] | |
| 123 available_regions[uid] = filepath | |
| 124 except: | |
| 125 continue | |
| 126 | |
| 127 if region_uid not in available_regions: | |
| 128 stop_err( "Region '%s' is invalid." % region_uid ) | |
| 129 region_fname = available_regions[region_uid].strip() | |
| 130 | |
| 131 #set up bounding regions to hold random intervals | |
| 132 bounds = [] | |
| 133 for bound in bx.intervals.io.NiceReaderWrapper( open( region_fname, 'r' ), chrom_col=0, start_col=1, end_col=2, fix_strand=True, return_header=False, return_comments=False ): | |
| 134 bounds.append( bound ) | |
| 135 #set up length and number of regions to mimic | |
| 136 regions = [ [] for i in range( len( bounds ) ) ] | |
| 137 | |
| 138 for region in bx.intervals.io.NiceReaderWrapper( open( intervals_fname, 'r' ), chrom_col=interval_chr, start_col=interval_start, end_col=interval_end, strand_col=interval_strand, fix_strand=True, return_header=False, return_comments=False ): | |
| 139 #loop through bounds, find first proper bounds then add | |
| 140 #if an interval crosses bounds, it will be added to the first bound | |
| 141 for i in range( len( bounds ) ): | |
| 142 if bounds[i].chrom != region.chrom: | |
| 143 continue | |
| 144 intersecter = bx.intervals.intersection.Intersecter() | |
| 145 intersecter.add_interval( bounds[i] ) | |
| 146 if len( intersecter.find( region.start, region.end ) ) > 0: | |
| 147 regions[i].append( ( region.end - region.start, region.strand ) ) #add region to proper bound and go to next region | |
| 148 break | |
| 149 for region in regions: | |
| 150 region.sort() | |
| 151 region.reverse() | |
| 152 | |
| 153 #read mask file | |
| 154 mask = [] | |
| 155 if use_mask != "no_mask": | |
| 156 for region in bx.intervals.io.NiceReaderWrapper( open( mask_fname, 'r' ), chrom_col=mask_chr, start_col=mask_start, end_col=mask_end, fix_strand=True, return_header=False, return_comments=False ): | |
| 157 mask.append( region ) | |
| 158 | |
| 159 try: | |
| 160 out_file = open ( out_fname, "w" ) | |
| 161 except: | |
| 162 stop_err( "Error opening output file '%s'." % out_fname ) | |
| 163 | |
| 164 i = 0 | |
| 165 i_iters = 0 | |
| 166 region_count = 0 | |
| 167 best_regions = [] | |
| 168 num_fail = 0 | |
| 169 while i < len( bounds ): | |
| 170 i_iters += 1 | |
| 171 #order regions to mimic | |
| 172 regions_to_mimic = regions[i][0:] | |
| 173 if len( regions_to_mimic ) < 1: #if no regions to mimic, skip | |
| 174 i += 1 | |
| 175 i_iters = 0 | |
| 176 continue | |
| 177 #set up region mask | |
| 178 plus_mask = Region( bounds[i].end - bounds[i].start ) | |
| 179 for region in mask: | |
| 180 if region.chrom != bounds[i].chrom: continue | |
| 181 mask_start = region.start - bounds[i].start | |
| 182 mask_end = region.end - bounds[i].start | |
| 183 if mask_start >= 0 and mask_end > 0: | |
| 184 plus_mask.set_range( mask_start, mask_end ) | |
| 185 minus_mask = plus_mask.copy() | |
| 186 random_regions = [] | |
| 187 num_added = 0 | |
| 188 for j in range( len( regions[i] ) ): | |
| 189 random_regions, added, plus_mask, minus_mask = add_random_region( regions_to_mimic[j], bounds[i], random_regions, plus_mask, minus_mask, overlaps ) | |
| 190 if added: | |
| 191 num_added += 1 | |
| 192 if num_added == len( regions_to_mimic ) or i_iters >= max_iters: | |
| 193 if len( best_regions ) > len( random_regions ): | |
| 194 random_regions = best_regions.copy() | |
| 195 num_fail += ( len( regions_to_mimic ) - len( random_regions ) ) | |
| 196 i_iters = 0 | |
| 197 best_regions = [] | |
| 198 for region in random_regions: | |
| 199 print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s" % ( region.chrom, region.start, region.end, "region_" + str( region_count ), "0", region.strand ) | |
| 200 region_count += 1 | |
| 201 else: | |
| 202 i -= 1 | |
| 203 if len( best_regions ) < len( random_regions ): | |
| 204 best_regions = random_regions[:] | |
| 205 i+=1 | |
| 206 | |
| 207 out_file.close() | |
| 208 if num_fail: | |
| 209 print "After %i iterations, %i regions could not be added." % (max_iters, num_fail) | |
| 210 if use_mask == "use_mask": | |
| 211 print "The mask you have provided may be too restrictive." | |
| 212 | |
| 213 class Region( list ): | |
| 214 """ | |
| 215 A list for on/off regions | |
| 216 """ | |
| 217 def __init__( self, size=0 ): | |
| 218 for i in range( size ): | |
| 219 self.append( False ) | |
| 220 def copy( self ): | |
| 221 return deepcopy( self ) | |
| 222 def set_range( self, start=0, end=None ): | |
| 223 if start < 0: | |
| 224 start = 0 | |
| 225 if ( not end and end != 0 ) or end > len( self ): | |
| 226 end = len( self ) | |
| 227 for i in range( start, end ): | |
| 228 self[i]=True | |
| 229 def count_range( self, start=0, end=None ): | |
| 230 if start < 0: | |
| 231 start = 0 | |
| 232 if ( not end and end != 0 ) or end > len( self ): | |
| 233 end = len( self ) | |
| 234 return self[start:end].count( True ) | |
| 235 def get_gaps( self, min_size = 0 ): | |
| 236 gaps = [] | |
| 237 start = end = 0 | |
| 238 while True: | |
| 239 try: | |
| 240 start = self[end:].index( False ) + end | |
| 241 except: | |
| 242 break | |
| 243 try: | |
| 244 end = self[start:].index( True ) + start | |
| 245 except: | |
| 246 end = len( self ) | |
| 247 if end > start and end - start >= min_size: | |
| 248 gaps.append( ( end - start, start, end ) ) | |
| 249 gaps.sort() | |
| 250 gaps.reverse() | |
| 251 return gaps | |
| 252 | |
| 253 if __name__ == "__main__": main() | 
