Mercurial > repos > gianmarco_piccinno > project_rm
comparison CodonSwitchTool/sre_yield.py @ 41:bd35b13fabfb draft
Uploaded
| author | gianmarco_piccinno |
|---|---|
| date | Mon, 20 May 2019 16:33:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 40:a83452cb14ed | 41:bd35b13fabfb |
|---|---|
| 1 #!/usr/bin/env python2 | |
| 2 # | |
| 3 # Copyright 2011-2016 Google Inc. | |
| 4 # | |
| 5 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 6 # you may not use this file except in compliance with the License. | |
| 7 # You may obtain a copy of the License at | |
| 8 # | |
| 9 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 # | |
| 11 # Unless required by applicable law or agreed to in writing, software | |
| 12 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 # See the License for the specific language governing permissions and | |
| 15 # limitations under the License. | |
| 16 # | |
| 17 # vim: sw=2 sts=2 et | |
| 18 | |
| 19 """This module can generate all strings that match a regular expression. | |
| 20 | |
| 21 The regex is parsed using the SRE module that is standard in python, | |
| 22 then the data structure is executed to form a bunch of iterators. | |
| 23 """ | |
| 24 | |
| 25 __author__ = 'alexperry@google.com (Alex Perry)' | |
| 26 __all__ = ['Values', 'AllStrings', 'AllMatches', 'ParseError'] | |
| 27 | |
| 28 | |
| 29 import bisect | |
| 30 import math | |
| 31 import re | |
| 32 import sre_constants | |
| 33 import sre_parse | |
| 34 import string | |
| 35 import sys | |
| 36 import types | |
| 37 | |
| 38 import cachingseq | |
| 39 import fastdivmod | |
| 40 | |
# Python 3 has no xrange builtin; fall back to range, which is already lazy.
try:
    xrange = xrange
except NameError:
    xrange = range

# Regex metacharacters that may appear escaped in a pattern.
_RE_METACHARS = r'$^{}*+\\'
_ESCAPED_METACHAR = r'\\[' + _RE_METACHARS + r']'
ESCAPED_METACHAR_RE = re.compile(_ESCAPED_METACHAR)

# ASCII by default, see https://github.com/google/sre_yield/issues/3
CHARSET = list(map(chr, range(256)))

# Characters matched by the \w category.
WORD = ''.join((string.ascii_letters, string.digits, '_'))

# re.ASCII only exists on Python 3; older interpreters get no extra flag.
DEFAULT_RE_FLAGS = getattr(re, 'ASCII', 0)

# Anchor-tracking states used while walking a parsed pattern.
STATE_START, STATE_MIDDLE, STATE_END = range(3)
| 60 | |
def Not(chars):
    """Return every CHARSET character absent from ``chars``, as a sorted string."""
    excluded = set(chars)
    remaining = set(CHARSET).difference(excluded)
    return ''.join(sorted(remaining))
| 63 | |
| 64 | |
# Maps each SRE category opcode to the characters belonging to that category.
# The negated categories are the complement within CHARSET.
CATEGORIES = dict([
    (sre_constants.CATEGORY_WORD, WORD),
    (sre_constants.CATEGORY_NOT_WORD, Not(WORD)),
    (sre_constants.CATEGORY_DIGIT, string.digits),
    (sre_constants.CATEGORY_NOT_DIGIT, Not(string.digits)),
    (sre_constants.CATEGORY_SPACE, string.whitespace),
    (sre_constants.CATEGORY_NOT_SPACE, Not(string.whitespace)),
])

# The interpreter's repeat limit differs between Python builds; this is the
# smaller of the values, used as the default cap for open-ended repeats.
MAX_REPEAT_COUNT = 65535
| 76 | |
| 77 | |
class ParseError(Exception):
    """Raised when a pattern uses a construct this module cannot expand."""
| 80 | |
| 81 | |
def slice_indices(slice_obj, size):
    """Resolve a slice against a sequence of ``size`` items.

    Works like ``slice_obj.indices(size)`` but also accepts sizes and indices
    too large for a machine-sized int.

    Args:
      slice_obj: the slice to resolve; any of start/stop/step may be None.
      size: length of the sequence being sliced.

    Returns:
      A ``(start, stop, step)`` tuple with every None replaced by a concrete
      value.

    Raises:
      ValueError: if the step is zero (same as the builtin slice behaviour;
        previously the zero leaked through and caused a ZeroDivisionError in
        callers that divide by the step).
      IndexError: from _adjust_index, when an explicit start or stop is still
        negative after being offset by ``size``.
    """
    step = slice_obj.step
    if step is None:
        step = 1
    if step == 0:
        raise ValueError('slice step cannot be zero')

    # Defaults depend on direction: a forward slice covers [0, size), a
    # backward slice runs from the last item down to (but excluding) -1.
    # Explicit values are only normalised, never replaced.
    start = slice_obj.start
    if start is None:
        start = 0 if step > 0 else size - 1
    else:
        start = _adjust_index(start, size)

    stop = slice_obj.stop
    if stop is None:
        stop = size if step > 0 else -1
    else:
        stop = _adjust_index(stop, size)

    return (start, stop, step)
| 110 | |
| 111 | |
| 112 def _adjust_index(n, size): | |
| 113 if n < 0: | |
| 114 n += size | |
| 115 | |
| 116 if n < 0: | |
| 117 raise IndexError("Out of range") | |
| 118 if n > size: | |
| 119 n = size | |
| 120 return n | |
| 121 | |
| 122 | |
def _xrange(*args):
    """Return a range over ``args``, tolerating values too big for xrange.

    The native xrange is used whenever it accepts the arguments; only when it
    overflows do we fall back to the pure-Python _bigrange generator.
    """
    try:
        native = xrange(*args)
    except OverflowError:
        return _bigrange(*args)
    return native
| 130 | |
| 131 | |
| 132 def _bigrange(*args): | |
| 133 if len(args) == 1: | |
| 134 start = 0; stop = args[0]; step = 1 | |
| 135 elif len(args) == 2: | |
| 136 start, stop = args | |
| 137 step = 1 | |
| 138 elif len(args) == 3: | |
| 139 start, stop, step = args | |
| 140 else: | |
| 141 raise ValueError("Too many args for _bigrange") | |
| 142 | |
| 143 i = start | |
| 144 while True: | |
| 145 yield i | |
| 146 i += step | |
| 147 if step < 0 and i <= stop: | |
| 148 break | |
| 149 if step > 0 and i >= stop: | |
| 150 break | |
| 151 | |
| 152 | |
class WrappedSequence(object):
    """Read-only, list-like view over another sequence.

    Serves purely as the base class for the other sequence types in this
    module, which override ``get_item`` (and usually the constructor).
    """

    def __init__(self, raw):
        # Subclasses usually replace this constructor entirely.
        self.raw = raw
        # Call __len__ directly rather than len(): the builtin insists on
        # converting the result from a long to an ordinary int.
        self.length = raw.__len__()

    def __len__(self):
        return self.length

    def __iter__(self):
        for position in _xrange(int(self.length)):
            yield self.get_item(position)

    def __getitem__(self, i):
        if not isinstance(i, slice):
            # Plain index: normalise it, then defer to get_item, which is
            # the method subclasses provide.
            return self.get_item(_adjust_index(i, self.length))
        # Slices are answered lazily through a wrapper ...
        view = SlicedSequence(self, slicer=i)
        if view.__len__() < 16:
            # ... except short ones, which are unpacked into a real list.
            return [entry for entry in view]
        return view

    def get_item(self, i, d=None):
        """Return item ``i``; ``d`` is passed on to a wrapped get_item, if any."""
        index = _adjust_index(i, self.length)
        source = self.raw
        if hasattr(source, 'get_item'):
            return source.get_item(index, d)
        return source[index]
| 187 | |
| 188 | |
| 189 def _sign(x): | |
| 190 if x > 0: | |
| 191 return 1 | |
| 192 else: | |
| 193 return -1 | |
| 194 | |
| 195 | |
class SlicedSequence(WrappedSequence):
    """This is part of an immutable and potentially arbitrarily long list.

    Presents ``raw[start:stop:step]`` lazily: items are fetched from ``raw``
    on demand rather than copied.
    """

    def __init__(self, raw, slicer=None):
        """Wrap ``raw``, restricted to ``slicer`` (the whole of raw if None)."""
        # Derived classes will likely override this constructor
        self.raw = raw
        raw_length = raw.__len__()
        if slicer is None:
            self.start, self.stop, self.step = 0, raw_length, 1
        else:
            self.start, self.stop, self.step = slice_indices(slicer, raw_length)

        # Integer round up, depending on step direction. Floor division keeps
        # the result an exact integer on Python 3 as well (true division
        # would yield a float and lose precision for very long sequences).
        span = self.stop - self.start
        count = (span + self.step - _sign(self.step)) // self.step
        # A slice whose bounds run against the step direction is empty, not
        # negatively long.
        self.length = max(0, count)

    def get_item(self, i, d=None):
        """Return the ``i``-th item of the slice; ``d`` is unused here."""
        j = i * self.step + self.start
        return self.raw[j]
| 214 | |
| 215 | |
class ConcatenatedSequence(WrappedSequence):
    """This is equivalent to using extend() but without unpacking the lists."""

    def __init__(self, *alternatives):
        # Each entry pairs an alternative with its length, computed once via
        # __len__ so that lengths beyond a machine int are supported.
        self.list_lengths = [(a, a.__len__()) for a in alternatives]
        self.length = sum(a_len for _, a_len in self.list_lengths)

    def get_item(self, i, d=None):
        """Return item ``i`` of the concatenation.

        Args:
          i: index into the concatenated sequence; negative values count
            from the end.
          d: optional capture dict. It is forwarded to the selected
            alternative when that alternative has a ``get_item`` method, so
            capture groups nested inside it are recorded (the same
            convention CombinatoricsSequence uses for its components).

        Raises:
          IndexError: if ``i`` is outside the sequence.
        """
        if i < 0:
            i += self.length
        if i < 0:
            raise IndexError('Index out of range')

        # Walk the alternatives, discounting each one's length until the
        # index falls inside the current alternative.
        for a, a_len in self.list_lengths:
            if i < a_len:
                if hasattr(a, 'get_item'):
                    return a.get_item(i, d)
                return a[i]
            i -= a_len
        raise IndexError('Too Big')

    def __contains__(self, item):
        return any(item in a for a, _ in self.list_lengths)

    def __repr__(self):
        return '{concat ' + repr(self.list_lengths) + '}'
| 238 | |
| 239 | |
class CombinatoricsSequence(WrappedSequence):
    """This uses all combinations of one item from each passed list."""

    def __init__(self, *components):
        self.list_lengths = [(a, a.__len__()) for a in components]
        # Total number of combinations is the product of component lengths.
        self.length = 1
        for _, c_len in self.list_lengths:
            self.length *= c_len

    def get_item(self, i, d=None):
        """Return the ``i``-th combination, joined into one string.

        The index is decomposed with successive divmod calls, so the first
        component varies fastest.

        Args:
          i: index of the combination; negative values count from the end.
          d: optional capture dict, forwarded to every component that has a
            ``get_item`` method.

        Raises:
          IndexError: if ``i`` is outside the sequence.
        """
        if i < 0:
            i += self.length
        if i < 0 or i >= self.length:
            raise IndexError("Index %d out of bounds" % (i,))

        if len(self.list_lengths) == 1:
            # skip unnecessary ''.join -- big speedup
            only = self.list_lengths[0][0]
            # When captures are being collected the dict must still reach the
            # component; plain indexing would silently drop it, leaving a
            # lone capture group unrecorded.
            if d is not None and hasattr(only, 'get_item'):
                return only.get_item(i, d)
            return only[i]

        result = []
        for c, c_len in self.list_lengths:
            i, mod = divmod(i, c_len)
            if hasattr(c, 'get_item'):
                result.append(c.get_item(mod, d))
            else:
                result.append(c[mod])
        return ''.join(result)

    def __repr__(self):
        return '{combin ' + repr(self.list_lengths) + '}'
| 270 | |
| 271 | |
class RepetitiveSequence(WrappedSequence):
    """This chooses an entry from a list, many times, and concatenates."""

    def __init__(self, content, lowest=1, highest=1):
        """Build the sequence of ``content`` repeated lowest..highest times.

        Args:
          content: sequence of strings to pick from on every repetition.
          lowest: minimum repeat count.
          highest: maximum repeat count.
        """
        self.content = content
        self.content_length = content.__len__()
        self.length = fastdivmod.powersum(self.content_length, lowest, highest)
        self.lowest = lowest
        self.highest = highest

        # offsets[k] is a pair (first index that uses lowest+k repeats,
        # lowest+k). It can be computed directly or, more cheaply, from the
        # preceding entry.
        def arbitrary_entry(i):
            return (fastdivmod.powersum(self.content_length, lowest, i+lowest-1), i+lowest)

        def entry_from_prev(i, prev):
            return (prev[0] + (self.content_length ** prev[1]), prev[1] + 1)

        self.offsets = cachingseq.CachingFuncSequence(
            arbitrary_entry, highest - lowest+1, entry_from_prev)

        # The split point below needs to be a constant in order to reuse
        # calculations in future calls to bisect (a moving target will produce
        # more misses). These defaults apply when no offset exceeds
        # sys.maxsize.
        self.index_of_offset = len(self.offsets)
        self.offset_break = sys.maxsize
        if self.offsets[-1][0] > sys.maxsize:
            # Find the last entry whose offset still fits in sys.maxsize.
            # The scan must be allowed to look at the final entry: with the
            # former bound (i + 2 < len) a sequence in which only the last
            # offset overflowed left both attributes unset, and get_item then
            # failed with AttributeError.
            i = 0
            while i + 1 < len(self.offsets):
                if self.offsets[i+1][0] > sys.maxsize:
                    self.index_of_offset = i
                    self.offset_break = self.offsets[i][0]
                    break
                i += 1

    def get_item(self, i, d=None):
        """Finds out how many repeats this index implies, then picks strings.

        ``d`` is accepted for interface consistency and is not used.
        """
        # Search only the half of the offsets table that can contain i.
        if i < self.offset_break:
            by_bisect = bisect.bisect_left(self.offsets, (i, -1), hi=self.index_of_offset)
        else:
            by_bisect = bisect.bisect_left(self.offsets, (i, -1), lo=self.index_of_offset)

        # bisect_left lands on the first entry >= (i, -1); step back when that
        # entry starts after i (or when we ran off the end).
        if by_bisect == len(self.offsets) or self.offsets[by_bisect][0] > i:
            by_bisect -= 1

        num = i - self.offsets[by_bisect][0]
        count = self.offsets[by_bisect][1]

        if count == 0:
            return ''

        # For many repeats over a small alphabet, index into a plain list
        # copy (presumably cheaper than indexing the wrapped sequence).
        if count > 100 and self.content_length < 1000:
            content = list(self.content)
        else:
            content = self.content

        # NOTE(review): divmod_iter presumably yields the digits of num in
        # base content_length, least significant first - confirm against
        # fastdivmod.
        result = []
        for modulus in fastdivmod.divmod_iter(num, self.content_length):
            result.append(content[modulus])

        # Pad the high-order positions with the first content entry so the
        # result always has exactly `count` parts.
        leftover = count - len(result)
        if leftover:
            assert leftover > 0
            result.extend([content[0]] * leftover)

        # smallest place value ends up on the right
        return ''.join(result[::-1])

    def __repr__(self):
        return '{repeat base=%d low=%d high=%d}' % (self.content_length, self.lowest, self.highest)
| 340 | |
| 341 | |
class SaveCaptureGroup(WrappedSequence):
    """Wraps a sequence and records each produced value under a group key."""

    def __init__(self, parsed, key):
        super(SaveCaptureGroup, self).__init__(parsed)
        self.key = key

    def get_item(self, n, d=None):
        """Return item ``n``; when a dict ``d`` is supplied, store it as d[key]."""
        value = super(SaveCaptureGroup, self).get_item(n, d)
        if d is None:
            return value
        d[self.key] = value
        return value
| 352 | |
| 353 | |
class ReadCaptureGroup(WrappedSequence):
    """One-item sequence whose value is a previously captured group."""

    def __init__(self, n):
        self.length = 1
        self.num = n

    def get_item(self, i, d=None):
        """Return the value stored for group ``num`` in ``d``.

        Only index 0 is valid, and a dict must be supplied.
        """
        if i != 0:
            raise IndexError(i)
        if d is None:
            raise ValueError('ReadCaptureGroup with no dict')
        captured = d.get(self.num, "fail")
        return captured
| 365 | |
| 366 | |
class RegexMembershipSequence(WrappedSequence):
    """Creates a sequence from the regex, knows how to test membership."""

    def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None):
        # If the RE module cannot compile it, we give up quickly
        self.matcher = re.compile(r'(?:%s)\Z' % pattern, flags)
        # Without DOTALL, newline is excluded from the usable characters.
        if not flags & re.DOTALL:
            charset = ''.join([ch for ch in charset if ch != '\n'])
        self.charset = charset

        self.named_group_lookup = self.matcher.groupindex

        flags |= DEFAULT_RE_FLAGS  # https://github.com/google/sre_yield/issues/3
        # Checked in this order; the first unsupported flag found is reported.
        unsupported = (
            (re.IGNORECASE,
             'Flag "i" not supported. https://github.com/google/sre_yield/issues/4'),
            (re.UNICODE,
             'Flag "u" not supported. https://github.com/google/sre_yield/issues/3'),
            (re.LOCALE,
             'Flag "l" not supported. https://github.com/google/sre_yield/issues/5'),
        )
        for flag, message in unsupported:
            if flags & flag:
                raise ParseError(message)

        self.max_count = MAX_REPEAT_COUNT if max_count is None else max_count
        self.has_groupref = False

        # Configure the parser backends: one handler per SRE opcode.
        self.backends = {
            sre_constants.LITERAL: lambda y: [chr(y)],
            sre_constants.RANGE: lambda l, h: [chr(c) for c in range(l, h+1)],
            sre_constants.SUBPATTERN: self.maybe_save,
            sre_constants.BRANCH: self.branch_values,
            sre_constants.MIN_REPEAT: self.max_repeat_values,
            sre_constants.MAX_REPEAT: self.max_repeat_values,
            sre_constants.AT: self.nothing_added,
            sre_constants.ASSERT: self.empty_list,
            sre_constants.ASSERT_NOT: self.empty_list,
            sre_constants.ANY:
                lambda _: self.in_values(((sre_constants.NEGATE,),)),
            sre_constants.IN: self.in_values,
            sre_constants.NOT_LITERAL: self.not_literal,
            sre_constants.CATEGORY: self.category,
            sre_constants.GROUPREF: self.groupref,
        }
        self.state = STATE_START
        # Now build a generator that knows all possible patterns
        self.raw = self.sub_values(sre_parse.parse(pattern, flags))
        # Configure this class instance to know about that result
        self.length = self.raw.__len__()

    def __contains__(self, item):
        # Since we have a regex, we can search the list really cheaply
        found = self.matcher.match(item)
        return found is not None

    def get_item(self, i, d=None):
        """Return the ``i``-th matching string.

        Typically only ``i`` is passed; ``d`` is an internal detail kept for
        consistency with the other classes. Callers interested in capture
        groups should use RegexMembershipSequenceMatches, which returns a
        Match object instead of a string.
        """
        parent = super(RegexMembershipSequence, self)
        if d is None and not self.has_groupref:
            return parent.get_item(i)
        # Back-references need somewhere to read captures from.
        if d is None:
            d = {}
        return parent.get_item(i, d)

    # -- backends -----------------------------------------------------------

    def empty_list(self, *_):
        return []

    def nothing_added(self, *_):
        return ['']

    def branch_values(self, _, items):
        """Converts SRE parser data into literals and merges those lists."""
        alternatives = [self.sub_values(parsed) for parsed in items]
        return ConcatenatedSequence(*alternatives)

    def max_repeat_values(self, min_count, max_count, items):
        """Sequential expansion of the count to be combinatorics."""
        capped = min(max_count, self.max_count)
        return RepetitiveSequence(self.sub_values(items), min_count, capped)

    def in_values(self, items):
        """Expand a character class, honouring a leading NEGATE marker."""
        # Special case which distinguishes branch from charset operator
        if not (items and items[0][0] == sre_constants.NEGATE):
            return self.branch_values(None, items)
        excluded = self.branch_values(None, items[1:])
        return [ch for ch in self.charset if ch not in excluded]

    def not_literal(self, y):
        negated = (
            (sre_constants.NEGATE,),
            (sre_constants.LITERAL, y),
        )
        return self.in_values(negated)

    def category(self, y):
        return CATEGORIES[y]

    def groupref(self, n):
        self.has_groupref = True
        return ReadCaptureGroup(n)

    def maybe_save(self, *args):
        """Expand a subpattern, recording its value when it is a real group."""
        # Python 3.6 has group, add_flags, del_flags, parsed
        # while earlier versions just have group, parsed
        group, parsed = args[0], args[-1]
        values = self.sub_values(parsed)
        if group is None:
            return values
        return SaveCaptureGroup(values, group)

    # -- parse-tree walking ---------------------------------------------------

    def sub_values(self, parsed):
        """This knows how to convert one piece of parsed pattern."""
        # If this is a subpattern object, we just want its data
        if isinstance(parsed, sre_parse.SubPattern):
            parsed = parsed.data
        # A list indicates sequential elements of a string
        if isinstance(parsed, list):
            return CombinatoricsSequence(*[self.sub_values(p) for p in parsed])
        # If not a list, a tuple represents a specific match type
        if isinstance(parsed, tuple) and parsed:
            matcher, arguments = parsed
            if not isinstance(arguments, tuple):
                arguments = (arguments,)
            if matcher in self.backends:
                self.check_anchor_state(matcher, arguments)
                handler = self.backends[matcher]
                return handler(*arguments)
        # No idea what to do here
        raise ParseError(repr(parsed))

    def check_anchor_state(self, matcher, arguments):
        """Track position within the pattern and reject misplaced anchors.

        A bit of a hack to support zero-width leading anchors. The goal is
        that /^(a|b)$/ will match properly, and that /a^b/ or /a\\bb/ throws
        an error. (/$^/, which matches the empty string, is not handled; the
        common case was preferred.)

        There are three states:
          STATE_START:  ^ and \\b leave the state unchanged, an end anchor
                        jumps to END, anything other than an anchor or a
                        subpattern advances to MIDDLE.
          STATE_MIDDLE: an end anchor or \\b advances to END; any other
                        anchor is an error.
          STATE_END:    only end anchors, \\b and subpatterns are allowed.
        """
        is_anchor = matcher == sre_constants.AT
        is_group = matcher == sre_constants.SUBPATTERN
        end_anchors = (sre_constants.AT_END, sre_constants.AT_END_STRING)
        end_or_boundary = end_anchors + (sre_constants.AT_BOUNDARY,)

        if self.state == STATE_START:
            if is_anchor:
                if arguments[0] in end_anchors:
                    self.state = STATE_END
                elif arguments[0] == sre_constants.AT_NON_BOUNDARY:
                    # This is nonsensical at beginning of string
                    raise ParseError('Anchor %r found at START state' % (arguments[0],))
                # All others (AT_BEGINNING, AT_BEGINNING_STRING, and
                # AT_BOUNDARY) remain in START.
            elif not is_group:
                self.state = STATE_MIDDLE
            # subpattern remains in START
        elif self.state == STATE_END:
            if is_anchor:
                if arguments[0] not in end_or_boundary:
                    raise ParseError('Anchor %r found at END state' % (arguments[0],))
                # those three remain in END
            elif not is_group:
                raise ParseError('Non-end-anchor %r found at END state' % (arguments[0],))
            # subpattern remains in END
        else:  # self.state == STATE_MIDDLE
            if is_anchor:
                if arguments[0] not in end_or_boundary:
                    raise ParseError('Anchor %r found at MIDDLE state' % (arguments[0],))
                # All others (AT_END, AT_END_STRING, AT_BOUNDARY) advance to END.
                self.state = STATE_END
| 545 | |
| 546 | |
class RegexMembershipSequenceMatches(RegexMembershipSequence):
    """Variant whose items are Match objects rather than plain strings."""

    def __getitem__(self, i):
        if not isinstance(i, slice):
            # Collect the capture groups produced while generating the string.
            captures = {}
            text = super(RegexMembershipSequenceMatches, self).get_item(i, captures)
            return Match(text, captures, self.named_group_lookup)

        view = SlicedSequence(self, slicer=i)
        if view.__len__() < 16:
            # Short lists are unpacked
            return [entry for entry in view]
        return view
| 559 | |
| 560 | |
def AllStrings(regex, flags=0, charset=CHARSET, max_count=None):
    """Return a sequence of every string that ``regex`` matches."""
    sequence = RegexMembershipSequence(
        regex, flags, charset, max_count=max_count)
    return sequence

# Alias for AllStrings.
Values = AllStrings
| 566 | |
| 567 | |
class Match(object):
    """Minimal stand-in for an ``re`` match object over a generated string.

    Args:
      string: the full generated string (group 0).
      groups: dict mapping group number to the captured text. Groups that
        did not take part in producing the string may be absent.
      named_groups: mapping from group name to group number.

    Limitation: the total number of groups in the pattern is not known here,
    so groups() cannot report trailing groups that captured nothing.
    """

    def __init__(self, string, groups, named_groups):
        # TODO keep group(0) only, and spans for the rest.
        self._string = string
        self._groups = groups
        self._named_groups = named_groups
        # Kept as-is for backward compatibility: one more than the number of
        # recorded groups (this is not the same meaning as re's lastindex).
        self.lastindex = len(groups) + 1

    def group(self, n=0):
        """Return group ``n`` (a number or a name); 0 is the whole string.

        A named group that exists in the pattern but captured nothing
        yields None. An unrecorded numbered group raises KeyError.
        """
        if n == 0:
            return self._string
        if not isinstance(n, int):
            return self._groups.get(self._named_groups[n])
        return self._groups[n]

    def groups(self):
        """Return the captures for groups 1..highest recorded, as a tuple.

        Groups in that range that captured nothing are reported as None
        instead of raising KeyError.
        """
        if not self._groups:
            return ()
        highest = max(self._groups)
        return tuple(self._groups.get(i) for i in range(1, highest + 1))

    def groupdict(self):
        """Return {name: captured text}, with None for groups not captured."""
        return dict(
            (name, self._groups.get(number))
            for name, number in self._named_groups.items())

    def span(self, n=0):
        raise NotImplementedError()
| 594 | |
| 595 | |
def AllMatches(regex, flags=0, charset=CHARSET, max_count=None):
    """Return a sequence that yields a Match object for every matching string."""
    sequence = RegexMembershipSequenceMatches(
        regex, flags, charset, max_count=max_count)
    return sequence
| 599 | |
| 600 | |
def main(argv=None):
    """Print every string matched by each pattern given on the command line.

    ``argv`` defaults to sys.argv; its first element (the program name) is
    skipped.
    """
    args = sys.argv if argv is None else argv
    for pattern in args[1:]:
        for generated in AllStrings(pattern):
            print(generated)
| 608 | |
| 609 | |
| 610 if __name__ == '__main__': | |
| 611 main() |
