| 41 | 1 #!/usr/bin/env python2 | 
|  | 2 # | 
|  | 3 # Copyright 2011-2016 Google Inc. | 
|  | 4 # | 
|  | 5 # Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 6 # you may not use this file except in compliance with the License. | 
|  | 7 # You may obtain a copy of the License at | 
|  | 8 # | 
|  | 9 #      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 10 # | 
|  | 11 # Unless required by applicable law or agreed to in writing, software | 
|  | 12 # distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 14 # See the License for the specific language governing permissions and | 
|  | 15 # limitations under the License. | 
|  | 16 # | 
|  | 17 # vim: sw=2 sts=2 et | 
|  | 18 | 
|  | 19 """This module can generate all strings that match a regular expression. | 
|  | 20 | 
|  | 21 The regex is parsed using the SRE module that is standard in python, | 
|  | 22 then the data structure is executed to form a bunch of iterators. | 
|  | 23 """ | 
|  | 24 | 
|  | 25 __author__ = 'alexperry@google.com (Alex Perry)' | 
|  | 26 __all__ = ['Values', 'AllStrings', 'AllMatches', 'ParseError'] | 
|  | 27 | 
|  | 28 | 
|  | 29 import bisect | 
|  | 30 import math | 
|  | 31 import re | 
|  | 32 import sre_constants | 
|  | 33 import sre_parse | 
|  | 34 import string | 
|  | 35 import sys | 
|  | 36 import types | 
|  | 37 | 
|  | 38 import cachingseq | 
|  | 39 import fastdivmod | 
|  | 40 | 
|  | 41 try: | 
|  | 42     xrange = xrange | 
|  | 43 except NameError: | 
|  | 44     xrange = range | 
|  | 45 | 
|  | 46 _RE_METACHARS = r'$^{}*+\\' | 
|  | 47 _ESCAPED_METACHAR = r'\\[' + _RE_METACHARS + r']' | 
|  | 48 ESCAPED_METACHAR_RE = re.compile(_ESCAPED_METACHAR) | 
|  | 49 # ASCII by default, see https://github.com/google/sre_yield/issues/3 | 
|  | 50 CHARSET = [chr(c) for c in range(256)] | 
|  | 51 | 
|  | 52 WORD = string.ascii_letters + string.digits + '_' | 
|  | 53 | 
|  | 54 try: | 
|  | 55     DEFAULT_RE_FLAGS = re.ASCII | 
|  | 56 except AttributeError: | 
|  | 57     DEFAULT_RE_FLAGS = 0 | 
|  | 58 | 
|  | 59 STATE_START, STATE_MIDDLE, STATE_END = list(range(3)) | 
|  | 60 | 
|  | 61 def Not(chars): | 
|  | 62     return ''.join(sorted(set(CHARSET) - set(chars))) | 
|  | 63 | 
|  | 64 | 
|  | 65 CATEGORIES = { | 
|  | 66     sre_constants.CATEGORY_WORD: WORD, | 
|  | 67     sre_constants.CATEGORY_NOT_WORD: Not(WORD), | 
|  | 68     sre_constants.CATEGORY_DIGIT: string.digits, | 
|  | 69     sre_constants.CATEGORY_NOT_DIGIT: Not(string.digits), | 
|  | 70     sre_constants.CATEGORY_SPACE: string.whitespace, | 
|  | 71     sre_constants.CATEGORY_NOT_SPACE: Not(string.whitespace), | 
|  | 72 } | 
|  | 73 | 
|  | 74 # This constant varies between builds of Python; this is the lower value. | 
|  | 75 MAX_REPEAT_COUNT = 65535 | 
|  | 76 | 
|  | 77 | 
|  | 78 class ParseError(Exception): | 
|  | 79     pass | 
|  | 80 | 
|  | 81 | 
|  | 82 def slice_indices(slice_obj, size): | 
|  | 83     """slice_obj.indices() except this one supports longs.""" | 
|  | 84     # start stop step | 
|  | 85     start = slice_obj.start | 
|  | 86     stop = slice_obj.stop | 
|  | 87     step = slice_obj.step | 
|  | 88 | 
|  | 89     # We don't always update a value for negative indices (if we wrote it here | 
|  | 90     # due to None). | 
|  | 91     if step is None: | 
|  | 92         step = 1 | 
|  | 93     if start is None: | 
|  | 94         if step > 0: | 
|  | 95             start = 0 | 
|  | 96         else: | 
|  | 97             start = size - 1 | 
|  | 98     else: | 
|  | 99         start = _adjust_index(start, size) | 
|  | 100 | 
|  | 101     if stop is None: | 
|  | 102         if step > 0: | 
|  | 103             stop = size | 
|  | 104         else: | 
|  | 105             stop = -1 | 
|  | 106     else: | 
|  | 107         stop = _adjust_index(stop, size) | 
|  | 108 | 
|  | 109     return (start, stop, step) | 
|  | 110 | 
|  | 111 | 
|  | 112 def _adjust_index(n, size): | 
|  | 113     if n < 0: | 
|  | 114         n += size | 
|  | 115 | 
|  | 116     if n < 0: | 
|  | 117         raise IndexError("Out of range") | 
|  | 118     if n > size: | 
|  | 119         n = size | 
|  | 120     return n | 
|  | 121 | 
|  | 122 | 
|  | 123 def _xrange(*args): | 
|  | 124     """Because xrange doesn't support longs :(""" | 
|  | 125     # prefer real xrange if it works | 
|  | 126     try: | 
|  | 127         return xrange(*args) | 
|  | 128     except OverflowError: | 
|  | 129         return _bigrange(*args) | 
|  | 130 | 
|  | 131 | 
|  | 132 def _bigrange(*args): | 
|  | 133     if len(args) == 1: | 
|  | 134         start = 0; stop = args[0]; step = 1 | 
|  | 135     elif len(args) == 2: | 
|  | 136         start, stop = args | 
|  | 137         step = 1 | 
|  | 138     elif len(args) == 3: | 
|  | 139         start, stop, step = args | 
|  | 140     else: | 
|  | 141         raise ValueError("Too many args for _bigrange") | 
|  | 142 | 
|  | 143     i = start | 
|  | 144     while True: | 
|  | 145         yield i | 
|  | 146         i += step | 
|  | 147         if step < 0 and i <= stop: | 
|  | 148             break | 
|  | 149         if step > 0 and i >= stop: | 
|  | 150             break | 
|  | 151 | 
|  | 152 | 
|  | 153 class WrappedSequence(object): | 
|  | 154     """This wraps a sequence, purely as a base clase for the other uses.""" | 
|  | 155 | 
|  | 156     def __init__(self, raw): | 
|  | 157         # Derived classes will likely override this constructor | 
|  | 158         self.raw = raw | 
|  | 159         # Note that we can't use the function len() because it insists on trying | 
|  | 160         # to convert the returned number from a long-int to an ordinary int. | 
|  | 161         self.length = raw.__len__() | 
|  | 162 | 
|  | 163     def get_item(self, i, d=None): | 
|  | 164         i = _adjust_index(i, self.length) | 
|  | 165         if hasattr(self.raw, 'get_item'): | 
|  | 166             return self.raw.get_item(i, d) | 
|  | 167         return self.raw[i] | 
|  | 168 | 
|  | 169     def __len__(self): | 
|  | 170         return self.length | 
|  | 171 | 
|  | 172     def __getitem__(self, i): | 
|  | 173         # If the user wanted a slice, we provide a wrapper | 
|  | 174         if isinstance(i, slice): | 
|  | 175             result = SlicedSequence(self, slicer=i) | 
|  | 176             if result.__len__() < 16: | 
|  | 177                 # Short lists are unpacked | 
|  | 178                 result = [item for item in result] | 
|  | 179             return result | 
|  | 180         i = _adjust_index(i, self.length) | 
|  | 181         # Usually we just call the user-provided function | 
|  | 182         return self.get_item(i) | 
|  | 183 | 
|  | 184     def __iter__(self): | 
|  | 185         for i in _xrange(int(self.length)): | 
|  | 186             yield self.get_item(i) | 
|  | 187 | 
|  | 188 | 
|  | 189 def _sign(x): | 
|  | 190     if x > 0: | 
|  | 191         return 1 | 
|  | 192     else: | 
|  | 193         return -1 | 
|  | 194 | 
|  | 195 | 
|  | 196 class SlicedSequence(WrappedSequence): | 
|  | 197     """This is part of an immutable and potentially arbitrarily long list.""" | 
|  | 198 | 
|  | 199     def __init__(self, raw, slicer=None): | 
|  | 200         # Derived classes will likely override this constructor | 
|  | 201         self.raw = raw | 
|  | 202         if slicer is None: | 
|  | 203             self.start, self.stop, self.step = 0, raw.__len__(), 1 | 
|  | 204         else: | 
|  | 205             self.start, self.stop, self.step = slice_indices(slicer, raw.__len__()) | 
|  | 206 | 
|  | 207         # Integer round up, depending on step direction | 
|  | 208         self.length = ((self.stop - self.start + self.step - _sign(self.step)) / | 
|  | 209                        self.step) | 
|  | 210 | 
|  | 211     def get_item(self, i, d=None): | 
|  | 212         j = i * self.step + self.start | 
|  | 213         return self.raw[j] | 
|  | 214 | 
|  | 215 | 
|  | 216 class ConcatenatedSequence(WrappedSequence): | 
|  | 217     """This is equivalent to using extend() but without unpacking the lists.""" | 
|  | 218 | 
|  | 219     def __init__(self, *alternatives): | 
|  | 220         self.list_lengths = [(a, a.__len__()) for a in alternatives] | 
|  | 221         self.length = sum(a_len for _, a_len in self.list_lengths) | 
|  | 222 | 
|  | 223     def get_item(self, i, d=None): | 
|  | 224         for a, a_len in self.list_lengths: | 
|  | 225             if i < a_len: | 
|  | 226                 return a[i] | 
|  | 227             i -= a_len | 
|  | 228         raise IndexError('Too Big') | 
|  | 229 | 
|  | 230     def __contains__(self, item): | 
|  | 231         for a, _ in self.list_lengths: | 
|  | 232             if item in a: | 
|  | 233                 return True | 
|  | 234         return False | 
|  | 235 | 
|  | 236     def __repr__(self): | 
|  | 237         return '{concat ' + repr(self.list_lengths) + '}' | 
|  | 238 | 
|  | 239 | 
|  | 240 class CombinatoricsSequence(WrappedSequence): | 
|  | 241     """This uses all combinations of one item from each passed list.""" | 
|  | 242 | 
|  | 243     def __init__(self, *components): | 
|  | 244         self.list_lengths = [(a, a.__len__()) for a in components] | 
|  | 245         self.length = 1 | 
|  | 246         for _, c_len in self.list_lengths: | 
|  | 247             self.length *= c_len | 
|  | 248 | 
|  | 249     def get_item(self, i, d=None): | 
|  | 250         result = [] | 
|  | 251         if i < 0: | 
|  | 252             i += self.length | 
|  | 253         if i < 0 or i >= self.length: | 
|  | 254             raise IndexError("Index %d out of bounds" % (i,)) | 
|  | 255 | 
|  | 256         if len(self.list_lengths) == 1: | 
|  | 257             # skip unnecessary ''.join -- big speedup | 
|  | 258             return self.list_lengths[0][0][i] | 
|  | 259 | 
|  | 260         for c, c_len in self.list_lengths: | 
|  | 261             i, mod = divmod(i, c_len) | 
|  | 262             if hasattr(c, 'get_item'): | 
|  | 263                 result.append(c.get_item(mod, d)) | 
|  | 264             else: | 
|  | 265                 result.append(c[mod]) | 
|  | 266         return ''.join(result) | 
|  | 267 | 
|  | 268     def __repr__(self): | 
|  | 269         return '{combin ' + repr(self.list_lengths) + '}' | 
|  | 270 | 
|  | 271 | 
|  | 272 class RepetitiveSequence(WrappedSequence): | 
|  | 273     """This chooses an entry from a list, many times, and concatenates.""" | 
|  | 274 | 
|  | 275     def __init__(self, content, lowest=1, highest=1): | 
|  | 276         self.content = content | 
|  | 277         self.content_length = content.__len__() | 
|  | 278         self.length = fastdivmod.powersum(self.content_length, lowest, highest) | 
|  | 279         self.lowest = lowest | 
|  | 280         self.highest = highest | 
|  | 281 | 
|  | 282         def arbitrary_entry(i): | 
|  | 283             return (fastdivmod.powersum(self.content_length, lowest, i+lowest-1), i+lowest) | 
|  | 284 | 
|  | 285         def entry_from_prev(i, prev): | 
|  | 286             return (prev[0] + (self.content_length ** prev[1]), prev[1] + 1) | 
|  | 287 | 
|  | 288         self.offsets = cachingseq.CachingFuncSequence( | 
|  | 289             arbitrary_entry, highest - lowest+1, entry_from_prev) | 
|  | 290         # This needs to be a constant in order to reuse caclulations in future | 
|  | 291         # calls to bisect (a moving target will produce more misses). | 
|  | 292         if self.offsets[-1][0] > sys.maxsize: | 
|  | 293             i = 0 | 
|  | 294             while i + 2 < len(self.offsets): | 
|  | 295                 if self.offsets[i+1][0] > sys.maxsize: | 
|  | 296                     self.index_of_offset = i | 
|  | 297                     self.offset_break = self.offsets[i][0] | 
|  | 298                     break | 
|  | 299                 i += 1 | 
|  | 300         else: | 
|  | 301             self.index_of_offset = len(self.offsets) | 
|  | 302             self.offset_break = sys.maxsize | 
|  | 303 | 
|  | 304     def get_item(self, i, d=None): | 
|  | 305         """Finds out how many repeats this index implies, then picks strings.""" | 
|  | 306         if i < self.offset_break: | 
|  | 307             by_bisect = bisect.bisect_left(self.offsets, (i, -1), hi=self.index_of_offset) | 
|  | 308         else: | 
|  | 309             by_bisect = bisect.bisect_left(self.offsets, (i, -1), lo=self.index_of_offset) | 
|  | 310 | 
|  | 311         if by_bisect == len(self.offsets) or self.offsets[by_bisect][0] > i: | 
|  | 312             by_bisect -= 1 | 
|  | 313 | 
|  | 314         num = i - self.offsets[by_bisect][0] | 
|  | 315         count = self.offsets[by_bisect][1] | 
|  | 316 | 
|  | 317         if count > 100 and self.content_length < 1000: | 
|  | 318             content = list(self.content) | 
|  | 319         else: | 
|  | 320             content = self.content | 
|  | 321 | 
|  | 322         result = [] | 
|  | 323 | 
|  | 324         if count == 0: | 
|  | 325             return '' | 
|  | 326 | 
|  | 327         for modulus in fastdivmod.divmod_iter(num, self.content_length): | 
|  | 328             result.append(content[modulus]) | 
|  | 329 | 
|  | 330         leftover = count - len(result) | 
|  | 331         if leftover: | 
|  | 332             assert leftover > 0 | 
|  | 333             result.extend([content[0]] * leftover) | 
|  | 334 | 
|  | 335         # smallest place value ends up on the right | 
|  | 336         return ''.join(result[::-1]) | 
|  | 337 | 
|  | 338     def __repr__(self): | 
|  | 339         return '{repeat base=%d low=%d high=%d}' % (self.content_length, self.lowest, self.highest) | 
|  | 340 | 
|  | 341 | 
|  | 342 class SaveCaptureGroup(WrappedSequence): | 
|  | 343     def __init__(self, parsed, key): | 
|  | 344         self.key = key | 
|  | 345         super(SaveCaptureGroup, self).__init__(parsed) | 
|  | 346 | 
|  | 347     def get_item(self, n, d=None): | 
|  | 348         rv = super(SaveCaptureGroup, self).get_item(n, d) | 
|  | 349         if d is not None: | 
|  | 350             d[self.key] = rv | 
|  | 351         return rv | 
|  | 352 | 
|  | 353 | 
|  | 354 class ReadCaptureGroup(WrappedSequence): | 
|  | 355     def __init__(self, n): | 
|  | 356         self.num = n | 
|  | 357         self.length = 1 | 
|  | 358 | 
|  | 359     def get_item(self, i, d=None): | 
|  | 360         if i != 0: | 
|  | 361             raise IndexError(i) | 
|  | 362         if d is None: | 
|  | 363             raise ValueError('ReadCaptureGroup with no dict') | 
|  | 364         return d.get(self.num, "fail") | 
|  | 365 | 
|  | 366 | 
|  | 367 class RegexMembershipSequence(WrappedSequence): | 
|  | 368     """Creates a sequence from the regex, knows how to test membership.""" | 
|  | 369 | 
|  | 370     def empty_list(self, *_): | 
|  | 371         return [] | 
|  | 372 | 
|  | 373     def nothing_added(self, *_): | 
|  | 374         return [''] | 
|  | 375 | 
|  | 376     def branch_values(self, _, items): | 
|  | 377         """Converts SRE parser data into literals and merges those lists.""" | 
|  | 378         return ConcatenatedSequence( | 
|  | 379             *[self.sub_values(parsed) for parsed in items]) | 
|  | 380 | 
|  | 381     def max_repeat_values(self, min_count, max_count, items): | 
|  | 382         """Sequential expansion of the count to be combinatorics.""" | 
|  | 383         max_count = min(max_count, self.max_count) | 
|  | 384         return RepetitiveSequence( | 
|  | 385             self.sub_values(items), min_count, max_count) | 
|  | 386 | 
|  | 387     def in_values(self, items): | 
|  | 388         # Special case which distinguishes branch from charset operator | 
|  | 389         if items and items[0][0] == sre_constants.NEGATE: | 
|  | 390             items = self.branch_values(None, items[1:]) | 
|  | 391             return [item for item in self.charset if item not in items] | 
|  | 392         return self.branch_values(None, items) | 
|  | 393 | 
|  | 394     def not_literal(self, y): | 
|  | 395         return self.in_values(((sre_constants.NEGATE,), | 
|  | 396                               (sre_constants.LITERAL, y),)) | 
|  | 397 | 
|  | 398     def category(self, y): | 
|  | 399         return CATEGORIES[y] | 
|  | 400 | 
|  | 401     def groupref(self, n): | 
|  | 402         self.has_groupref = True | 
|  | 403         return ReadCaptureGroup(n) | 
|  | 404 | 
|  | 405     def get_item(self, i, d=None): | 
|  | 406         """Typically only pass i.  d is an internal detail, for consistency with other classes. | 
|  | 407 | 
|  | 408         If you care about the capture groups, you should use | 
|  | 409         RegexMembershipSequenceMatches instead, which returns a Match object | 
|  | 410         instead of a string.""" | 
|  | 411         if self.has_groupref or d is not None: | 
|  | 412             if d is None: | 
|  | 413                 d = {} | 
|  | 414             return super(RegexMembershipSequence, self).get_item(i, d) | 
|  | 415         else: | 
|  | 416             return super(RegexMembershipSequence, self).get_item(i) | 
|  | 417 | 
|  | 418     def sub_values(self, parsed): | 
|  | 419         """This knows how to convert one piece of parsed pattern.""" | 
|  | 420         # If this is a subpattern object, we just want its data | 
|  | 421         if isinstance(parsed, sre_parse.SubPattern): | 
|  | 422             parsed = parsed.data | 
|  | 423         # A list indicates sequential elements of a string | 
|  | 424         if isinstance(parsed, list): | 
|  | 425             elements = [self.sub_values(p) for p in parsed] | 
|  | 426             return CombinatoricsSequence(*elements) | 
|  | 427         # If not a list, a tuple represents a specific match type | 
|  | 428         if isinstance(parsed, tuple) and parsed: | 
|  | 429             matcher, arguments = parsed | 
|  | 430             if not isinstance(arguments, tuple): | 
|  | 431                 arguments = (arguments,) | 
|  | 432             if matcher in self.backends: | 
|  | 433                 self.check_anchor_state(matcher, arguments) | 
|  | 434                 return self.backends[matcher](*arguments) | 
|  | 435         # No idea what to do here | 
|  | 436         raise ParseError(repr(parsed)) | 
|  | 437 | 
|  | 438     def maybe_save(self, *args): | 
|  | 439         # Python 3.6 has group, add_flags, del_flags, parsed | 
|  | 440         # while earlier versions just have group, parsed | 
|  | 441         group = args[0] | 
|  | 442         parsed = args[-1] | 
|  | 443         rv = self.sub_values(parsed) | 
|  | 444         if group is not None: | 
|  | 445             rv = SaveCaptureGroup(rv, group) | 
|  | 446         return rv | 
|  | 447 | 
|  | 448     def check_anchor_state(self, matcher, arguments): | 
|  | 449         # A bit of a hack to support zero-width leading anchors.  The goal is | 
|  | 450         # that /^(a|b)$/ will match properly, and that /a^b/ or /a\bb/ throws | 
|  | 451         # an error.  (It's unfortunate that I couldn't easily handle /$^/ which | 
|  | 452         # matches the empty string; I went for the common case.) | 
|  | 453         # | 
|  | 454         # There are three states, for example: | 
|  | 455         # / STATE_START | 
|  | 456         # | / STATE_START (^ causes no transition here, but is illegal at STATE_MIDDLE or STATE_END) | 
|  | 457         # | |  / STATE_START (\b causes no transition here, but advances MIDDLE to END) | 
|  | 458         # | |  | / (same as above for ^) | 
|  | 459         # | |  | | / STATE_MIDDLE (anything besides ^ and \b advances START to MIDDLE) | 
|  | 460         # | |  | | | / still STATE_MIDDLE | 
|  | 461         # . .  . . . .  / advances MIDDLE to END | 
|  | 462         #  ^ \b ^ X Y \b $ | 
|  | 463         old_state = self.state | 
|  | 464         if self.state == STATE_START: | 
|  | 465             if matcher == sre_constants.AT: | 
|  | 466                 if arguments[0] in (sre_constants.AT_END, sre_constants.AT_END_STRING): | 
|  | 467                     self.state = STATE_END | 
|  | 468                 elif arguments[0] == sre_constants.AT_NON_BOUNDARY: | 
|  | 469                     # This is nonsensical at beginning of string | 
|  | 470                     raise ParseError('Anchor %r found at START state' % (arguments[0],)) | 
|  | 471                 # All others (AT_BEGINNING, AT_BEGINNING_STRING, and AT_BOUNDARY) remain in START. | 
|  | 472             elif matcher != sre_constants.SUBPATTERN: | 
|  | 473                 self.state = STATE_MIDDLE | 
|  | 474             # subpattern remains in START | 
|  | 475         elif self.state == STATE_END: | 
|  | 476             if matcher == sre_constants.AT: | 
|  | 477                 if arguments[0] not in ( | 
|  | 478                     sre_constants.AT_END, sre_constants.AT_END_STRING, | 
|  | 479                     sre_constants.AT_BOUNDARY): | 
|  | 480                     raise ParseError('Anchor %r found at END state' % (arguments[0],)) | 
|  | 481                 # those three remain in END | 
|  | 482             elif matcher != sre_constants.SUBPATTERN: | 
|  | 483                 raise ParseError('Non-end-anchor %r found at END state' % (arguments[0],)) | 
|  | 484             # subpattern remains in END | 
|  | 485         else:  # self.state == STATE_MIDDLE | 
|  | 486             if matcher == sre_constants.AT: | 
|  | 487                 if arguments[0] not in ( | 
|  | 488                     sre_constants.AT_END, sre_constants.AT_END_STRING, | 
|  | 489                     sre_constants.AT_BOUNDARY): | 
|  | 490                     raise ParseError('Anchor %r found at MIDDLE state' % (arguments[0],)) | 
|  | 491                 # All others (AT_END, AT_END_STRING, AT_BOUNDARY) advance to END. | 
|  | 492                 self.state = STATE_END | 
|  | 493 | 
|  | 494     def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None): | 
|  | 495         # If the RE module cannot compile it, we give up quickly | 
|  | 496         self.matcher = re.compile(r'(?:%s)\Z' % pattern, flags) | 
|  | 497         if not flags & re.DOTALL: | 
|  | 498             charset = ''.join(c for c in charset if c != '\n') | 
|  | 499         self.charset = charset | 
|  | 500 | 
|  | 501         self.named_group_lookup = self.matcher.groupindex | 
|  | 502 | 
|  | 503         flags |= DEFAULT_RE_FLAGS # https://github.com/google/sre_yield/issues/3 | 
|  | 504         if flags & re.IGNORECASE: | 
|  | 505             raise ParseError('Flag "i" not supported. https://github.com/google/sre_yield/issues/4') | 
|  | 506         elif flags & re.UNICODE: | 
|  | 507             raise ParseError('Flag "u" not supported. https://github.com/google/sre_yield/issues/3') | 
|  | 508         elif flags & re.LOCALE: | 
|  | 509             raise ParseError('Flag "l" not supported. https://github.com/google/sre_yield/issues/5') | 
|  | 510 | 
|  | 511         if max_count is None: | 
|  | 512             self.max_count = MAX_REPEAT_COUNT | 
|  | 513         else: | 
|  | 514             self.max_count = max_count | 
|  | 515 | 
|  | 516         self.has_groupref = False | 
|  | 517 | 
|  | 518         # Configure the parser backends | 
|  | 519         self.backends = { | 
|  | 520             sre_constants.LITERAL: lambda y: [chr(y)], | 
|  | 521             sre_constants.RANGE: lambda l, h: [chr(c) for c in range(l, h+1)], | 
|  | 522             sre_constants.SUBPATTERN: self.maybe_save, | 
|  | 523             sre_constants.BRANCH: self.branch_values, | 
|  | 524             sre_constants.MIN_REPEAT: self.max_repeat_values, | 
|  | 525             sre_constants.MAX_REPEAT: self.max_repeat_values, | 
|  | 526             sre_constants.AT: self.nothing_added, | 
|  | 527             sre_constants.ASSERT: self.empty_list, | 
|  | 528             sre_constants.ASSERT_NOT: self.empty_list, | 
|  | 529             sre_constants.ANY: | 
|  | 530                 lambda _: self.in_values(((sre_constants.NEGATE,),)), | 
|  | 531             sre_constants.IN: self.in_values, | 
|  | 532             sre_constants.NOT_LITERAL: self.not_literal, | 
|  | 533             sre_constants.CATEGORY: self.category, | 
|  | 534             sre_constants.GROUPREF: self.groupref, | 
|  | 535         } | 
|  | 536         self.state = STATE_START | 
|  | 537         # Now build a generator that knows all possible patterns | 
|  | 538         self.raw = self.sub_values(sre_parse.parse(pattern, flags)) | 
|  | 539         # Configure this class instance to know about that result | 
|  | 540         self.length = self.raw.__len__() | 
|  | 541 | 
|  | 542     def __contains__(self, item): | 
|  | 543         # Since we have a regex, we can search the list really cheaply | 
|  | 544         return self.matcher.match(item) is not None | 
|  | 545 | 
|  | 546 | 
|  | 547 class RegexMembershipSequenceMatches(RegexMembershipSequence): | 
|  | 548     def __getitem__(self, i): | 
|  | 549         if isinstance(i, slice): | 
|  | 550             result = SlicedSequence(self, slicer=i) | 
|  | 551             if result.__len__() < 16: | 
|  | 552                 # Short lists are unpacked | 
|  | 553                 result = [item for item in result] | 
|  | 554             return result | 
|  | 555 | 
|  | 556         d = {} | 
|  | 557         s = super(RegexMembershipSequenceMatches, self).get_item(i, d) | 
|  | 558         return Match(s, d, self.named_group_lookup) | 
|  | 559 | 
|  | 560 | 
|  | 561 def AllStrings(regex, flags=0, charset=CHARSET, max_count=None): | 
|  | 562     """Constructs an object that will generate all matching strings.""" | 
|  | 563     return RegexMembershipSequence(regex, flags, charset, max_count=max_count) | 
|  | 564 | 
|  | 565 Values = AllStrings | 
|  | 566 | 
|  | 567 | 
|  | 568 class Match(object): | 
|  | 569     def __init__(self, string, groups, named_groups): | 
|  | 570         # TODO keep group(0) only, and spans for the rest. | 
|  | 571         self._string = string | 
|  | 572         self._groups = groups | 
|  | 573         self._named_groups = named_groups | 
|  | 574         self.lastindex = len(groups) + 1 | 
|  | 575 | 
|  | 576     def group(self, n=0): | 
|  | 577         if n == 0: | 
|  | 578             return self._string | 
|  | 579         if not isinstance(n, int): | 
|  | 580             n = self._named_groups[n] | 
|  | 581         return self._groups[n] | 
|  | 582 | 
|  | 583     def groups(self): | 
|  | 584         return tuple(self._groups[i] for i in range(1, self.lastindex)) | 
|  | 585 | 
|  | 586     def groupdict(self): | 
|  | 587         d = {} | 
|  | 588         for k, v in self._named_groups.items(): | 
|  | 589             d[k] = self._groups[v] | 
|  | 590         return d | 
|  | 591 | 
|  | 592     def span(self, n=0): | 
|  | 593         raise NotImplementedError() | 
|  | 594 | 
|  | 595 | 
|  | 596 def AllMatches(regex, flags=0, charset=CHARSET, max_count=None): | 
|  | 597     """Constructs an object that will generate all matching strings.""" | 
|  | 598     return RegexMembershipSequenceMatches(regex, flags, charset, max_count=max_count) | 
|  | 599 | 
|  | 600 | 
|  | 601 def main(argv=None): | 
|  | 602     """This module can be executed on the command line for testing.""" | 
|  | 603     if argv is None: | 
|  | 604         argv = sys.argv | 
|  | 605     for arg in argv[1:]: | 
|  | 606         for i in AllStrings(arg): | 
|  | 607             print(i) | 
|  | 608 | 
|  | 609 | 
|  | 610 if __name__ == '__main__': | 
|  | 611     main() |