| 
41
 | 
     1 #!/usr/bin/env python2
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright 2011-2016 Google Inc.
 | 
| 
 | 
     4 #
 | 
| 
 | 
     5 # Licensed under the Apache License, Version 2.0 (the "License");
 | 
| 
 | 
     6 # you may not use this file except in compliance with the License.
 | 
| 
 | 
     7 # You may obtain a copy of the License at
 | 
| 
 | 
     8 #
 | 
| 
 | 
     9 #      http://www.apache.org/licenses/LICENSE-2.0
 | 
| 
 | 
    10 #
 | 
| 
 | 
    11 # Unless required by applicable law or agreed to in writing, software
 | 
| 
 | 
    12 # distributed under the License is distributed on an "AS IS" BASIS,
 | 
| 
 | 
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
| 
 | 
    14 # See the License for the specific language governing permissions and
 | 
| 
 | 
    15 # limitations under the License.
 | 
| 
 | 
    16 #
 | 
| 
 | 
    17 # vim: sw=2 sts=2 et
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 """This module can generate all strings that match a regular expression.
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 The regex is parsed using the SRE module that is standard in python,
 | 
| 
 | 
    22 then the data structure is executed to form a bunch of iterators.
 | 
| 
 | 
    23 """
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 __author__ = 'alexperry@google.com (Alex Perry)'
 | 
| 
 | 
    26 __all__ = ['Values', 'AllStrings', 'AllMatches', 'ParseError']
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 import bisect
 | 
| 
 | 
    30 import math
 | 
| 
 | 
    31 import re
 | 
| 
 | 
    32 import sre_constants
 | 
| 
 | 
    33 import sre_parse
 | 
| 
 | 
    34 import string
 | 
| 
 | 
    35 import sys
 | 
| 
 | 
    36 import types
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 import cachingseq
 | 
| 
 | 
    39 import fastdivmod
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 try:
 | 
| 
 | 
    42     xrange = xrange
 | 
| 
 | 
    43 except NameError:
 | 
| 
 | 
    44     xrange = range
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 _RE_METACHARS = r'$^{}*+\\'
 | 
| 
 | 
    47 _ESCAPED_METACHAR = r'\\[' + _RE_METACHARS + r']'
 | 
| 
 | 
    48 ESCAPED_METACHAR_RE = re.compile(_ESCAPED_METACHAR)
 | 
| 
 | 
    49 # ASCII by default, see https://github.com/google/sre_yield/issues/3
 | 
| 
 | 
    50 CHARSET = [chr(c) for c in range(256)]
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 WORD = string.ascii_letters + string.digits + '_'
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 try:
 | 
| 
 | 
    55     DEFAULT_RE_FLAGS = re.ASCII
 | 
| 
 | 
    56 except AttributeError:
 | 
| 
 | 
    57     DEFAULT_RE_FLAGS = 0
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 STATE_START, STATE_MIDDLE, STATE_END = list(range(3))
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 def Not(chars):
 | 
| 
 | 
    62     return ''.join(sorted(set(CHARSET) - set(chars)))
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 CATEGORIES = {
 | 
| 
 | 
    66     sre_constants.CATEGORY_WORD: WORD,
 | 
| 
 | 
    67     sre_constants.CATEGORY_NOT_WORD: Not(WORD),
 | 
| 
 | 
    68     sre_constants.CATEGORY_DIGIT: string.digits,
 | 
| 
 | 
    69     sre_constants.CATEGORY_NOT_DIGIT: Not(string.digits),
 | 
| 
 | 
    70     sre_constants.CATEGORY_SPACE: string.whitespace,
 | 
| 
 | 
    71     sre_constants.CATEGORY_NOT_SPACE: Not(string.whitespace),
 | 
| 
 | 
    72 }
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 # This constant varies between builds of Python; this is the lower value.
 | 
| 
 | 
    75 MAX_REPEAT_COUNT = 65535
 | 
| 
 | 
    76 
 | 
| 
 | 
    77 
 | 
| 
 | 
    78 class ParseError(Exception):
 | 
| 
 | 
    79     pass
 | 
| 
 | 
    80 
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 def slice_indices(slice_obj, size):
 | 
| 
 | 
    83     """slice_obj.indices() except this one supports longs."""
 | 
| 
 | 
    84     # start stop step
 | 
| 
 | 
    85     start = slice_obj.start
 | 
| 
 | 
    86     stop = slice_obj.stop
 | 
| 
 | 
    87     step = slice_obj.step
 | 
| 
 | 
    88 
 | 
| 
 | 
    89     # We don't always update a value for negative indices (if we wrote it here
 | 
| 
 | 
    90     # due to None).
 | 
| 
 | 
    91     if step is None:
 | 
| 
 | 
    92         step = 1
 | 
| 
 | 
    93     if start is None:
 | 
| 
 | 
    94         if step > 0:
 | 
| 
 | 
    95             start = 0
 | 
| 
 | 
    96         else:
 | 
| 
 | 
    97             start = size - 1
 | 
| 
 | 
    98     else:
 | 
| 
 | 
    99         start = _adjust_index(start, size)
 | 
| 
 | 
   100 
 | 
| 
 | 
   101     if stop is None:
 | 
| 
 | 
   102         if step > 0:
 | 
| 
 | 
   103             stop = size
 | 
| 
 | 
   104         else:
 | 
| 
 | 
   105             stop = -1
 | 
| 
 | 
   106     else:
 | 
| 
 | 
   107         stop = _adjust_index(stop, size)
 | 
| 
 | 
   108 
 | 
| 
 | 
   109     return (start, stop, step)
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 def _adjust_index(n, size):
 | 
| 
 | 
   113     if n < 0:
 | 
| 
 | 
   114         n += size
 | 
| 
 | 
   115 
 | 
| 
 | 
   116     if n < 0:
 | 
| 
 | 
   117         raise IndexError("Out of range")
 | 
| 
 | 
   118     if n > size:
 | 
| 
 | 
   119         n = size
 | 
| 
 | 
   120     return n
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 def _xrange(*args):
 | 
| 
 | 
   124     """Because xrange doesn't support longs :("""
 | 
| 
 | 
   125     # prefer real xrange if it works
 | 
| 
 | 
   126     try:
 | 
| 
 | 
   127         return xrange(*args)
 | 
| 
 | 
   128     except OverflowError:
 | 
| 
 | 
   129         return _bigrange(*args)
 | 
| 
 | 
   130 
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 def _bigrange(*args):
 | 
| 
 | 
   133     if len(args) == 1:
 | 
| 
 | 
   134         start = 0; stop = args[0]; step = 1
 | 
| 
 | 
   135     elif len(args) == 2:
 | 
| 
 | 
   136         start, stop = args
 | 
| 
 | 
   137         step = 1
 | 
| 
 | 
   138     elif len(args) == 3:
 | 
| 
 | 
   139         start, stop, step = args
 | 
| 
 | 
   140     else:
 | 
| 
 | 
   141         raise ValueError("Too many args for _bigrange")
 | 
| 
 | 
   142 
 | 
| 
 | 
   143     i = start
 | 
| 
 | 
   144     while True:
 | 
| 
 | 
   145         yield i
 | 
| 
 | 
   146         i += step
 | 
| 
 | 
   147         if step < 0 and i <= stop:
 | 
| 
 | 
   148             break
 | 
| 
 | 
   149         if step > 0 and i >= stop:
 | 
| 
 | 
   150             break
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 
 | 
| 
 | 
   153 class WrappedSequence(object):
 | 
| 
 | 
   154     """This wraps a sequence, purely as a base clase for the other uses."""
 | 
| 
 | 
   155 
 | 
| 
 | 
   156     def __init__(self, raw):
 | 
| 
 | 
   157         # Derived classes will likely override this constructor
 | 
| 
 | 
   158         self.raw = raw
 | 
| 
 | 
   159         # Note that we can't use the function len() because it insists on trying
 | 
| 
 | 
   160         # to convert the returned number from a long-int to an ordinary int.
 | 
| 
 | 
   161         self.length = raw.__len__()
 | 
| 
 | 
   162 
 | 
| 
 | 
   163     def get_item(self, i, d=None):
 | 
| 
 | 
   164         i = _adjust_index(i, self.length)
 | 
| 
 | 
   165         if hasattr(self.raw, 'get_item'):
 | 
| 
 | 
   166             return self.raw.get_item(i, d)
 | 
| 
 | 
   167         return self.raw[i]
 | 
| 
 | 
   168 
 | 
| 
 | 
   169     def __len__(self):
 | 
| 
 | 
   170         return self.length
 | 
| 
 | 
   171 
 | 
| 
 | 
   172     def __getitem__(self, i):
 | 
| 
 | 
   173         # If the user wanted a slice, we provide a wrapper
 | 
| 
 | 
   174         if isinstance(i, slice):
 | 
| 
 | 
   175             result = SlicedSequence(self, slicer=i)
 | 
| 
 | 
   176             if result.__len__() < 16:
 | 
| 
 | 
   177                 # Short lists are unpacked
 | 
| 
 | 
   178                 result = [item for item in result]
 | 
| 
 | 
   179             return result
 | 
| 
 | 
   180         i = _adjust_index(i, self.length)
 | 
| 
 | 
   181         # Usually we just call the user-provided function
 | 
| 
 | 
   182         return self.get_item(i)
 | 
| 
 | 
   183 
 | 
| 
 | 
   184     def __iter__(self):
 | 
| 
 | 
   185         for i in _xrange(int(self.length)):
 | 
| 
 | 
   186             yield self.get_item(i)
 | 
| 
 | 
   187 
 | 
| 
 | 
   188 
 | 
| 
 | 
   189 def _sign(x):
 | 
| 
 | 
   190     if x > 0:
 | 
| 
 | 
   191         return 1
 | 
| 
 | 
   192     else:
 | 
| 
 | 
   193         return -1
 | 
| 
 | 
   194 
 | 
| 
 | 
   195 
 | 
| 
 | 
   196 class SlicedSequence(WrappedSequence):
 | 
| 
 | 
   197     """This is part of an immutable and potentially arbitrarily long list."""
 | 
| 
 | 
   198 
 | 
| 
 | 
   199     def __init__(self, raw, slicer=None):
 | 
| 
 | 
   200         # Derived classes will likely override this constructor
 | 
| 
 | 
   201         self.raw = raw
 | 
| 
 | 
   202         if slicer is None:
 | 
| 
 | 
   203             self.start, self.stop, self.step = 0, raw.__len__(), 1
 | 
| 
 | 
   204         else:
 | 
| 
 | 
   205             self.start, self.stop, self.step = slice_indices(slicer, raw.__len__())
 | 
| 
 | 
   206 
 | 
| 
 | 
   207         # Integer round up, depending on step direction
 | 
| 
 | 
   208         self.length = ((self.stop - self.start + self.step - _sign(self.step)) /
 | 
| 
 | 
   209                        self.step)
 | 
| 
 | 
   210 
 | 
| 
 | 
   211     def get_item(self, i, d=None):
 | 
| 
 | 
   212         j = i * self.step + self.start
 | 
| 
 | 
   213         return self.raw[j]
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 
 | 
| 
 | 
   216 class ConcatenatedSequence(WrappedSequence):
 | 
| 
 | 
   217     """This is equivalent to using extend() but without unpacking the lists."""
 | 
| 
 | 
   218 
 | 
| 
 | 
   219     def __init__(self, *alternatives):
 | 
| 
 | 
   220         self.list_lengths = [(a, a.__len__()) for a in alternatives]
 | 
| 
 | 
   221         self.length = sum(a_len for _, a_len in self.list_lengths)
 | 
| 
 | 
   222 
 | 
| 
 | 
   223     def get_item(self, i, d=None):
 | 
| 
 | 
   224         for a, a_len in self.list_lengths:
 | 
| 
 | 
   225             if i < a_len:
 | 
| 
 | 
   226                 return a[i]
 | 
| 
 | 
   227             i -= a_len
 | 
| 
 | 
   228         raise IndexError('Too Big')
 | 
| 
 | 
   229 
 | 
| 
 | 
   230     def __contains__(self, item):
 | 
| 
 | 
   231         for a, _ in self.list_lengths:
 | 
| 
 | 
   232             if item in a:
 | 
| 
 | 
   233                 return True
 | 
| 
 | 
   234         return False
 | 
| 
 | 
   235 
 | 
| 
 | 
   236     def __repr__(self):
 | 
| 
 | 
   237         return '{concat ' + repr(self.list_lengths) + '}'
 | 
| 
 | 
   238 
 | 
| 
 | 
   239 
 | 
| 
 | 
   240 class CombinatoricsSequence(WrappedSequence):
 | 
| 
 | 
   241     """This uses all combinations of one item from each passed list."""
 | 
| 
 | 
   242 
 | 
| 
 | 
   243     def __init__(self, *components):
 | 
| 
 | 
   244         self.list_lengths = [(a, a.__len__()) for a in components]
 | 
| 
 | 
   245         self.length = 1
 | 
| 
 | 
   246         for _, c_len in self.list_lengths:
 | 
| 
 | 
   247             self.length *= c_len
 | 
| 
 | 
   248 
 | 
| 
 | 
   249     def get_item(self, i, d=None):
 | 
| 
 | 
   250         result = []
 | 
| 
 | 
   251         if i < 0:
 | 
| 
 | 
   252             i += self.length
 | 
| 
 | 
   253         if i < 0 or i >= self.length:
 | 
| 
 | 
   254             raise IndexError("Index %d out of bounds" % (i,))
 | 
| 
 | 
   255 
 | 
| 
 | 
   256         if len(self.list_lengths) == 1:
 | 
| 
 | 
   257             # skip unnecessary ''.join -- big speedup
 | 
| 
 | 
   258             return self.list_lengths[0][0][i]
 | 
| 
 | 
   259 
 | 
| 
 | 
   260         for c, c_len in self.list_lengths:
 | 
| 
 | 
   261             i, mod = divmod(i, c_len)
 | 
| 
 | 
   262             if hasattr(c, 'get_item'):
 | 
| 
 | 
   263                 result.append(c.get_item(mod, d))
 | 
| 
 | 
   264             else:
 | 
| 
 | 
   265                 result.append(c[mod])
 | 
| 
 | 
   266         return ''.join(result)
 | 
| 
 | 
   267 
 | 
| 
 | 
   268     def __repr__(self):
 | 
| 
 | 
   269         return '{combin ' + repr(self.list_lengths) + '}'
 | 
| 
 | 
   270 
 | 
| 
 | 
   271 
 | 
| 
 | 
   272 class RepetitiveSequence(WrappedSequence):
 | 
| 
 | 
   273     """This chooses an entry from a list, many times, and concatenates."""
 | 
| 
 | 
   274 
 | 
| 
 | 
   275     def __init__(self, content, lowest=1, highest=1):
 | 
| 
 | 
   276         self.content = content
 | 
| 
 | 
   277         self.content_length = content.__len__()
 | 
| 
 | 
   278         self.length = fastdivmod.powersum(self.content_length, lowest, highest)
 | 
| 
 | 
   279         self.lowest = lowest
 | 
| 
 | 
   280         self.highest = highest
 | 
| 
 | 
   281 
 | 
| 
 | 
   282         def arbitrary_entry(i):
 | 
| 
 | 
   283             return (fastdivmod.powersum(self.content_length, lowest, i+lowest-1), i+lowest)
 | 
| 
 | 
   284 
 | 
| 
 | 
   285         def entry_from_prev(i, prev):
 | 
| 
 | 
   286             return (prev[0] + (self.content_length ** prev[1]), prev[1] + 1)
 | 
| 
 | 
   287 
 | 
| 
 | 
   288         self.offsets = cachingseq.CachingFuncSequence(
 | 
| 
 | 
   289             arbitrary_entry, highest - lowest+1, entry_from_prev)
 | 
| 
 | 
   290         # This needs to be a constant in order to reuse caclulations in future
 | 
| 
 | 
   291         # calls to bisect (a moving target will produce more misses).
 | 
| 
 | 
   292         if self.offsets[-1][0] > sys.maxsize:
 | 
| 
 | 
   293             i = 0
 | 
| 
 | 
   294             while i + 2 < len(self.offsets):
 | 
| 
 | 
   295                 if self.offsets[i+1][0] > sys.maxsize:
 | 
| 
 | 
   296                     self.index_of_offset = i
 | 
| 
 | 
   297                     self.offset_break = self.offsets[i][0]
 | 
| 
 | 
   298                     break
 | 
| 
 | 
   299                 i += 1
 | 
| 
 | 
   300         else:
 | 
| 
 | 
   301             self.index_of_offset = len(self.offsets)
 | 
| 
 | 
   302             self.offset_break = sys.maxsize
 | 
| 
 | 
   303 
 | 
| 
 | 
   304     def get_item(self, i, d=None):
 | 
| 
 | 
   305         """Finds out how many repeats this index implies, then picks strings."""
 | 
| 
 | 
   306         if i < self.offset_break:
 | 
| 
 | 
   307             by_bisect = bisect.bisect_left(self.offsets, (i, -1), hi=self.index_of_offset)
 | 
| 
 | 
   308         else:
 | 
| 
 | 
   309             by_bisect = bisect.bisect_left(self.offsets, (i, -1), lo=self.index_of_offset)
 | 
| 
 | 
   310 
 | 
| 
 | 
   311         if by_bisect == len(self.offsets) or self.offsets[by_bisect][0] > i:
 | 
| 
 | 
   312             by_bisect -= 1
 | 
| 
 | 
   313 
 | 
| 
 | 
   314         num = i - self.offsets[by_bisect][0]
 | 
| 
 | 
   315         count = self.offsets[by_bisect][1]
 | 
| 
 | 
   316 
 | 
| 
 | 
   317         if count > 100 and self.content_length < 1000:
 | 
| 
 | 
   318             content = list(self.content)
 | 
| 
 | 
   319         else:
 | 
| 
 | 
   320             content = self.content
 | 
| 
 | 
   321 
 | 
| 
 | 
   322         result = []
 | 
| 
 | 
   323 
 | 
| 
 | 
   324         if count == 0:
 | 
| 
 | 
   325             return ''
 | 
| 
 | 
   326 
 | 
| 
 | 
   327         for modulus in fastdivmod.divmod_iter(num, self.content_length):
 | 
| 
 | 
   328             result.append(content[modulus])
 | 
| 
 | 
   329 
 | 
| 
 | 
   330         leftover = count - len(result)
 | 
| 
 | 
   331         if leftover:
 | 
| 
 | 
   332             assert leftover > 0
 | 
| 
 | 
   333             result.extend([content[0]] * leftover)
 | 
| 
 | 
   334 
 | 
| 
 | 
   335         # smallest place value ends up on the right
 | 
| 
 | 
   336         return ''.join(result[::-1])
 | 
| 
 | 
   337 
 | 
| 
 | 
   338     def __repr__(self):
 | 
| 
 | 
   339         return '{repeat base=%d low=%d high=%d}' % (self.content_length, self.lowest, self.highest)
 | 
| 
 | 
   340 
 | 
| 
 | 
   341 
 | 
| 
 | 
   342 class SaveCaptureGroup(WrappedSequence):
 | 
| 
 | 
   343     def __init__(self, parsed, key):
 | 
| 
 | 
   344         self.key = key
 | 
| 
 | 
   345         super(SaveCaptureGroup, self).__init__(parsed)
 | 
| 
 | 
   346 
 | 
| 
 | 
   347     def get_item(self, n, d=None):
 | 
| 
 | 
   348         rv = super(SaveCaptureGroup, self).get_item(n, d)
 | 
| 
 | 
   349         if d is not None:
 | 
| 
 | 
   350             d[self.key] = rv
 | 
| 
 | 
   351         return rv
 | 
| 
 | 
   352 
 | 
| 
 | 
   353 
 | 
| 
 | 
   354 class ReadCaptureGroup(WrappedSequence):
 | 
| 
 | 
   355     def __init__(self, n):
 | 
| 
 | 
   356         self.num = n
 | 
| 
 | 
   357         self.length = 1
 | 
| 
 | 
   358 
 | 
| 
 | 
   359     def get_item(self, i, d=None):
 | 
| 
 | 
   360         if i != 0:
 | 
| 
 | 
   361             raise IndexError(i)
 | 
| 
 | 
   362         if d is None:
 | 
| 
 | 
   363             raise ValueError('ReadCaptureGroup with no dict')
 | 
| 
 | 
   364         return d.get(self.num, "fail")
 | 
| 
 | 
   365 
 | 
| 
 | 
   366 
 | 
| 
 | 
   367 class RegexMembershipSequence(WrappedSequence):
 | 
| 
 | 
   368     """Creates a sequence from the regex, knows how to test membership."""
 | 
| 
 | 
   369 
 | 
| 
 | 
   370     def empty_list(self, *_):
 | 
| 
 | 
   371         return []
 | 
| 
 | 
   372 
 | 
| 
 | 
   373     def nothing_added(self, *_):
 | 
| 
 | 
   374         return ['']
 | 
| 
 | 
   375 
 | 
| 
 | 
   376     def branch_values(self, _, items):
 | 
| 
 | 
   377         """Converts SRE parser data into literals and merges those lists."""
 | 
| 
 | 
   378         return ConcatenatedSequence(
 | 
| 
 | 
   379             *[self.sub_values(parsed) for parsed in items])
 | 
| 
 | 
   380 
 | 
| 
 | 
   381     def max_repeat_values(self, min_count, max_count, items):
 | 
| 
 | 
   382         """Sequential expansion of the count to be combinatorics."""
 | 
| 
 | 
   383         max_count = min(max_count, self.max_count)
 | 
| 
 | 
   384         return RepetitiveSequence(
 | 
| 
 | 
   385             self.sub_values(items), min_count, max_count)
 | 
| 
 | 
   386 
 | 
| 
 | 
   387     def in_values(self, items):
 | 
| 
 | 
   388         # Special case which distinguishes branch from charset operator
 | 
| 
 | 
   389         if items and items[0][0] == sre_constants.NEGATE:
 | 
| 
 | 
   390             items = self.branch_values(None, items[1:])
 | 
| 
 | 
   391             return [item for item in self.charset if item not in items]
 | 
| 
 | 
   392         return self.branch_values(None, items)
 | 
| 
 | 
   393 
 | 
| 
 | 
   394     def not_literal(self, y):
 | 
| 
 | 
   395         return self.in_values(((sre_constants.NEGATE,),
 | 
| 
 | 
   396                               (sre_constants.LITERAL, y),))
 | 
| 
 | 
   397 
 | 
| 
 | 
   398     def category(self, y):
 | 
| 
 | 
   399         return CATEGORIES[y]
 | 
| 
 | 
   400 
 | 
| 
 | 
   401     def groupref(self, n):
 | 
| 
 | 
   402         self.has_groupref = True
 | 
| 
 | 
   403         return ReadCaptureGroup(n)
 | 
| 
 | 
   404 
 | 
| 
 | 
   405     def get_item(self, i, d=None):
 | 
| 
 | 
   406         """Typically only pass i.  d is an internal detail, for consistency with other classes.
 | 
| 
 | 
   407 
 | 
| 
 | 
   408         If you care about the capture groups, you should use
 | 
| 
 | 
   409         RegexMembershipSequenceMatches instead, which returns a Match object
 | 
| 
 | 
   410         instead of a string."""
 | 
| 
 | 
   411         if self.has_groupref or d is not None:
 | 
| 
 | 
   412             if d is None:
 | 
| 
 | 
   413                 d = {}
 | 
| 
 | 
   414             return super(RegexMembershipSequence, self).get_item(i, d)
 | 
| 
 | 
   415         else:
 | 
| 
 | 
   416             return super(RegexMembershipSequence, self).get_item(i)
 | 
| 
 | 
   417 
 | 
| 
 | 
   418     def sub_values(self, parsed):
 | 
| 
 | 
   419         """This knows how to convert one piece of parsed pattern."""
 | 
| 
 | 
   420         # If this is a subpattern object, we just want its data
 | 
| 
 | 
   421         if isinstance(parsed, sre_parse.SubPattern):
 | 
| 
 | 
   422             parsed = parsed.data
 | 
| 
 | 
   423         # A list indicates sequential elements of a string
 | 
| 
 | 
   424         if isinstance(parsed, list):
 | 
| 
 | 
   425             elements = [self.sub_values(p) for p in parsed]
 | 
| 
 | 
   426             return CombinatoricsSequence(*elements)
 | 
| 
 | 
   427         # If not a list, a tuple represents a specific match type
 | 
| 
 | 
   428         if isinstance(parsed, tuple) and parsed:
 | 
| 
 | 
   429             matcher, arguments = parsed
 | 
| 
 | 
   430             if not isinstance(arguments, tuple):
 | 
| 
 | 
   431                 arguments = (arguments,)
 | 
| 
 | 
   432             if matcher in self.backends:
 | 
| 
 | 
   433                 self.check_anchor_state(matcher, arguments)
 | 
| 
 | 
   434                 return self.backends[matcher](*arguments)
 | 
| 
 | 
   435         # No idea what to do here
 | 
| 
 | 
   436         raise ParseError(repr(parsed))
 | 
| 
 | 
   437 
 | 
| 
 | 
   438     def maybe_save(self, *args):
 | 
| 
 | 
   439         # Python 3.6 has group, add_flags, del_flags, parsed
 | 
| 
 | 
   440         # while earlier versions just have group, parsed
 | 
| 
 | 
   441         group = args[0]
 | 
| 
 | 
   442         parsed = args[-1]
 | 
| 
 | 
   443         rv = self.sub_values(parsed)
 | 
| 
 | 
   444         if group is not None:
 | 
| 
 | 
   445             rv = SaveCaptureGroup(rv, group)
 | 
| 
 | 
   446         return rv
 | 
| 
 | 
   447 
 | 
| 
 | 
   448     def check_anchor_state(self, matcher, arguments):
 | 
| 
 | 
   449         # A bit of a hack to support zero-width leading anchors.  The goal is
 | 
| 
 | 
   450         # that /^(a|b)$/ will match properly, and that /a^b/ or /a\bb/ throws
 | 
| 
 | 
   451         # an error.  (It's unfortunate that I couldn't easily handle /$^/ which
 | 
| 
 | 
   452         # matches the empty string; I went for the common case.)
 | 
| 
 | 
   453         #
 | 
| 
 | 
   454         # There are three states, for example:
 | 
| 
 | 
   455         # / STATE_START
 | 
| 
 | 
   456         # | / STATE_START (^ causes no transition here, but is illegal at STATE_MIDDLE or STATE_END)
 | 
| 
 | 
   457         # | |  / STATE_START (\b causes no transition here, but advances MIDDLE to END)
 | 
| 
 | 
   458         # | |  | / (same as above for ^)
 | 
| 
 | 
   459         # | |  | | / STATE_MIDDLE (anything besides ^ and \b advances START to MIDDLE)
 | 
| 
 | 
   460         # | |  | | | / still STATE_MIDDLE
 | 
| 
 | 
   461         # . .  . . . .  / advances MIDDLE to END
 | 
| 
 | 
   462         #  ^ \b ^ X Y \b $
 | 
| 
 | 
   463         old_state = self.state
 | 
| 
 | 
   464         if self.state == STATE_START:
 | 
| 
 | 
   465             if matcher == sre_constants.AT:
 | 
| 
 | 
   466                 if arguments[0] in (sre_constants.AT_END, sre_constants.AT_END_STRING):
 | 
| 
 | 
   467                     self.state = STATE_END
 | 
| 
 | 
   468                 elif arguments[0] == sre_constants.AT_NON_BOUNDARY:
 | 
| 
 | 
   469                     # This is nonsensical at beginning of string
 | 
| 
 | 
   470                     raise ParseError('Anchor %r found at START state' % (arguments[0],))
 | 
| 
 | 
   471                 # All others (AT_BEGINNING, AT_BEGINNING_STRING, and AT_BOUNDARY) remain in START.
 | 
| 
 | 
   472             elif matcher != sre_constants.SUBPATTERN:
 | 
| 
 | 
   473                 self.state = STATE_MIDDLE
 | 
| 
 | 
   474             # subpattern remains in START
 | 
| 
 | 
   475         elif self.state == STATE_END:
 | 
| 
 | 
   476             if matcher == sre_constants.AT:
 | 
| 
 | 
   477                 if arguments[0] not in (
 | 
| 
 | 
   478                     sre_constants.AT_END, sre_constants.AT_END_STRING,
 | 
| 
 | 
   479                     sre_constants.AT_BOUNDARY):
 | 
| 
 | 
   480                     raise ParseError('Anchor %r found at END state' % (arguments[0],))
 | 
| 
 | 
   481                 # those three remain in END
 | 
| 
 | 
   482             elif matcher != sre_constants.SUBPATTERN:
 | 
| 
 | 
   483                 raise ParseError('Non-end-anchor %r found at END state' % (arguments[0],))
 | 
| 
 | 
   484             # subpattern remains in END
 | 
| 
 | 
   485         else:  # self.state == STATE_MIDDLE
 | 
| 
 | 
   486             if matcher == sre_constants.AT:
 | 
| 
 | 
   487                 if arguments[0] not in (
 | 
| 
 | 
   488                     sre_constants.AT_END, sre_constants.AT_END_STRING,
 | 
| 
 | 
   489                     sre_constants.AT_BOUNDARY):
 | 
| 
 | 
   490                     raise ParseError('Anchor %r found at MIDDLE state' % (arguments[0],))
 | 
| 
 | 
   491                 # All others (AT_END, AT_END_STRING, AT_BOUNDARY) advance to END.
 | 
| 
 | 
   492                 self.state = STATE_END
 | 
| 
 | 
   493 
 | 
| 
 | 
   494     def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None):
 | 
| 
 | 
   495         # If the RE module cannot compile it, we give up quickly
 | 
| 
 | 
   496         self.matcher = re.compile(r'(?:%s)\Z' % pattern, flags)
 | 
| 
 | 
   497         if not flags & re.DOTALL:
 | 
| 
 | 
   498             charset = ''.join(c for c in charset if c != '\n')
 | 
| 
 | 
   499         self.charset = charset
 | 
| 
 | 
   500 
 | 
| 
 | 
   501         self.named_group_lookup = self.matcher.groupindex
 | 
| 
 | 
   502 
 | 
| 
 | 
   503         flags |= DEFAULT_RE_FLAGS # https://github.com/google/sre_yield/issues/3
 | 
| 
 | 
   504         if flags & re.IGNORECASE:
 | 
| 
 | 
   505             raise ParseError('Flag "i" not supported. https://github.com/google/sre_yield/issues/4')
 | 
| 
 | 
   506         elif flags & re.UNICODE:
 | 
| 
 | 
   507             raise ParseError('Flag "u" not supported. https://github.com/google/sre_yield/issues/3')
 | 
| 
 | 
   508         elif flags & re.LOCALE:
 | 
| 
 | 
   509             raise ParseError('Flag "l" not supported. https://github.com/google/sre_yield/issues/5')
 | 
| 
 | 
   510 
 | 
| 
 | 
   511         if max_count is None:
 | 
| 
 | 
   512             self.max_count = MAX_REPEAT_COUNT
 | 
| 
 | 
   513         else:
 | 
| 
 | 
   514             self.max_count = max_count
 | 
| 
 | 
   515 
 | 
| 
 | 
   516         self.has_groupref = False
 | 
| 
 | 
   517 
 | 
| 
 | 
   518         # Configure the parser backends
 | 
| 
 | 
   519         self.backends = {
 | 
| 
 | 
   520             sre_constants.LITERAL: lambda y: [chr(y)],
 | 
| 
 | 
   521             sre_constants.RANGE: lambda l, h: [chr(c) for c in range(l, h+1)],
 | 
| 
 | 
   522             sre_constants.SUBPATTERN: self.maybe_save,
 | 
| 
 | 
   523             sre_constants.BRANCH: self.branch_values,
 | 
| 
 | 
   524             sre_constants.MIN_REPEAT: self.max_repeat_values,
 | 
| 
 | 
   525             sre_constants.MAX_REPEAT: self.max_repeat_values,
 | 
| 
 | 
   526             sre_constants.AT: self.nothing_added,
 | 
| 
 | 
   527             sre_constants.ASSERT: self.empty_list,
 | 
| 
 | 
   528             sre_constants.ASSERT_NOT: self.empty_list,
 | 
| 
 | 
   529             sre_constants.ANY:
 | 
| 
 | 
   530                 lambda _: self.in_values(((sre_constants.NEGATE,),)),
 | 
| 
 | 
   531             sre_constants.IN: self.in_values,
 | 
| 
 | 
   532             sre_constants.NOT_LITERAL: self.not_literal,
 | 
| 
 | 
   533             sre_constants.CATEGORY: self.category,
 | 
| 
 | 
   534             sre_constants.GROUPREF: self.groupref,
 | 
| 
 | 
   535         }
 | 
| 
 | 
   536         self.state = STATE_START
 | 
| 
 | 
   537         # Now build a generator that knows all possible patterns
 | 
| 
 | 
   538         self.raw = self.sub_values(sre_parse.parse(pattern, flags))
 | 
| 
 | 
   539         # Configure this class instance to know about that result
 | 
| 
 | 
   540         self.length = self.raw.__len__()
 | 
| 
 | 
   541 
 | 
| 
 | 
   542     def __contains__(self, item):
 | 
| 
 | 
   543         # Since we have a regex, we can search the list really cheaply
 | 
| 
 | 
   544         return self.matcher.match(item) is not None
 | 
| 
 | 
   545 
 | 
| 
 | 
   546 
 | 
| 
 | 
   547 class RegexMembershipSequenceMatches(RegexMembershipSequence):
 | 
| 
 | 
   548     def __getitem__(self, i):
 | 
| 
 | 
   549         if isinstance(i, slice):
 | 
| 
 | 
   550             result = SlicedSequence(self, slicer=i)
 | 
| 
 | 
   551             if result.__len__() < 16:
 | 
| 
 | 
   552                 # Short lists are unpacked
 | 
| 
 | 
   553                 result = [item for item in result]
 | 
| 
 | 
   554             return result
 | 
| 
 | 
   555 
 | 
| 
 | 
   556         d = {}
 | 
| 
 | 
   557         s = super(RegexMembershipSequenceMatches, self).get_item(i, d)
 | 
| 
 | 
   558         return Match(s, d, self.named_group_lookup)
 | 
| 
 | 
   559 
 | 
| 
 | 
   560 
 | 
| 
 | 
   561 def AllStrings(regex, flags=0, charset=CHARSET, max_count=None):
 | 
| 
 | 
   562     """Constructs an object that will generate all matching strings."""
 | 
| 
 | 
   563     return RegexMembershipSequence(regex, flags, charset, max_count=max_count)
 | 
| 
 | 
   564 
 | 
| 
 | 
   565 Values = AllStrings
 | 
| 
 | 
   566 
 | 
| 
 | 
   567 
 | 
| 
 | 
   568 class Match(object):
 | 
| 
 | 
   569     def __init__(self, string, groups, named_groups):
 | 
| 
 | 
   570         # TODO keep group(0) only, and spans for the rest.
 | 
| 
 | 
   571         self._string = string
 | 
| 
 | 
   572         self._groups = groups
 | 
| 
 | 
   573         self._named_groups = named_groups
 | 
| 
 | 
   574         self.lastindex = len(groups) + 1
 | 
| 
 | 
   575 
 | 
| 
 | 
   576     def group(self, n=0):
 | 
| 
 | 
   577         if n == 0:
 | 
| 
 | 
   578             return self._string
 | 
| 
 | 
   579         if not isinstance(n, int):
 | 
| 
 | 
   580             n = self._named_groups[n]
 | 
| 
 | 
   581         return self._groups[n]
 | 
| 
 | 
   582 
 | 
| 
 | 
   583     def groups(self):
 | 
| 
 | 
   584         return tuple(self._groups[i] for i in range(1, self.lastindex))
 | 
| 
 | 
   585 
 | 
| 
 | 
   586     def groupdict(self):
 | 
| 
 | 
   587         d = {}
 | 
| 
 | 
   588         for k, v in self._named_groups.items():
 | 
| 
 | 
   589             d[k] = self._groups[v]
 | 
| 
 | 
   590         return d
 | 
| 
 | 
   591 
 | 
| 
 | 
   592     def span(self, n=0):
 | 
| 
 | 
   593         raise NotImplementedError()
 | 
| 
 | 
   594 
 | 
| 
 | 
   595 
 | 
| 
 | 
   596 def AllMatches(regex, flags=0, charset=CHARSET, max_count=None):
 | 
| 
 | 
   597     """Constructs an object that will generate all matching strings."""
 | 
| 
 | 
   598     return RegexMembershipSequenceMatches(regex, flags, charset, max_count=max_count)
 | 
| 
 | 
   599 
 | 
| 
 | 
   600 
 | 
| 
 | 
   601 def main(argv=None):
 | 
| 
 | 
   602     """This module can be executed on the command line for testing."""
 | 
| 
 | 
   603     if argv is None:
 | 
| 
 | 
   604         argv = sys.argv
 | 
| 
 | 
   605     for arg in argv[1:]:
 | 
| 
 | 
   606         for i in AllStrings(arg):
 | 
| 
 | 
   607             print(i)
 | 
| 
 | 
   608 
 | 
| 
 | 
   609 
 | 
| 
 | 
   610 if __name__ == '__main__':
 | 
| 
 | 
   611     main()
 |