Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/boltons/cacheutils.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """``cacheutils`` contains consistent implementations of fundamental | |
3 cache types. Currently there are two to choose from: | |
4 | |
5 * :class:`LRI` - Least-recently inserted | |
6 * :class:`LRU` - Least-recently used | |
7 | |
8 Both caches are :class:`dict` subtypes, designed to be as | |
9 interchangeable as possible, to facilitate experimentation. A key | |
10 practice with performance enhancement with caching is ensuring that | |
11 the caching strategy is working. If the cache is constantly missing, | |
12 it is just adding more overhead and code complexity. The standard | |
13 statistics are: | |
14 | |
15 * ``hit_count`` - the number of times the queried key has been in | |
16 the cache | |
17 * ``miss_count`` - the number of times a key has been absent and/or | |
18 fetched by the cache | |
19 * ``soft_miss_count`` - the number of times a key has been absent, | |
20 but a default has been provided by the caller, as with | |
21 :meth:`dict.get` and :meth:`dict.setdefault`. Soft misses are a | |
22 subset of misses, so this number is always less than or equal to | |
23 ``miss_count``. | |
24 | |
25 Additionally, ``cacheutils`` provides :class:`ThresholdCounter`, a | |
26 cache-like bounded counter useful for online statistics collection. | |
27 | |
28 Learn more about `caching algorithms on Wikipedia | |
29 <https://en.wikipedia.org/wiki/Cache_algorithms#Examples>`_. | |
30 | |
31 """ | |
32 | |
33 # TODO: TimedLRI | |
34 # TODO: support 0 max_size? | |
35 | |
36 | |
37 import heapq | |
38 import weakref | |
39 import itertools | |
40 from operator import attrgetter | |
41 | |
42 try: | |
43 from threading import RLock | |
44 except Exception: | |
45 class RLock(object): | |
46 'Dummy reentrant lock for builds without threads' | |
47 def __enter__(self): | |
48 pass | |
49 | |
50 def __exit__(self, exctype, excinst, exctb): | |
51 pass | |
52 | |
53 try: | |
54 from boltons.typeutils import make_sentinel | |
55 _MISSING = make_sentinel(var_name='_MISSING') | |
56 _KWARG_MARK = make_sentinel(var_name='_KWARG_MARK') | |
57 except ImportError: | |
58 _MISSING = object() | |
59 _KWARG_MARK = object() | |
60 | |
61 try: | |
62 xrange | |
63 except NameError: | |
64 # py3 | |
65 xrange = range | |
66 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes) | |
67 | |
68 PREV, NEXT, KEY, VALUE = range(4) # names for the link fields | |
69 DEFAULT_MAX_SIZE = 128 | |
70 | |
71 | |
72 class LRI(dict): | |
73 """The ``LRI`` implements the basic *Least Recently Inserted* strategy to | |
74 caching. One could also think of this as a ``SizeLimitedDefaultDict``. | |
75 | |
76 *on_miss* is a callable that accepts the missing key (as opposed | |
77 to :class:`collections.defaultdict`'s "default_factory", which | |
78 accepts no arguments.) Also note that, like the :class:`LRI`, | |
79 the ``LRI`` is instrumented with statistics tracking. | |
80 | |
81 >>> cap_cache = LRI(max_size=2) | |
82 >>> cap_cache['a'], cap_cache['b'] = 'A', 'B' | |
83 >>> from pprint import pprint as pp | |
84 >>> pp(dict(cap_cache)) | |
85 {'a': 'A', 'b': 'B'} | |
86 >>> [cap_cache['b'] for i in range(3)][0] | |
87 'B' | |
88 >>> cap_cache['c'] = 'C' | |
89 >>> print(cap_cache.get('a')) | |
90 None | |
91 >>> cap_cache.hit_count, cap_cache.miss_count, cap_cache.soft_miss_count | |
92 (3, 1, 1) | |
93 """ | |
94 def __init__(self, max_size=DEFAULT_MAX_SIZE, values=None, | |
95 on_miss=None): | |
96 if max_size <= 0: | |
97 raise ValueError('expected max_size > 0, not %r' % max_size) | |
98 self.hit_count = self.miss_count = self.soft_miss_count = 0 | |
99 self.max_size = max_size | |
100 self._lock = RLock() | |
101 self._init_ll() | |
102 | |
103 if on_miss is not None and not callable(on_miss): | |
104 raise TypeError('expected on_miss to be a callable' | |
105 ' (or None), not %r' % on_miss) | |
106 self.on_miss = on_miss | |
107 | |
108 if values: | |
109 self.update(values) | |
110 | |
111 # TODO: fromkeys()? | |
112 | |
113 # linked list manipulation methods. | |
114 # | |
115 # invariants: | |
116 # 1) 'anchor' is the sentinel node in the doubly linked list. there is | |
117 # always only one, and its KEY and VALUE are both _MISSING. | |
118 # 2) the most recently accessed node comes immediately before 'anchor'. | |
119 # 3) the least recently accessed node comes immediately after 'anchor'. | |
120 def _init_ll(self): | |
121 anchor = [] | |
122 anchor[:] = [anchor, anchor, _MISSING, _MISSING] | |
123 # a link lookup table for finding linked list links in O(1) | |
124 # time. | |
125 self._link_lookup = {} | |
126 self._anchor = anchor | |
127 | |
128 def _print_ll(self): | |
129 print('***') | |
130 for (key, val) in self._get_flattened_ll(): | |
131 print(key, val) | |
132 print('***') | |
133 return | |
134 | |
135 def _get_flattened_ll(self): | |
136 flattened_list = [] | |
137 link = self._anchor | |
138 while True: | |
139 flattened_list.append((link[KEY], link[VALUE])) | |
140 link = link[NEXT] | |
141 if link is self._anchor: | |
142 break | |
143 return flattened_list | |
144 | |
145 def _get_link_and_move_to_front_of_ll(self, key): | |
146 # find what will become the newest link. this may raise a | |
147 # KeyError, which is useful to __getitem__ and __setitem__ | |
148 newest = self._link_lookup[key] | |
149 | |
150 # splice out what will become the newest link. | |
151 newest[PREV][NEXT] = newest[NEXT] | |
152 newest[NEXT][PREV] = newest[PREV] | |
153 | |
154 # move what will become the newest link immediately before | |
155 # anchor (invariant 2) | |
156 anchor = self._anchor | |
157 second_newest = anchor[PREV] | |
158 second_newest[NEXT] = anchor[PREV] = newest | |
159 newest[PREV] = second_newest | |
160 newest[NEXT] = anchor | |
161 return newest | |
162 | |
163 def _set_key_and_add_to_front_of_ll(self, key, value): | |
164 # create a new link and place it immediately before anchor | |
165 # (invariant 2). | |
166 anchor = self._anchor | |
167 second_newest = anchor[PREV] | |
168 newest = [second_newest, anchor, key, value] | |
169 second_newest[NEXT] = anchor[PREV] = newest | |
170 self._link_lookup[key] = newest | |
171 | |
172 def _set_key_and_evict_last_in_ll(self, key, value): | |
173 # the link after anchor is the oldest in the linked list | |
174 # (invariant 3). the current anchor becomes a link that holds | |
175 # the newest key, and the oldest link becomes the new anchor | |
176 # (invariant 1). now the newest link comes before anchor | |
177 # (invariant 2). no links are moved; only their keys | |
178 # and values are changed. | |
179 oldanchor = self._anchor | |
180 oldanchor[KEY] = key | |
181 oldanchor[VALUE] = value | |
182 | |
183 self._anchor = anchor = oldanchor[NEXT] | |
184 evicted = anchor[KEY] | |
185 anchor[KEY] = anchor[VALUE] = _MISSING | |
186 del self._link_lookup[evicted] | |
187 self._link_lookup[key] = oldanchor | |
188 return evicted | |
189 | |
190 def _remove_from_ll(self, key): | |
191 # splice a link out of the list and drop it from our lookup | |
192 # table. | |
193 link = self._link_lookup.pop(key) | |
194 link[PREV][NEXT] = link[NEXT] | |
195 link[NEXT][PREV] = link[PREV] | |
196 | |
197 def __setitem__(self, key, value): | |
198 with self._lock: | |
199 try: | |
200 link = self._get_link_and_move_to_front_of_ll(key) | |
201 except KeyError: | |
202 if len(self) < self.max_size: | |
203 self._set_key_and_add_to_front_of_ll(key, value) | |
204 else: | |
205 evicted = self._set_key_and_evict_last_in_ll(key, value) | |
206 super(LRI, self).__delitem__(evicted) | |
207 super(LRI, self).__setitem__(key, value) | |
208 else: | |
209 link[VALUE] = value | |
210 | |
211 def __getitem__(self, key): | |
212 with self._lock: | |
213 try: | |
214 link = self._link_lookup[key] | |
215 except KeyError: | |
216 self.miss_count += 1 | |
217 if not self.on_miss: | |
218 raise | |
219 ret = self[key] = self.on_miss(key) | |
220 return ret | |
221 | |
222 self.hit_count += 1 | |
223 return link[VALUE] | |
224 | |
225 def get(self, key, default=None): | |
226 try: | |
227 return self[key] | |
228 except KeyError: | |
229 self.soft_miss_count += 1 | |
230 return default | |
231 | |
232 def __delitem__(self, key): | |
233 with self._lock: | |
234 super(LRI, self).__delitem__(key) | |
235 self._remove_from_ll(key) | |
236 | |
237 def pop(self, key, default=_MISSING): | |
238 # NB: hit/miss counts are bypassed for pop() | |
239 with self._lock: | |
240 try: | |
241 ret = super(LRI, self).pop(key) | |
242 except KeyError: | |
243 if default is _MISSING: | |
244 raise | |
245 ret = default | |
246 else: | |
247 self._remove_from_ll(key) | |
248 return ret | |
249 | |
250 def popitem(self): | |
251 with self._lock: | |
252 item = super(LRI, self).popitem() | |
253 self._remove_from_ll(item[0]) | |
254 return item | |
255 | |
256 def clear(self): | |
257 with self._lock: | |
258 super(LRI, self).clear() | |
259 self._init_ll() | |
260 | |
261 def copy(self): | |
262 return self.__class__(max_size=self.max_size, values=self) | |
263 | |
264 def setdefault(self, key, default=None): | |
265 with self._lock: | |
266 try: | |
267 return self[key] | |
268 except KeyError: | |
269 self.soft_miss_count += 1 | |
270 self[key] = default | |
271 return default | |
272 | |
273 def update(self, E, **F): | |
274 # E and F are throwback names to the dict() __doc__ | |
275 with self._lock: | |
276 if E is self: | |
277 return | |
278 setitem = self.__setitem__ | |
279 if callable(getattr(E, 'keys', None)): | |
280 for k in E.keys(): | |
281 setitem(k, E[k]) | |
282 else: | |
283 for k, v in E: | |
284 setitem(k, v) | |
285 for k in F: | |
286 setitem(k, F[k]) | |
287 return | |
288 | |
289 def __eq__(self, other): | |
290 with self._lock: | |
291 if self is other: | |
292 return True | |
293 if len(other) != len(self): | |
294 return False | |
295 if not isinstance(other, LRI): | |
296 return other == self | |
297 return super(LRI, self).__eq__(other) | |
298 | |
299 def __ne__(self, other): | |
300 return not (self == other) | |
301 | |
302 def __repr__(self): | |
303 cn = self.__class__.__name__ | |
304 val_map = super(LRI, self).__repr__() | |
305 return ('%s(max_size=%r, on_miss=%r, values=%s)' | |
306 % (cn, self.max_size, self.on_miss, val_map)) | |
307 | |
308 | |
309 class LRU(LRI): | |
310 """The ``LRU`` is :class:`dict` subtype implementation of the | |
311 *Least-Recently Used* caching strategy. | |
312 | |
313 Args: | |
314 max_size (int): Max number of items to cache. Defaults to ``128``. | |
315 values (iterable): Initial values for the cache. Defaults to ``None``. | |
316 on_miss (callable): a callable which accepts a single argument, the | |
317 key not present in the cache, and returns the value to be cached. | |
318 | |
319 >>> cap_cache = LRU(max_size=2) | |
320 >>> cap_cache['a'], cap_cache['b'] = 'A', 'B' | |
321 >>> from pprint import pprint as pp | |
322 >>> pp(dict(cap_cache)) | |
323 {'a': 'A', 'b': 'B'} | |
324 >>> [cap_cache['b'] for i in range(3)][0] | |
325 'B' | |
326 >>> cap_cache['c'] = 'C' | |
327 >>> print(cap_cache.get('a')) | |
328 None | |
329 | |
330 This cache is also instrumented with statistics | |
331 collection. ``hit_count``, ``miss_count``, and ``soft_miss_count`` | |
332 are all integer members that can be used to introspect the | |
333 performance of the cache. ("Soft" misses are misses that did not | |
334 raise :exc:`KeyError`, e.g., ``LRU.get()`` or ``on_miss`` was used to | |
335 cache a default. | |
336 | |
337 >>> cap_cache.hit_count, cap_cache.miss_count, cap_cache.soft_miss_count | |
338 (3, 1, 1) | |
339 | |
340 Other than the size-limiting caching behavior and statistics, | |
341 ``LRU`` acts like its parent class, the built-in Python :class:`dict`. | |
342 """ | |
343 def __getitem__(self, key): | |
344 with self._lock: | |
345 try: | |
346 link = self._get_link_and_move_to_front_of_ll(key) | |
347 except KeyError: | |
348 self.miss_count += 1 | |
349 if not self.on_miss: | |
350 raise | |
351 ret = self[key] = self.on_miss(key) | |
352 return ret | |
353 | |
354 self.hit_count += 1 | |
355 return link[VALUE] | |
356 | |
357 | |
358 ### Cached decorator | |
359 # Key-making technique adapted from Python 3.4's functools | |
360 | |
361 class _HashedKey(list): | |
362 """The _HashedKey guarantees that hash() will be called no more than once | |
363 per cached function invocation. | |
364 """ | |
365 __slots__ = 'hash_value' | |
366 | |
367 def __init__(self, key): | |
368 self[:] = key | |
369 self.hash_value = hash(tuple(key)) | |
370 | |
371 def __hash__(self): | |
372 return self.hash_value | |
373 | |
374 def __repr__(self): | |
375 return '%s(%s)' % (self.__class__.__name__, list.__repr__(self)) | |
376 | |
377 | |
378 def make_cache_key(args, kwargs, typed=False, | |
379 kwarg_mark=_KWARG_MARK, | |
380 fasttypes=frozenset([int, str, frozenset, type(None)])): | |
381 """Make a generic key from a function's positional and keyword | |
382 arguments, suitable for use in caches. Arguments within *args* and | |
383 *kwargs* must be `hashable`_. If *typed* is ``True``, ``3`` and | |
384 ``3.0`` will be treated as separate keys. | |
385 | |
386 The key is constructed in a way that is flat as possible rather than | |
387 as a nested structure that would take more memory. | |
388 | |
389 If there is only a single argument and its data type is known to cache | |
390 its hash value, then that argument is returned without a wrapper. This | |
391 saves space and improves lookup speed. | |
392 | |
393 >>> tuple(make_cache_key(('a', 'b'), {'c': ('d')})) | |
394 ('a', 'b', _KWARG_MARK, ('c', 'd')) | |
395 | |
396 .. _hashable: https://docs.python.org/2/glossary.html#term-hashable | |
397 """ | |
398 | |
399 # key = [func_name] if func_name else [] | |
400 # key.extend(args) | |
401 key = list(args) | |
402 if kwargs: | |
403 sorted_items = sorted(kwargs.items()) | |
404 key.append(kwarg_mark) | |
405 key.extend(sorted_items) | |
406 if typed: | |
407 key.extend([type(v) for v in args]) | |
408 if kwargs: | |
409 key.extend([type(v) for k, v in sorted_items]) | |
410 elif len(key) == 1 and type(key[0]) in fasttypes: | |
411 return key[0] | |
412 return _HashedKey(key) | |
413 | |
414 # for backwards compatibility in case someone was importing it | |
415 _make_cache_key = make_cache_key | |
416 | |
417 | |
418 class CachedFunction(object): | |
419 """This type is used by :func:`cached`, below. Instances of this | |
420 class are used to wrap functions in caching logic. | |
421 """ | |
422 def __init__(self, func, cache, scoped=True, typed=False, key=None): | |
423 self.func = func | |
424 if callable(cache): | |
425 self.get_cache = cache | |
426 elif not (callable(getattr(cache, '__getitem__', None)) | |
427 and callable(getattr(cache, '__setitem__', None))): | |
428 raise TypeError('expected cache to be a dict-like object,' | |
429 ' or callable returning a dict-like object, not %r' | |
430 % cache) | |
431 else: | |
432 def _get_cache(): | |
433 return cache | |
434 self.get_cache = _get_cache | |
435 self.scoped = scoped | |
436 self.typed = typed | |
437 self.key_func = key or make_cache_key | |
438 | |
439 def __call__(self, *args, **kwargs): | |
440 cache = self.get_cache() | |
441 key = self.key_func(args, kwargs, typed=self.typed) | |
442 try: | |
443 ret = cache[key] | |
444 except KeyError: | |
445 ret = cache[key] = self.func(*args, **kwargs) | |
446 return ret | |
447 | |
448 def __repr__(self): | |
449 cn = self.__class__.__name__ | |
450 if self.typed or not self.scoped: | |
451 return ("%s(func=%r, scoped=%r, typed=%r)" | |
452 % (cn, self.func, self.scoped, self.typed)) | |
453 return "%s(func=%r)" % (cn, self.func) | |
454 | |
455 | |
456 class CachedMethod(object): | |
457 """Similar to :class:`CachedFunction`, this type is used by | |
458 :func:`cachedmethod` to wrap methods in caching logic. | |
459 """ | |
460 def __init__(self, func, cache, scoped=True, typed=False, key=None): | |
461 self.func = func | |
462 self.__isabstractmethod__ = getattr(func, '__isabstractmethod__', False) | |
463 if isinstance(cache, basestring): | |
464 self.get_cache = attrgetter(cache) | |
465 elif callable(cache): | |
466 self.get_cache = cache | |
467 elif not (callable(getattr(cache, '__getitem__', None)) | |
468 and callable(getattr(cache, '__setitem__', None))): | |
469 raise TypeError('expected cache to be an attribute name,' | |
470 ' dict-like object, or callable returning' | |
471 ' a dict-like object, not %r' % cache) | |
472 else: | |
473 def _get_cache(obj): | |
474 return cache | |
475 self.get_cache = _get_cache | |
476 self.scoped = scoped | |
477 self.typed = typed | |
478 self.key_func = key or make_cache_key | |
479 self.bound_to = None | |
480 | |
481 def __get__(self, obj, objtype=None): | |
482 if obj is None: | |
483 return self | |
484 cls = self.__class__ | |
485 ret = cls(self.func, self.get_cache, typed=self.typed, | |
486 scoped=self.scoped, key=self.key_func) | |
487 ret.bound_to = obj | |
488 return ret | |
489 | |
490 def __call__(self, *args, **kwargs): | |
491 obj = args[0] if self.bound_to is None else self.bound_to | |
492 cache = self.get_cache(obj) | |
493 key_args = (self.bound_to, self.func) + args if self.scoped else args | |
494 key = self.key_func(key_args, kwargs, typed=self.typed) | |
495 try: | |
496 ret = cache[key] | |
497 except KeyError: | |
498 if self.bound_to is not None: | |
499 args = (self.bound_to,) + args | |
500 ret = cache[key] = self.func(*args, **kwargs) | |
501 return ret | |
502 | |
503 def __repr__(self): | |
504 cn = self.__class__.__name__ | |
505 args = (cn, self.func, self.scoped, self.typed) | |
506 if self.bound_to is not None: | |
507 args += (self.bound_to,) | |
508 return ('<%s func=%r scoped=%r typed=%r bound_to=%r>' % args) | |
509 return ("%s(func=%r, scoped=%r, typed=%r)" % args) | |
510 | |
511 | |
512 def cached(cache, scoped=True, typed=False, key=None): | |
513 """Cache any function with the cache object of your choosing. Note | |
514 that the function wrapped should take only `hashable`_ arguments. | |
515 | |
516 Args: | |
517 cache (Mapping): Any :class:`dict`-like object suitable for | |
518 use as a cache. Instances of the :class:`LRU` and | |
519 :class:`LRI` are good choices, but a plain :class:`dict` | |
520 can work in some cases, as well. This argument can also be | |
521 a callable which accepts no arguments and returns a mapping. | |
522 scoped (bool): Whether the function itself is part of the | |
523 cache key. ``True`` by default, different functions will | |
524 not read one another's cache entries, but can evict one | |
525 another's results. ``False`` can be useful for certain | |
526 shared cache use cases. More advanced behavior can be | |
527 produced through the *key* argument. | |
528 typed (bool): Whether to factor argument types into the cache | |
529 check. Default ``False``, setting to ``True`` causes the | |
530 cache keys for ``3`` and ``3.0`` to be considered unequal. | |
531 | |
532 >>> my_cache = LRU() | |
533 >>> @cached(my_cache) | |
534 ... def cached_lower(x): | |
535 ... return x.lower() | |
536 ... | |
537 >>> cached_lower("CaChInG's FuN AgAiN!") | |
538 "caching's fun again!" | |
539 >>> len(my_cache) | |
540 1 | |
541 | |
542 .. _hashable: https://docs.python.org/2/glossary.html#term-hashable | |
543 | |
544 """ | |
545 def cached_func_decorator(func): | |
546 return CachedFunction(func, cache, scoped=scoped, typed=typed, key=key) | |
547 return cached_func_decorator | |
548 | |
549 | |
550 def cachedmethod(cache, scoped=True, typed=False, key=None): | |
551 """Similar to :func:`cached`, ``cachedmethod`` is used to cache | |
552 methods based on their arguments, using any :class:`dict`-like | |
553 *cache* object. | |
554 | |
555 Args: | |
556 cache (str/Mapping/callable): Can be the name of an attribute | |
557 on the instance, any Mapping/:class:`dict`-like object, or | |
558 a callable which returns a Mapping. | |
559 scoped (bool): Whether the method itself and the object it is | |
560 bound to are part of the cache keys. ``True`` by default, | |
561 different methods will not read one another's cache | |
562 results. ``False`` can be useful for certain shared cache | |
563 use cases. More advanced behavior can be produced through | |
564 the *key* arguments. | |
565 typed (bool): Whether to factor argument types into the cache | |
566 check. Default ``False``, setting to ``True`` causes the | |
567 cache keys for ``3`` and ``3.0`` to be considered unequal. | |
568 key (callable): A callable with a signature that matches | |
569 :func:`make_cache_key` that returns a tuple of hashable | |
570 values to be used as the key in the cache. | |
571 | |
572 >>> class Lowerer(object): | |
573 ... def __init__(self): | |
574 ... self.cache = LRI() | |
575 ... | |
576 ... @cachedmethod('cache') | |
577 ... def lower(self, text): | |
578 ... return text.lower() | |
579 ... | |
580 >>> lowerer = Lowerer() | |
581 >>> lowerer.lower('WOW WHO COULD GUESS CACHING COULD BE SO NEAT') | |
582 'wow who could guess caching could be so neat' | |
583 >>> len(lowerer.cache) | |
584 1 | |
585 | |
586 """ | |
587 def cached_method_decorator(func): | |
588 return CachedMethod(func, cache, scoped=scoped, typed=typed, key=key) | |
589 return cached_method_decorator | |
590 | |
591 | |
592 class cachedproperty(object): | |
593 """The ``cachedproperty`` is used similar to :class:`property`, except | |
594 that the wrapped method is only called once. This is commonly used | |
595 to implement lazy attributes. | |
596 | |
597 After the property has been accessed, the value is stored on the | |
598 instance itself, using the same name as the cachedproperty. This | |
599 allows the cache to be cleared with :func:`delattr`, or through | |
600 manipulating the object's ``__dict__``. | |
601 """ | |
602 def __init__(self, func): | |
603 self.__doc__ = getattr(func, '__doc__') | |
604 self.__isabstractmethod__ = getattr(func, '__isabstractmethod__', False) | |
605 self.func = func | |
606 | |
607 def __get__(self, obj, objtype=None): | |
608 if obj is None: | |
609 return self | |
610 value = obj.__dict__[self.func.__name__] = self.func(obj) | |
611 return value | |
612 | |
613 def __repr__(self): | |
614 cn = self.__class__.__name__ | |
615 return '<%s func=%s>' % (cn, self.func) | |
616 | |
617 | |
618 class ThresholdCounter(object): | |
619 """A **bounded** dict-like Mapping from keys to counts. The | |
620 ThresholdCounter automatically compacts after every (1 / | |
621 *threshold*) additions, maintaining exact counts for any keys | |
622 whose count represents at least a *threshold* ratio of the total | |
623 data. In other words, if a particular key is not present in the | |
624 ThresholdCounter, its count represents less than *threshold* of | |
625 the total data. | |
626 | |
627 >>> tc = ThresholdCounter(threshold=0.1) | |
628 >>> tc.add(1) | |
629 >>> tc.items() | |
630 [(1, 1)] | |
631 >>> tc.update([2] * 10) | |
632 >>> tc.get(1) | |
633 0 | |
634 >>> tc.add(5) | |
635 >>> 5 in tc | |
636 True | |
637 >>> len(list(tc.elements())) | |
638 11 | |
639 | |
640 As you can see above, the API is kept similar to | |
641 :class:`collections.Counter`. The most notable feature omissions | |
642 being that counted items cannot be set directly, uncounted, or | |
643 removed, as this would disrupt the math. | |
644 | |
645 Use the ThresholdCounter when you need best-effort long-lived | |
646 counts for dynamically-keyed data. Without a bounded datastructure | |
647 such as this one, the dynamic keys often represent a memory leak | |
648 and can impact application reliability. The ThresholdCounter's | |
649 item replacement strategy is fully deterministic and can be | |
650 thought of as *Amortized Least Relevant*. The absolute upper bound | |
651 of keys it will store is *(2/threshold)*, but realistically | |
652 *(1/threshold)* is expected for uniformly random datastreams, and | |
653 one or two orders of magnitude better for real-world data. | |
654 | |
655 This algorithm is an implementation of the Lossy Counting | |
656 algorithm described in "Approximate Frequency Counts over Data | |
657 Streams" by Manku & Motwani. Hat tip to Kurt Rose for discovery | |
658 and initial implementation. | |
659 | |
660 """ | |
661 # TODO: hit_count/miss_count? | |
662 def __init__(self, threshold=0.001): | |
663 if not 0 < threshold < 1: | |
664 raise ValueError('expected threshold between 0 and 1, not: %r' | |
665 % threshold) | |
666 | |
667 self.total = 0 | |
668 self._count_map = {} | |
669 self._threshold = threshold | |
670 self._thresh_count = int(1 / threshold) | |
671 self._cur_bucket = 1 | |
672 | |
673 @property | |
674 def threshold(self): | |
675 return self._threshold | |
676 | |
677 def add(self, key): | |
678 """Increment the count of *key* by 1, automatically adding it if it | |
679 does not exist. | |
680 | |
681 Cache compaction is triggered every *1/threshold* additions. | |
682 """ | |
683 self.total += 1 | |
684 try: | |
685 self._count_map[key][0] += 1 | |
686 except KeyError: | |
687 self._count_map[key] = [1, self._cur_bucket - 1] | |
688 | |
689 if self.total % self._thresh_count == 0: | |
690 self._count_map = dict([(k, v) for k, v in self._count_map.items() | |
691 if sum(v) > self._cur_bucket]) | |
692 self._cur_bucket += 1 | |
693 return | |
694 | |
695 def elements(self): | |
696 """Return an iterator of all the common elements tracked by the | |
697 counter. Yields each key as many times as it has been seen. | |
698 """ | |
699 repeaters = itertools.starmap(itertools.repeat, self.iteritems()) | |
700 return itertools.chain.from_iterable(repeaters) | |
701 | |
702 def most_common(self, n=None): | |
703 """Get the top *n* keys and counts as tuples. If *n* is omitted, | |
704 returns all the pairs. | |
705 """ | |
706 if n <= 0: | |
707 return [] | |
708 ret = sorted(self.iteritems(), key=lambda x: x[1], reverse=True) | |
709 if n is None or n >= len(ret): | |
710 return ret | |
711 return ret[:n] | |
712 | |
713 def get_common_count(self): | |
714 """Get the sum of counts for keys exceeding the configured data | |
715 threshold. | |
716 """ | |
717 return sum([count for count, _ in self._count_map.values()]) | |
718 | |
719 def get_uncommon_count(self): | |
720 """Get the sum of counts for keys that were culled because the | |
721 associated counts represented less than the configured | |
722 threshold. The long-tail counts. | |
723 """ | |
724 return self.total - self.get_common_count() | |
725 | |
726 def get_commonality(self): | |
727 """Get a float representation of the effective count accuracy. The | |
728 higher the number, the less uniform the keys being added, and | |
729 the higher accuracy and efficiency of the ThresholdCounter. | |
730 | |
731 If a stronger measure of data cardinality is required, | |
732 consider using hyperloglog. | |
733 """ | |
734 return float(self.get_common_count()) / self.total | |
735 | |
736 def __getitem__(self, key): | |
737 return self._count_map[key][0] | |
738 | |
739 def __len__(self): | |
740 return len(self._count_map) | |
741 | |
742 def __contains__(self, key): | |
743 return key in self._count_map | |
744 | |
745 def iterkeys(self): | |
746 return iter(self._count_map) | |
747 | |
748 def keys(self): | |
749 return list(self.iterkeys()) | |
750 | |
751 def itervalues(self): | |
752 count_map = self._count_map | |
753 for k in count_map: | |
754 yield count_map[k][0] | |
755 | |
756 def values(self): | |
757 return list(self.itervalues()) | |
758 | |
759 def iteritems(self): | |
760 count_map = self._count_map | |
761 for k in count_map: | |
762 yield (k, count_map[k][0]) | |
763 | |
764 def items(self): | |
765 return list(self.iteritems()) | |
766 | |
767 def get(self, key, default=0): | |
768 "Get count for *key*, defaulting to 0." | |
769 try: | |
770 return self[key] | |
771 except KeyError: | |
772 return default | |
773 | |
774 def update(self, iterable, **kwargs): | |
775 """Like dict.update() but add counts instead of replacing them, used | |
776 to add multiple items in one call. | |
777 | |
778 Source can be an iterable of keys to add, or a mapping of keys | |
779 to integer counts. | |
780 """ | |
781 if iterable is not None: | |
782 if callable(getattr(iterable, 'iteritems', None)): | |
783 for key, count in iterable.iteritems(): | |
784 for i in xrange(count): | |
785 self.add(key) | |
786 else: | |
787 for key in iterable: | |
788 self.add(key) | |
789 if kwargs: | |
790 self.update(kwargs) | |
791 | |
792 | |
793 class MinIDMap(object): | |
794 """ | |
795 Assigns arbitrary weakref-able objects the smallest possible unique | |
796 integer IDs, such that no two objects have the same ID at the same | |
797 time. | |
798 | |
799 Maps arbitrary hashable objects to IDs. | |
800 | |
801 Based on https://gist.github.com/kurtbrose/25b48114de216a5e55df | |
802 """ | |
803 def __init__(self): | |
804 self.mapping = weakref.WeakKeyDictionary() | |
805 self.ref_map = {} | |
806 self.free = [] | |
807 | |
808 def get(self, a): | |
809 try: | |
810 return self.mapping[a][0] # if object is mapped, return ID | |
811 except KeyError: | |
812 pass | |
813 | |
814 if self.free: # if there are any free IDs, use the smallest | |
815 nxt = heapq.heappop(self.free) | |
816 else: # if there are no free numbers, use the next highest ID | |
817 nxt = len(self.mapping) | |
818 ref = weakref.ref(a, self._clean) | |
819 self.mapping[a] = (nxt, ref) | |
820 self.ref_map[ref] = nxt | |
821 return nxt | |
822 | |
823 def drop(self, a): | |
824 freed, ref = self.mapping[a] | |
825 del self.mapping[a] | |
826 del self.ref_map[ref] | |
827 heapq.heappush(self.free, freed) | |
828 | |
829 def _clean(self, ref): | |
830 print(self.ref_map[ref]) | |
831 heapq.heappush(self.free, self.ref_map[ref]) | |
832 del self.ref_map[ref] | |
833 | |
834 def __contains__(self, a): | |
835 return a in self.mapping | |
836 | |
837 def __iter__(self): | |
838 return iter(self.mapping) | |
839 | |
840 def __len__(self): | |
841 return self.mapping.__len__() | |
842 | |
843 def iteritems(self): | |
844 return iter((k, self.mapping[k][0]) for k in iter(self.mapping)) | |
845 | |
846 | |
847 # end cacheutils.py |