comparison env/lib/python3.7/site-packages/boltons/iterutils.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # -*- coding: utf-8 -*-
2 """:mod:`itertools` is full of great examples of Python generator
3 usage. However, there are still some critical gaps. ``iterutils``
4 fills many of those gaps with featureful, tested, and Pythonic
5 solutions.
6
7 Many of the functions below have two versions, one which
8 returns an iterator (denoted by the ``*_iter`` naming pattern), and a
9 shorter-named convenience form that returns a list. Some of the
10 following are based on examples in itertools docs.
11 """
12
13 import os
14 import math
15 import time
16 import codecs
17 import random
18 import socket
19 import hashlib
20 import itertools
21
22 try:
23 from collections.abc import Mapping, Sequence, Set, ItemsView, Iterable
24 except ImportError:
25 from collections import Mapping, Sequence, Set, ItemsView, Iterable
26
27
28 try:
29 from typeutils import make_sentinel
30 _UNSET = make_sentinel('_UNSET')
31 _REMAP_EXIT = make_sentinel('_REMAP_EXIT')
32 except ImportError:
33 _REMAP_EXIT = object()
34 _UNSET = object()
35
36 try:
37 from future_builtins import filter
38 from itertools import izip
39 _IS_PY3 = False
40 except ImportError:
41 # Python 3 compat
42 _IS_PY3 = True
43 basestring = (str, bytes)
44 unicode = str
45 izip, xrange = zip, range
46
47
48 def is_iterable(obj):
49 """Similar in nature to :func:`callable`, ``is_iterable`` returns
50 ``True`` if an object is `iterable`_, ``False`` if not.
51
52 >>> is_iterable([])
53 True
54 >>> is_iterable(object())
55 False
56
57 .. _iterable: https://docs.python.org/2/glossary.html#term-iterable
58 """
59 try:
60 iter(obj)
61 except TypeError:
62 return False
63 return True
64
65
66 def is_scalar(obj):
67 """A near-mirror of :func:`is_iterable`. Returns ``False`` if an
68 object is an iterable container type. Strings are considered
69 scalar as well, because strings are more often treated as whole
70 values as opposed to iterables of 1-character substrings.
71
72 >>> is_scalar(object())
73 True
74 >>> is_scalar(range(10))
75 False
76 >>> is_scalar('hello')
77 True
78 """
79 return not is_iterable(obj) or isinstance(obj, basestring)
80
81
82 def is_collection(obj):
83 """The opposite of :func:`is_scalar`. Returns ``True`` if an object
84 is an iterable other than a string.
85
86 >>> is_collection(object())
87 False
88 >>> is_collection(range(10))
89 True
90 >>> is_collection('hello')
91 False
92 """
93 return is_iterable(obj) and not isinstance(obj, basestring)
94
95
96 def split(src, sep=None, maxsplit=None):
97 """Splits an iterable based on a separator. Like :meth:`str.split`,
98 but for all iterables. Returns a list of lists.
99
100 >>> split(['hi', 'hello', None, None, 'sup', None, 'soap', None])
101 [['hi', 'hello'], ['sup'], ['soap']]
102
103 See :func:`split_iter` docs for more info.
104 """
105 return list(split_iter(src, sep, maxsplit))
106
107
108 def split_iter(src, sep=None, maxsplit=None):
109 """Splits an iterable based on a separator, *sep*, a max of
110 *maxsplit* times (no max by default). *sep* can be:
111
112 * a single value
113 * an iterable of separators
114 * a single-argument callable that returns True when a separator is
115 encountered
116
117 ``split_iter()`` yields lists of non-separator values. A separator will
118 never appear in the output.
119
120 >>> list(split_iter(['hi', 'hello', None, None, 'sup', None, 'soap', None]))
121 [['hi', 'hello'], ['sup'], ['soap']]
122
123 Note that ``split_iter`` is based on :func:`str.split`, so if
124 *sep* is ``None``, ``split()`` **groups** separators. If empty lists
125 are desired between two contiguous ``None`` values, simply use
126 ``sep=[None]``:
127
128 >>> list(split_iter(['hi', 'hello', None, None, 'sup', None]))
129 [['hi', 'hello'], ['sup']]
130 >>> list(split_iter(['hi', 'hello', None, None, 'sup', None], sep=[None]))
131 [['hi', 'hello'], [], ['sup'], []]
132
133 Using a callable separator:
134
135 >>> falsy_sep = lambda x: not x
136 >>> list(split_iter(['hi', 'hello', None, '', 'sup', False], falsy_sep))
137 [['hi', 'hello'], [], ['sup'], []]
138
139 See :func:`split` for a list-returning version.
140
141 """
142 if not is_iterable(src):
143 raise TypeError('expected an iterable')
144
145 if maxsplit is not None:
146 maxsplit = int(maxsplit)
147 if maxsplit == 0:
148 yield [src]
149 return
150
151 if callable(sep):
152 sep_func = sep
153 elif not is_scalar(sep):
154 sep = frozenset(sep)
155 sep_func = lambda x: x in sep
156 else:
157 sep_func = lambda x: x == sep
158
159 cur_group = []
160 split_count = 0
161 for s in src:
162 if maxsplit is not None and split_count >= maxsplit:
163 sep_func = lambda x: False
164 if sep_func(s):
165 if sep is None and not cur_group:
166 # If sep is none, str.split() "groups" separators
167 # check the str.split() docs for more info
168 continue
169 split_count += 1
170 yield cur_group
171 cur_group = []
172 else:
173 cur_group.append(s)
174
175 if cur_group or sep is not None:
176 yield cur_group
177 return
178
179
180 def chunked(src, size, count=None, **kw):
181 """Returns a list of *count* chunks, each with *size* elements,
182 generated from iterable *src*. If *src* is not evenly divisible by
183 *size*, the final chunk will have fewer than *size* elements.
184 Provide the *fill* keyword argument to provide a pad value and
185 enable padding, otherwise no padding will take place.
186
187 >>> chunked(range(10), 3)
188 [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
189 >>> chunked(range(10), 3, fill=None)
190 [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
191 >>> chunked(range(10), 3, count=2)
192 [[0, 1, 2], [3, 4, 5]]
193
194 See :func:`chunked_iter` for more info.
195 """
196 chunk_iter = chunked_iter(src, size, **kw)
197 if count is None:
198 return list(chunk_iter)
199 else:
200 return list(itertools.islice(chunk_iter, count))
201
202
203 def chunked_iter(src, size, **kw):
204 """Generates *size*-sized chunks from *src* iterable. Unless the
205 optional *fill* keyword argument is provided, iterables not evenly
206 divisible by *size* will have a final chunk that is smaller than
207 *size*.
208
209 >>> list(chunked_iter(range(10), 3))
210 [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
211 >>> list(chunked_iter(range(10), 3, fill=None))
212 [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
213
214 Note that ``fill=None`` in fact uses ``None`` as the fill value.
215 """
216 # TODO: add count kwarg?
217 if not is_iterable(src):
218 raise TypeError('expected an iterable')
219 size = int(size)
220 if size <= 0:
221 raise ValueError('expected a positive integer chunk size')
222 do_fill = True
223 try:
224 fill_val = kw.pop('fill')
225 except KeyError:
226 do_fill = False
227 fill_val = None
228 if kw:
229 raise ValueError('got unexpected keyword arguments: %r' % kw.keys())
230 if not src:
231 return
232 postprocess = lambda chk: chk
233 if isinstance(src, basestring):
234 postprocess = lambda chk, _sep=type(src)(): _sep.join(chk)
235 if _IS_PY3 and isinstance(src, bytes):
236 postprocess = lambda chk: bytes(chk)
237 src_iter = iter(src)
238 while True:
239 cur_chunk = list(itertools.islice(src_iter, size))
240 if not cur_chunk:
241 break
242 lc = len(cur_chunk)
243 if lc < size and do_fill:
244 cur_chunk[lc:] = [fill_val] * (size - lc)
245 yield postprocess(cur_chunk)
246 return
247
248
249 def pairwise(src):
250 """Convenience function for calling :func:`windowed` on *src*, with
251 *size* set to 2.
252
253 >>> pairwise(range(5))
254 [(0, 1), (1, 2), (2, 3), (3, 4)]
255 >>> pairwise([])
256 []
257
258 The number of pairs is always one less than the number of elements
259 in the iterable passed in, except on empty inputs, which returns
260 an empty list.
261 """
262 return windowed(src, 2)
263
264
265 def pairwise_iter(src):
266 """Convenience function for calling :func:`windowed_iter` on *src*,
267 with *size* set to 2.
268
269 >>> list(pairwise_iter(range(5)))
270 [(0, 1), (1, 2), (2, 3), (3, 4)]
271 >>> list(pairwise_iter([]))
272 []
273
274 The number of pairs is always one less than the number of elements
275 in the iterable passed in, or zero, when *src* is empty.
276
277 """
278 return windowed_iter(src, 2)
279
280
281 def windowed(src, size):
282 """Returns tuples with exactly length *size*. If the iterable is
283 too short to make a window of length *size*, no tuples are
284 returned. See :func:`windowed_iter` for more.
285 """
286 return list(windowed_iter(src, size))
287
288
289 def windowed_iter(src, size):
290 """Returns tuples with length *size* which represent a sliding
291 window over iterable *src*.
292
293 >>> list(windowed_iter(range(7), 3))
294 [(0, 1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)]
295
296 If the iterable is too short to make a window of length *size*,
297 then no window tuples are returned.
298
299 >>> list(windowed_iter(range(3), 5))
300 []
301 """
302 # TODO: lists? (for consistency)
303 tees = itertools.tee(src, size)
304 try:
305 for i, t in enumerate(tees):
306 for _ in xrange(i):
307 next(t)
308 except StopIteration:
309 return izip([])
310 return izip(*tees)
311
312
313 def xfrange(stop, start=None, step=1.0):
314 """Same as :func:`frange`, but generator-based instead of returning a
315 list.
316
317 >>> tuple(xfrange(1, 3, step=0.75))
318 (1.0, 1.75, 2.5)
319
320 See :func:`frange` for more details.
321 """
322 if not step:
323 raise ValueError('step must be non-zero')
324 if start is None:
325 start, stop = 0.0, stop * 1.0
326 else:
327 # swap when all args are used
328 stop, start = start * 1.0, stop * 1.0
329 cur = start
330 while cur < stop:
331 yield cur
332 cur += step
333
334
335 def frange(stop, start=None, step=1.0):
336 """A :func:`range` clone for float-based ranges.
337
338 >>> frange(5)
339 [0.0, 1.0, 2.0, 3.0, 4.0]
340 >>> frange(6, step=1.25)
341 [0.0, 1.25, 2.5, 3.75, 5.0]
342 >>> frange(100.5, 101.5, 0.25)
343 [100.5, 100.75, 101.0, 101.25]
344 >>> frange(5, 0)
345 []
346 >>> frange(5, 0, step=-1.25)
347 [5.0, 3.75, 2.5, 1.25]
348 """
349 if not step:
350 raise ValueError('step must be non-zero')
351 if start is None:
352 start, stop = 0.0, stop * 1.0
353 else:
354 # swap when all args are used
355 stop, start = start * 1.0, stop * 1.0
356 count = int(math.ceil((stop - start) / step))
357 ret = [None] * count
358 if not ret:
359 return ret
360 ret[0] = start
361 for i in xrange(1, count):
362 ret[i] = ret[i - 1] + step
363 return ret
364
365
366 def backoff(start, stop, count=None, factor=2.0, jitter=False):
367 """Returns a list of geometrically-increasing floating-point numbers,
368 suitable for usage with `exponential backoff`_. Exactly like
369 :func:`backoff_iter`, but without the ``'repeat'`` option for
370 *count*. See :func:`backoff_iter` for more details.
371
372 .. _exponential backoff: https://en.wikipedia.org/wiki/Exponential_backoff
373
374 >>> backoff(1, 10)
375 [1.0, 2.0, 4.0, 8.0, 10.0]
376 """
377 if count == 'repeat':
378 raise ValueError("'repeat' supported in backoff_iter, not backoff")
379 return list(backoff_iter(start, stop, count=count,
380 factor=factor, jitter=jitter))
381
382
383 def backoff_iter(start, stop, count=None, factor=2.0, jitter=False):
384 """Generates a sequence of geometrically-increasing floats, suitable
385 for usage with `exponential backoff`_. Starts with *start*,
386 increasing by *factor* until *stop* is reached, optionally
387 stopping iteration once *count* numbers are yielded. *factor*
388 defaults to 2. In general retrying with properly-configured
389 backoff creates a better-behaved component for a larger service
390 ecosystem.
391
392 .. _exponential backoff: https://en.wikipedia.org/wiki/Exponential_backoff
393
394 >>> list(backoff_iter(1.0, 10.0, count=5))
395 [1.0, 2.0, 4.0, 8.0, 10.0]
396 >>> list(backoff_iter(1.0, 10.0, count=8))
397 [1.0, 2.0, 4.0, 8.0, 10.0, 10.0, 10.0, 10.0]
398 >>> list(backoff_iter(0.25, 100.0, factor=10))
399 [0.25, 2.5, 25.0, 100.0]
400
401 A simplified usage example:
402
403 .. code-block:: python
404
405 for timeout in backoff_iter(0.25, 5.0):
406 try:
407 res = network_call()
408 break
409 except Exception as e:
410 log(e)
411 time.sleep(timeout)
412
413 An enhancement for large-scale systems would be to add variation,
414 or *jitter*, to timeout values. This is done to avoid a thundering
415 herd on the receiving end of the network call.
416
417 Finally, for *count*, the special value ``'repeat'`` can be passed to
418 continue yielding indefinitely.
419
420 Args:
421
422 start (float): Positive number for baseline.
423 stop (float): Positive number for maximum.
424 count (int): Number of steps before stopping
425 iteration. Defaults to the number of steps between *start* and
426 *stop*. Pass the string, `'repeat'`, to continue iteration
427 indefinitely.
428 factor (float): Rate of exponential increase. Defaults to `2.0`,
429 e.g., `[1, 2, 4, 8, 16]`.
430 jitter (float): A factor between `-1.0` and `1.0`, used to
431 uniformly randomize and thus spread out timeouts in a distributed
432 system, avoiding rhythm effects. Positive values use the base
433 backoff curve as a maximum, negative values use the curve as a
434 minimum. Set to 1.0 or `True` for a jitter approximating
435 Ethernet's time-tested backoff solution. Defaults to `False`.
436
437 """
438 start = float(start)
439 stop = float(stop)
440 factor = float(factor)
441 if start < 0.0:
442 raise ValueError('expected start >= 0, not %r' % start)
443 if factor < 1.0:
444 raise ValueError('expected factor >= 1.0, not %r' % factor)
445 if stop == 0.0:
446 raise ValueError('expected stop >= 0')
447 if stop < start:
448 raise ValueError('expected stop >= start, not %r' % stop)
449 if count is None:
450 denom = start if start else 1
451 count = 1 + math.ceil(math.log(stop/denom, factor))
452 count = count if start else count + 1
453 if count != 'repeat' and count < 0:
454 raise ValueError('count must be positive or "repeat", not %r' % count)
455 if jitter:
456 jitter = float(jitter)
457 if not (-1.0 <= jitter <= 1.0):
458 raise ValueError('expected jitter -1 <= j <= 1, not: %r' % jitter)
459
460 cur, i = start, 0
461 while count == 'repeat' or i < count:
462 if not jitter:
463 cur_ret = cur
464 elif jitter:
465 cur_ret = cur - (cur * jitter * random.random())
466 yield cur_ret
467 i += 1
468 if cur == 0:
469 cur = 1
470 elif cur < stop:
471 cur *= factor
472 if cur > stop:
473 cur = stop
474 return
475
476
477 def bucketize(src, key=bool, value_transform=None, key_filter=None):
478 """Group values in the *src* iterable by the value returned by *key*.
479
480 >>> bucketize(range(5))
481 {False: [0], True: [1, 2, 3, 4]}
482 >>> is_odd = lambda x: x % 2 == 1
483 >>> bucketize(range(5), is_odd)
484 {False: [0, 2, 4], True: [1, 3]}
485
486 *key* is :class:`bool` by default, but can either be a callable or a string
487 name of the attribute on which to bucketize objects.
488
489 >>> bucketize([1+1j, 2+2j, 1, 2], key='real')
490 {1.0: [(1+1j), 1], 2.0: [(2+2j), 2]}
491
492 Value lists are not deduplicated:
493
494 >>> bucketize([None, None, None, 'hello'])
495 {False: [None, None, None], True: ['hello']}
496
497 Bucketize into more than 3 groups
498
499 >>> bucketize(range(10), lambda x: x % 3)
500 {0: [0, 3, 6, 9], 1: [1, 4, 7], 2: [2, 5, 8]}
501
502 ``bucketize`` has a couple of advanced options useful in certain
503 cases. *value_transform* can be used to modify values as they are
504 added to buckets, and *key_filter* will allow excluding certain
505 buckets from being collected.
506
507 >>> bucketize(range(5), value_transform=lambda x: x*x)
508 {False: [0], True: [1, 4, 9, 16]}
509
510 >>> bucketize(range(10), key=lambda x: x % 3, key_filter=lambda k: k % 3 != 1)
511 {0: [0, 3, 6, 9], 2: [2, 5, 8]}
512
513 Note in some of these examples there were at most two keys, ``True`` and
514 ``False``, and each key present has a list with at least one
515 item. See :func:`partition` for a version specialized for binary
516 use cases.
517
518 """
519 if not is_iterable(src):
520 raise TypeError('expected an iterable')
521
522 if isinstance(key, basestring):
523 key_func = lambda x: getattr(x, key, x)
524 elif callable(key):
525 key_func = key
526 else:
527 raise TypeError('expected key to be callable or a string')
528
529 if value_transform is None:
530 value_transform = lambda x: x
531 if not callable(value_transform):
532 raise TypeError('expected callable value transform function')
533
534 ret = {}
535 for val in src:
536 key_of_val = key_func(val)
537 if key_filter is None or key_filter(key_of_val):
538 ret.setdefault(key_of_val, []).append(value_transform(val))
539 return ret
540
541
542 def partition(src, key=bool):
543 """No relation to :meth:`str.partition`, ``partition`` is like
544 :func:`bucketize`, but for added convenience returns a tuple of
545 ``(truthy_values, falsy_values)``.
546
547 >>> nonempty, empty = partition(['', '', 'hi', '', 'bye'])
548 >>> nonempty
549 ['hi', 'bye']
550
551 *key* defaults to :class:`bool`, but can be carefully overridden to
552 use either a function that returns either ``True`` or ``False`` or
553 a string name of the attribute on which to partition objects.
554
555 >>> import string
556 >>> is_digit = lambda x: x in string.digits
557 >>> decimal_digits, hexletters = partition(string.hexdigits, is_digit)
558 >>> ''.join(decimal_digits), ''.join(hexletters)
559 ('0123456789', 'abcdefABCDEF')
560 """
561 bucketized = bucketize(src, key)
562 return bucketized.get(True, []), bucketized.get(False, [])
563
564
565 def unique(src, key=None):
566 """``unique()`` returns a list of unique values, as determined by
567 *key*, in the order they first appeared in the input iterable,
568 *src*.
569
570 >>> ones_n_zeros = '11010110001010010101010'
571 >>> ''.join(unique(ones_n_zeros))
572 '10'
573
574 See :func:`unique_iter` docs for more details.
575 """
576 return list(unique_iter(src, key))
577
578
579 def unique_iter(src, key=None):
580 """Yield unique elements from the iterable, *src*, based on *key*,
581 in the order in which they first appeared in *src*.
582
583 >>> repetitious = [1, 2, 3] * 10
584 >>> list(unique_iter(repetitious))
585 [1, 2, 3]
586
587 By default, *key* is the object itself, but *key* can either be a
588 callable or, for convenience, a string name of the attribute on
589 which to uniqueify objects, falling back on identity when the
590 attribute is not present.
591
592 >>> pleasantries = ['hi', 'hello', 'ok', 'bye', 'yes']
593 >>> list(unique_iter(pleasantries, key=lambda x: len(x)))
594 ['hi', 'hello', 'bye']
595 """
596 if not is_iterable(src):
597 raise TypeError('expected an iterable, not %r' % type(src))
598 if key is None:
599 key_func = lambda x: x
600 elif callable(key):
601 key_func = key
602 elif isinstance(key, basestring):
603 key_func = lambda x: getattr(x, key, x)
604 else:
605 raise TypeError('"key" expected a string or callable, not %r' % key)
606 seen = set()
607 for i in src:
608 k = key_func(i)
609 if k not in seen:
610 seen.add(k)
611 yield i
612 return
613
614
615 def redundant(src, key=None, groups=False):
616 """The complement of :func:`unique()`.
617
618 By default returns non-unique values as a list of the *first*
619 redundant value in *src*. Pass ``groups=True`` to get groups of
620 all values with redundancies, ordered by position of the first
621 redundant value. This is useful in conjunction with some
622 normalizing *key* function.
623
624 >>> redundant([1, 2, 3, 4])
625 []
626 >>> redundant([1, 2, 3, 2, 3, 3, 4])
627 [2, 3]
628 >>> redundant([1, 2, 3, 2, 3, 3, 4], groups=True)
629 [[2, 2], [3, 3, 3]]
630
631 An example using a *key* function to do case-insensitive
632 redundancy detection.
633
634 >>> redundant(['hi', 'Hi', 'HI', 'hello'], key=str.lower)
635 ['Hi']
636 >>> redundant(['hi', 'Hi', 'HI', 'hello'], groups=True, key=str.lower)
637 [['hi', 'Hi', 'HI']]
638
639 *key* should also be used when the values in *src* are not hashable.
640
641 .. note::
642
643 This output of this function is designed for reporting
644 duplicates in contexts when a unique input is desired. Due to
645 the grouped return type, there is no streaming equivalent of
646 this function for the time being.
647
648 """
649 if key is None:
650 pass
651 elif callable(key):
652 key_func = key
653 elif isinstance(key, basestring):
654 key_func = lambda x: getattr(x, key, x)
655 else:
656 raise TypeError('"key" expected a string or callable, not %r' % key)
657 seen = {} # key to first seen item
658 redundant_order = []
659 redundant_groups = {}
660 for i in src:
661 k = key_func(i) if key else i
662 if k not in seen:
663 seen[k] = i
664 else:
665 if k in redundant_groups:
666 if groups:
667 redundant_groups[k].append(i)
668 else:
669 redundant_order.append(k)
670 redundant_groups[k] = [seen[k], i]
671 if not groups:
672 ret = [redundant_groups[k][1] for k in redundant_order]
673 else:
674 ret = [redundant_groups[k] for k in redundant_order]
675 return ret
676
677
678 def one(src, default=None, key=None):
679 """Along the same lines as builtins, :func:`all` and :func:`any`, and
680 similar to :func:`first`, ``one()`` returns the single object in
681 the given iterable *src* that evaluates to ``True``, as determined
682 by callable *key*. If unset, *key* defaults to :class:`bool`. If
683 no such objects are found, *default* is returned. If *default* is
684 not passed, ``None`` is returned.
685
686 If *src* has more than one object that evaluates to ``True``, or
687 if there is no object that fulfills such condition, return
688 *default*. It's like an `XOR`_ over an iterable.
689
690 >>> one((True, False, False))
691 True
692 >>> one((True, False, True))
693 >>> one((0, 0, 'a'))
694 'a'
695 >>> one((0, False, None))
696 >>> one((True, True), default=False)
697 False
698 >>> bool(one(('', 1)))
699 True
700 >>> one((10, 20, 30, 42), key=lambda i: i > 40)
701 42
702
703 See `Martín Gaitán's original repo`_ for further use cases.
704
705 .. _Martín Gaitán's original repo: https://github.com/mgaitan/one
706 .. _XOR: https://en.wikipedia.org/wiki/Exclusive_or
707
708 """
709 ones = list(itertools.islice(filter(key, src), 2))
710 return ones[0] if len(ones) == 1 else default
711
712
713 def first(iterable, default=None, key=None):
714 """Return first element of *iterable* that evaluates to ``True``, else
715 return ``None`` or optional *default*. Similar to :func:`one`.
716
717 >>> first([0, False, None, [], (), 42])
718 42
719 >>> first([0, False, None, [], ()]) is None
720 True
721 >>> first([0, False, None, [], ()], default='ohai')
722 'ohai'
723 >>> import re
724 >>> m = first(re.match(regex, 'abc') for regex in ['b.*', 'a(.*)'])
725 >>> m.group(1)
726 'bc'
727
728 The optional *key* argument specifies a one-argument predicate function
729 like that used for *filter()*. The *key* argument, if supplied, should be
730 in keyword form. For example, finding the first even number in an iterable:
731
732 >>> first([1, 1, 3, 4, 5], key=lambda x: x % 2 == 0)
733 4
734
735 Contributed by Hynek Schlawack, author of `the original standalone module`_.
736
737 .. _the original standalone module: https://github.com/hynek/first
738 """
739 return next(filter(key, iterable), default)
740
741
742 def flatten_iter(iterable):
743 """``flatten_iter()`` yields all the elements from *iterable* while
744 collapsing any nested iterables.
745
746 >>> nested = [[1, 2], [[3], [4, 5]]]
747 >>> list(flatten_iter(nested))
748 [1, 2, 3, 4, 5]
749 """
750 for item in iterable:
751 if isinstance(item, Iterable) and not isinstance(item, basestring):
752 for subitem in flatten_iter(item):
753 yield subitem
754 else:
755 yield item
756
757 def flatten(iterable):
758 """``flatten()`` returns a collapsed list of all the elements from
759 *iterable* while collapsing any nested iterables.
760
761 >>> nested = [[1, 2], [[3], [4, 5]]]
762 >>> flatten(nested)
763 [1, 2, 3, 4, 5]
764 """
765 return list(flatten_iter(iterable))
766
767
768 def same(iterable, ref=_UNSET):
769 """``same()`` returns ``True`` when all values in *iterable* are
770 equal to one another, or optionally a reference value,
771 *ref*. Similar to :func:`all` and :func:`any` in that it evaluates
772 an iterable and returns a :class:`bool`. ``same()`` returns
773 ``True`` for empty iterables.
774
775 >>> same([])
776 True
777 >>> same([1])
778 True
779 >>> same(['a', 'a', 'a'])
780 True
781 >>> same(range(20))
782 False
783 >>> same([[], []])
784 True
785 >>> same([[], []], ref='test')
786 False
787
788 """
789 iterator = iter(iterable)
790 if ref is _UNSET:
791 ref = next(iterator, ref)
792 return all(val == ref for val in iterator)
793
794
795 def default_visit(path, key, value):
796 # print('visit(%r, %r, %r)' % (path, key, value))
797 return key, value
798
799 # enable the extreme: monkeypatching iterutils with a different default_visit
800 _orig_default_visit = default_visit
801
802
803 def default_enter(path, key, value):
804 # print('enter(%r, %r)' % (key, value))
805 if isinstance(value, basestring):
806 return value, False
807 elif isinstance(value, Mapping):
808 return value.__class__(), ItemsView(value)
809 elif isinstance(value, Sequence):
810 return value.__class__(), enumerate(value)
811 elif isinstance(value, Set):
812 return value.__class__(), enumerate(value)
813 else:
814 # files, strings, other iterables, and scalars are not
815 # traversed
816 return value, False
817
818
819 def default_exit(path, key, old_parent, new_parent, new_items):
820 # print('exit(%r, %r, %r, %r, %r)'
821 # % (path, key, old_parent, new_parent, new_items))
822 ret = new_parent
823 if isinstance(new_parent, Mapping):
824 new_parent.update(new_items)
825 elif isinstance(new_parent, Sequence):
826 vals = [v for i, v in new_items]
827 try:
828 new_parent.extend(vals)
829 except AttributeError:
830 ret = new_parent.__class__(vals) # tuples
831 elif isinstance(new_parent, Set):
832 vals = [v for i, v in new_items]
833 try:
834 new_parent.update(vals)
835 except AttributeError:
836 ret = new_parent.__class__(vals) # frozensets
837 else:
838 raise RuntimeError('unexpected iterable type: %r' % type(new_parent))
839 return ret
840
841
842 def remap(root, visit=default_visit, enter=default_enter, exit=default_exit,
843 **kwargs):
844 """The remap ("recursive map") function is used to traverse and
845 transform nested structures. Lists, tuples, sets, and dictionaries
846 are just a few of the data structures nested into heterogenous
847 tree-like structures that are so common in programming.
848 Unfortunately, Python's built-in ways to manipulate collections
849 are almost all flat. List comprehensions may be fast and succinct,
850 but they do not recurse, making it tedious to apply quick changes
851 or complex transforms to real-world data.
852
853 remap goes where list comprehensions cannot.
854
855 Here's an example of removing all Nones from some data:
856
857 >>> from pprint import pprint
858 >>> reviews = {'Star Trek': {'TNG': 10, 'DS9': 8.5, 'ENT': None},
859 ... 'Babylon 5': 6, 'Dr. Who': None}
860 >>> pprint(remap(reviews, lambda p, k, v: v is not None))
861 {'Babylon 5': 6, 'Star Trek': {'DS9': 8.5, 'TNG': 10}}
862
863 Notice how both Nones have been removed despite the nesting in the
864 dictionary. Not bad for a one-liner, and that's just the beginning.
865 See `this remap cookbook`_ for more delicious recipes.
866
867 .. _this remap cookbook: http://sedimental.org/remap.html
868
869 remap takes four main arguments: the object to traverse and three
870 optional callables which determine how the remapped object will be
871 created.
872
873 Args:
874
875 root: The target object to traverse. By default, remap
876 supports iterables like :class:`list`, :class:`tuple`,
877 :class:`dict`, and :class:`set`, but any object traversable by
878 *enter* will work.
879 visit (callable): This function is called on every item in
880 *root*. It must accept three positional arguments, *path*,
881 *key*, and *value*. *path* is simply a tuple of parents'
882 keys. *visit* should return the new key-value pair. It may
883 also return ``True`` as shorthand to keep the old item
884 unmodified, or ``False`` to drop the item from the new
885 structure. *visit* is called after *enter*, on the new parent.
886
887 The *visit* function is called for every item in root,
888 including duplicate items. For traversable values, it is
889 called on the new parent object, after all its children
890 have been visited. The default visit behavior simply
891 returns the key-value pair unmodified.
892 enter (callable): This function controls which items in *root*
893 are traversed. It accepts the same arguments as *visit*: the
894 path, the key, and the value of the current item. It returns a
895 pair of the blank new parent, and an iterator over the items
896 which should be visited. If ``False`` is returned instead of
897 an iterator, the value will not be traversed.
898
899 The *enter* function is only called once per unique value. The
900 default enter behavior support mappings, sequences, and
901 sets. Strings and all other iterables will not be traversed.
902 exit (callable): This function determines how to handle items
903 once they have been visited. It gets the same three
904 arguments as the other functions -- *path*, *key*, *value*
905 -- plus two more: the blank new parent object returned
906 from *enter*, and a list of the new items, as remapped by
907 *visit*.
908
909 Like *enter*, the *exit* function is only called once per
910 unique value. The default exit behavior is to simply add
911 all new items to the new parent, e.g., using
912 :meth:`list.extend` and :meth:`dict.update` to add to the
913 new parent. Immutable objects, such as a :class:`tuple` or
914 :class:`namedtuple`, must be recreated from scratch, but
915 use the same type as the new parent passed back from the
916 *enter* function.
917 reraise_visit (bool): A pragmatic convenience for the *visit*
918 callable. When set to ``False``, remap ignores any errors
919 raised by the *visit* callback. Items causing exceptions
920 are kept. See examples for more details.
921
922 remap is designed to cover the majority of cases with just the
923 *visit* callable. While passing in multiple callables is very
924 empowering, remap is designed so very few cases should require
925 passing more than one function.
926
927 When passing *enter* and *exit*, it's common and easiest to build
928 on the default behavior. Simply add ``from boltons.iterutils import
929 default_enter`` (or ``default_exit``), and have your enter/exit
930 function call the default behavior before or after your custom
931 logic. See `this example`_.
932
933 Duplicate and self-referential objects (aka reference loops) are
934 automatically handled internally, `as shown here`_.
935
936 .. _this example: http://sedimental.org/remap.html#sort_all_lists
937 .. _as shown here: http://sedimental.org/remap.html#corner_cases
938
939 """
940 # TODO: improve argument formatting in sphinx doc
941 # TODO: enter() return (False, items) to continue traverse but cancel copy?
942 if not callable(visit):
943 raise TypeError('visit expected callable, not: %r' % visit)
944 if not callable(enter):
945 raise TypeError('enter expected callable, not: %r' % enter)
946 if not callable(exit):
947 raise TypeError('exit expected callable, not: %r' % exit)
948 reraise_visit = kwargs.pop('reraise_visit', True)
949 if kwargs:
950 raise TypeError('unexpected keyword arguments: %r' % kwargs.keys())
951
952 path, registry, stack = (), {}, [(None, root)]
953 new_items_stack = []
954 while stack:
955 key, value = stack.pop()
956 id_value = id(value)
957 if key is _REMAP_EXIT:
958 key, new_parent, old_parent = value
959 id_value = id(old_parent)
960 path, new_items = new_items_stack.pop()
961 value = exit(path, key, old_parent, new_parent, new_items)
962 registry[id_value] = value
963 if not new_items_stack:
964 continue
965 elif id_value in registry:
966 value = registry[id_value]
967 else:
968 res = enter(path, key, value)
969 try:
970 new_parent, new_items = res
971 except TypeError:
972 # TODO: handle False?
973 raise TypeError('enter should return a tuple of (new_parent,'
974 ' items_iterator), not: %r' % res)
975 if new_items is not False:
976 # traverse unless False is explicitly passed
977 registry[id_value] = new_parent
978 new_items_stack.append((path, []))
979 if value is not root:
980 path += (key,)
981 stack.append((_REMAP_EXIT, (key, new_parent, value)))
982 if new_items:
983 stack.extend(reversed(list(new_items)))
984 continue
985 if visit is _orig_default_visit:
986 # avoid function call overhead by inlining identity operation
987 visited_item = (key, value)
988 else:
989 try:
990 visited_item = visit(path, key, value)
991 except Exception:
992 if reraise_visit:
993 raise
994 visited_item = True
995 if visited_item is False:
996 continue # drop
997 elif visited_item is True:
998 visited_item = (key, value)
999 # TODO: typecheck?
1000 # raise TypeError('expected (key, value) from visit(),'
1001 # ' not: %r' % visited_item)
1002 try:
1003 new_items_stack[-1][1].append(visited_item)
1004 except IndexError:
1005 raise TypeError('expected remappable root, not: %r' % root)
1006 return value
1007
1008
1009 class PathAccessError(KeyError, IndexError, TypeError):
1010 """An amalgamation of KeyError, IndexError, and TypeError,
1011 representing what can occur when looking up a path in a nested
1012 object.
1013 """
1014 def __init__(self, exc, seg, path):
1015 self.exc = exc
1016 self.seg = seg
1017 self.path = path
1018
1019 def __repr__(self):
1020 cn = self.__class__.__name__
1021 return '%s(%r, %r, %r)' % (cn, self.exc, self.seg, self.path)
1022
1023 def __str__(self):
1024 return ('could not access %r from path %r, got error: %r'
1025 % (self.seg, self.path, self.exc))
1026
1027
1028 def get_path(root, path, default=_UNSET):
1029 """Retrieve a value from a nested object via a tuple representing the
1030 lookup path.
1031
1032 >>> root = {'a': {'b': {'c': [[1], [2], [3]]}}}
1033 >>> get_path(root, ('a', 'b', 'c', 2, 0))
1034 3
1035
1036 The path format is intentionally consistent with that of
1037 :func:`remap`.
1038
1039 One of get_path's chief aims is improved error messaging. EAFP is
1040 great, but the error messages are not.
1041
1042 For instance, ``root['a']['b']['c'][2][1]`` gives back
1043 ``IndexError: list index out of range``
1044
1045 What went out of range where? get_path currently raises
1046 ``PathAccessError: could not access 2 from path ('a', 'b', 'c', 2,
1047 1), got error: IndexError('list index out of range',)``, a
1048 subclass of IndexError and KeyError.
1049
1050 You can also pass a default that covers the entire operation,
1051 should the lookup fail at any level.
1052
1053 Args:
1054 root: The target nesting of dictionaries, lists, or other
1055 objects supporting ``__getitem__``.
1056 path (tuple): A list of strings and integers to be successively
1057 looked up within *root*.
1058 default: The value to be returned should any
1059 ``PathAccessError`` exceptions be raised.
1060 """
1061 if isinstance(path, basestring):
1062 path = path.split('.')
1063 cur = root
1064 try:
1065 for seg in path:
1066 try:
1067 cur = cur[seg]
1068 except (KeyError, IndexError) as exc:
1069 raise PathAccessError(exc, seg, path)
1070 except TypeError as exc:
1071 # either string index in a list, or a parent that
1072 # doesn't support indexing
1073 try:
1074 seg = int(seg)
1075 cur = cur[seg]
1076 except (ValueError, KeyError, IndexError, TypeError):
1077 if not is_iterable(cur):
1078 exc = TypeError('%r object is not indexable'
1079 % type(cur).__name__)
1080 raise PathAccessError(exc, seg, path)
1081 except PathAccessError:
1082 if default is _UNSET:
1083 raise
1084 return default
1085 return cur
1086
1087
1088 def research(root, query=lambda p, k, v: True, reraise=False):
1089 """The :func:`research` function uses :func:`remap` to recurse over
1090 any data nested in *root*, and find values which match a given
1091 criterion, specified by the *query* callable.
1092
1093 Results are returned as a list of ``(path, value)`` pairs. The
1094 paths are tuples in the same format accepted by
1095 :func:`get_path`. This can be useful for comparing values nested
1096 in two or more different structures.
1097
1098 Here's a simple example that finds all integers:
1099
1100 >>> root = {'a': {'b': 1, 'c': (2, 'd', 3)}, 'e': None}
1101 >>> res = research(root, query=lambda p, k, v: isinstance(v, int))
1102 >>> print(sorted(res))
1103 [(('a', 'b'), 1), (('a', 'c', 0), 2), (('a', 'c', 2), 3)]
1104
1105 Note how *query* follows the same, familiar ``path, key, value``
1106 signature as the ``visit`` and ``enter`` functions on
1107 :func:`remap`, and returns a :class:`bool`.
1108
1109 Args:
1110 root: The target object to search. Supports the same types of
1111 objects as :func:`remap`, including :class:`list`,
1112 :class:`tuple`, :class:`dict`, and :class:`set`.
1113 query (callable): The function called on every object to
1114 determine whether to include it in the search results. The
1115 callable must accept three arguments, *path*, *key*, and
1116 *value*, commonly abbreviated *p*, *k*, and *v*, same as
1117 *enter* and *visit* from :func:`remap`.
1118 reraise (bool): Whether to reraise exceptions raised by *query*
1119 or to simply drop the result that caused the error.
1120
1121
1122 With :func:`research` it's easy to inspect the details of a data
1123 structure, like finding values that are at a certain depth (using
1124 ``len(p)``) and much more. If more advanced functionality is
1125 needed, check out the code and make your own :func:`remap`
1126 wrapper, and consider `submitting a patch`_!
1127
1128 .. _submitting a patch: https://github.com/mahmoud/boltons/pulls
1129 """
1130 ret = []
1131
1132 if not callable(query):
1133 raise TypeError('query expected callable, not: %r' % query)
1134
1135 def enter(path, key, value):
1136 try:
1137 if query(path, key, value):
1138 ret.append((path + (key,), value))
1139 except Exception:
1140 if reraise:
1141 raise
1142 return default_enter(path, key, value)
1143
1144 remap(root, enter=enter)
1145 return ret
1146
1147
1148 # TODO: recollect()
1149 # TODO: refilter()
1150 # TODO: reiter()
1151
1152
1153 # GUID iterators: 10x faster and somewhat more compact than uuid.
1154
1155 class GUIDerator(object):
1156 """The GUIDerator is an iterator that yields a globally-unique
1157 identifier (GUID) on every iteration. The GUIDs produced are
1158 hexadecimal strings.
1159
1160 Testing shows it to be around 12x faster than the uuid module. By
1161 default it is also more compact, partly due to its default 96-bit
1162 (24-hexdigit) length. 96 bits of randomness means that there is a
1163 1 in 2 ^ 32 chance of collision after 2 ^ 64 iterations. If more
1164 or less uniqueness is desired, the *size* argument can be adjusted
1165 accordingly.
1166
1167 Args:
1168 size (int): character length of the GUID, defaults to 24. Lengths
1169 between 20 and 36 are considered valid.
1170
1171 The GUIDerator has built-in fork protection that causes it to
1172 detect a fork on next iteration and reseed accordingly.
1173
1174 """
1175 def __init__(self, size=24):
1176 self.size = size
1177 if size < 20 or size > 36:
1178 raise ValueError('expected 20 < size <= 36')
1179 self.count = itertools.count()
1180 self.reseed()
1181
1182 def reseed(self):
1183 self.pid = os.getpid()
1184 self.salt = '-'.join([str(self.pid),
1185 socket.gethostname() or b'<nohostname>',
1186 str(time.time()),
1187 codecs.encode(os.urandom(6),
1188 'hex_codec').decode('ascii')])
1189 # that codecs trick is the best/only way to get a bytes to
1190 # hexbytes in py2/3
1191 return
1192
1193 def __iter__(self):
1194 return self
1195
1196 if _IS_PY3:
1197 def __next__(self):
1198 if os.getpid() != self.pid:
1199 self.reseed()
1200 target_bytes = (self.salt + str(next(self.count))).encode('utf8')
1201 hash_text = hashlib.sha1(target_bytes).hexdigest()[:self.size]
1202 return hash_text
1203 else:
1204 def __next__(self):
1205 if os.getpid() != self.pid:
1206 self.reseed()
1207 return hashlib.sha1(self.salt +
1208 str(next(self.count))).hexdigest()[:self.size]
1209
1210 next = __next__
1211
1212
1213 class SequentialGUIDerator(GUIDerator):
1214 """Much like the standard GUIDerator, the SequentialGUIDerator is an
1215 iterator that yields a globally-unique identifier (GUID) on every
1216 iteration. The GUIDs produced are hexadecimal strings.
1217
1218 The SequentialGUIDerator differs in that it picks a starting GUID
1219 value and increments every iteration. This yields GUIDs which are
1220 of course unique, but also ordered and lexicographically sortable.
1221
1222 The SequentialGUIDerator is around 50% faster than the normal
1223 GUIDerator, making it almost 20x as fast as the built-in uuid
1224 module. By default it is also more compact, partly due to its
1225 96-bit (24-hexdigit) default length. 96 bits of randomness means that
1226 there is a 1 in 2 ^ 32 chance of collision after 2 ^ 64
1227 iterations. If more or less uniqueness is desired, the *size*
1228 argument can be adjusted accordingly.
1229
1230 Args:
1231 size (int): character length of the GUID, defaults to 24.
1232
1233 Note that with SequentialGUIDerator there is a chance of GUIDs
1234 growing larger than the size configured. The SequentialGUIDerator
1235 has built-in fork protection that causes it to detect a fork on
1236 next iteration and reseed accordingly.
1237
1238 """
1239
1240 if _IS_PY3:
1241 def reseed(self):
1242 super(SequentialGUIDerator, self).reseed()
1243 start_str = hashlib.sha1(self.salt.encode('utf8')).hexdigest()
1244 self.start = int(start_str[:self.size], 16)
1245 self.start |= (1 << ((self.size * 4) - 2))
1246 else:
1247 def reseed(self):
1248 super(SequentialGUIDerator, self).reseed()
1249 start_str = hashlib.sha1(self.salt).hexdigest()
1250 self.start = int(start_str[:self.size], 16)
1251 self.start |= (1 << ((self.size * 4) - 2))
1252
1253 def __next__(self):
1254 if os.getpid() != self.pid:
1255 self.reseed()
1256 return '%x' % (next(self.count) + self.start)
1257
1258 next = __next__
1259
1260
1261 guid_iter = GUIDerator()
1262 seq_guid_iter = SequentialGUIDerator()
1263
1264
1265 def soft_sorted(iterable, first=None, last=None, key=None, reverse=False):
1266 """For when you care about the order of some elements, but not about
1267 others.
1268
1269 Use this to float to the top and/or sink to the bottom a specific
1270 ordering, while sorting the rest of the elements according to
1271 normal :func:`sorted` rules.
1272
1273 >>> soft_sorted(['two', 'b', 'one', 'a'], first=['one', 'two'])
1274 ['one', 'two', 'a', 'b']
1275 >>> soft_sorted(range(7), first=[6, 15], last=[2, 4], reverse=True)
1276 [6, 5, 3, 1, 0, 2, 4]
1277 >>> import string
1278 >>> ''.join(soft_sorted(string.hexdigits, first='za1', last='b', key=str.lower))
1279 'aA1023456789cCdDeEfFbB'
1280
1281 Args:
1282 iterable (list): A list or other iterable to sort.
1283 first (list): A sequence to enforce for elements which should
1284 appear at the beginning of the returned list.
1285 last (list): A sequence to enforce for elements which should
1286 appear at the end of the returned list.
1287 key (callable): Callable used to generate a comparable key for
1288 each item to be sorted, same as the key in
1289 :func:`sorted`. Note that entries in *first* and *last*
1290 should be the keys for the items. Defaults to
1291 passthrough/the identity function.
1292 reverse (bool): Whether or not elements not explicitly ordered
1293 by *first* and *last* should be in reverse order or not.
1294
1295 Returns a new list in sorted order.
1296 """
1297 first = first or []
1298 last = last or []
1299 key = key or (lambda x: x)
1300 seq = list(iterable)
1301 other = [x for x in seq if not ((first and key(x) in first) or (last and key(x) in last))]
1302 other.sort(key=key, reverse=reverse)
1303
1304 if first:
1305 first = sorted([x for x in seq if key(x) in first], key=lambda x: first.index(key(x)))
1306 if last:
1307 last = sorted([x for x in seq if key(x) in last], key=lambda x: last.index(key(x)))
1308 return first + other + last
1309
1310 """
1311 May actually be faster to do an isinstance check for a str path
1312
1313 $ python -m timeit -s "x = [1]" "x[0]"
1314 10000000 loops, best of 3: 0.0207 usec per loop
1315 $ python -m timeit -s "x = [1]" "try: x[0] \nexcept: pass"
1316 10000000 loops, best of 3: 0.029 usec per loop
1317 $ python -m timeit -s "x = [1]" "try: x[1] \nexcept: pass"
1318 1000000 loops, best of 3: 0.315 usec per loop
1319 # setting up try/except is fast, only around 0.01us
1320 # actually triggering the exception takes almost 10x as long
1321
1322 $ python -m timeit -s "x = [1]" "isinstance(x, basestring)"
1323 10000000 loops, best of 3: 0.141 usec per loop
1324 $ python -m timeit -s "x = [1]" "isinstance(x, str)"
1325 10000000 loops, best of 3: 0.131 usec per loop
1326 $ python -m timeit -s "x = [1]" "try: x.split('.')\n except: pass"
1327 1000000 loops, best of 3: 0.443 usec per loop
1328 $ python -m timeit -s "x = [1]" "try: x.split('.') \nexcept AttributeError: pass"
1329 1000000 loops, best of 3: 0.544 usec per loop
1330 """