comparison env/lib/python3.7/site-packages/boltons/strutils.py @ 2:6af9afd405e9 draft

"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
author shellac
date Thu, 14 May 2020 14:56:58 -0400
parents 26e78fe6e8c4
children
comparison
equal deleted inserted replaced
1:75ca89e9b81c 2:6af9afd405e9
1 # -*- coding: utf-8 -*-
2 """So much practical programming involves string manipulation, which
3 Python readily accommodates. Still, there are dozens of basic and
4 common capabilities missing from the standard library, several of them
5 provided by ``strutils``.
6 """
7
8 from __future__ import print_function
9
10 import re
11 import sys
12 import uuid
13 import zlib
14 import string
15 import unicodedata
16 import collections
17 from gzip import GzipFile
18
19 try:
20 from cStringIO import cStringIO as StringIO
21 except ImportError:
22 from io import BytesIO as StringIO
23
24 try:
25 from collections.abc import Mapping
26 except ImportError:
27 from collections import Mapping
28
29 try:
30 unicode, str, bytes, basestring = unicode, str, str, basestring
31 from HTMLParser import HTMLParser
32 import htmlentitydefs
33 except NameError: # basestring not defined in Python 3
34 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
35 unichr = chr
36 from html.parser import HTMLParser
37 from html import entities as htmlentitydefs
38
39
# Public API of this module, consumed by ``from boltons.strutils import *``
# and by the documentation tooling.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list', 'unwrap_text']
46
47
48 _punct_ws_str = string.punctuation + string.whitespace
49 _punct_re = re.compile('[' + _punct_ws_str + ']+')
50 _camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
51
52
53 def camel2under(camel_string):
54 """Converts a camelcased string to underscores. Useful for turning a
55 class name into a function name.
56
57 >>> camel2under('BasicParseTest')
58 'basic_parse_test'
59 """
60 return _camel2under_re.sub(r'_\1', camel_string).lower()
61
62
def under2camel(under_string):
    """Convert an under_scored string to a CamelCased one. Useful for
    deriving a class name from a function name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    # an empty piece (from a doubled underscore) capitalizes to '',
    # which is falsy, so the `or '_'` keeps one underscore in its place
    pieces = under_string.split('_')
    return ''.join(piece.capitalize() or '_' for piece in pieces)
71
72
def slugify(text, delim='_', lower=True, ascii=False):
    """Turn *text* full of scary characters (i.e., punctuation and
    whitespace) into a relatively safe string separated only by the
    delimiter specified by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if text:
        # a text of nothing but separators slugs to a single delim
        ret = delim.join(split_punct_ws(text)) or delim
    else:
        ret = ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
97
98
def split_punct_ws(text):
    """Split *text* on runs of punctuation *or* whitespace, unlike
    :meth:`str.split`, which only splits on whitespace. Used
    internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter out the empty strings produced by leading/trailing runs
    return [token for token in _punct_re.split(text) if token]
108
109
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Return a plain-English description of the :func:`len` of
    *sized_iterable*, conditionally pluralizing *unit_noun* with
    :func:`cardinalize`, detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    if not count:
        return u'No %s' % (noun,)
    return u'%s %s' % (count, noun)
127
128
# Maps a number's final digit to its ordinal suffix. Digits not listed
# here (0, 4-9) take the default suffix; teens are special-cased in
# ordinalize() below.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default
132
133
def ordinalize(number, ext_only=False):
    """Turn *number* into its ordinal form, i.e., 1st, 2nd, 3rd, 4th,
    etc. If the last character isn't a digit, the string value is
    returned unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr = unicode(number)
    ext = ''
    if numstr and numstr[-1] in string.digits:
        # teens (11th-19th, 111th, ...) always take 'th'; a one-digit
        # numstr has no [-2], hence the IndexError fallback
        try:
            is_teen = numstr[-2] == '1'
        except IndexError:
            is_teen = False
        if is_teen:
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
168
169
def cardinalize(unit_noun, count):
    """Pluralize the singular word *unit_noun*, unless *count* is
    exactly one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    return unit_noun if count == 1 else pluralize(unit_noun)
183
184
def singularize(word):
    """Semi-intelligently convert an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    orig_word, word = word, word.strip().lower()
    # already-singular irregulars pass through untouched
    if not word or word in _IRR_S2P:
        return orig_word

    irr = _IRR_P2S.get(word)
    if irr:
        singular = irr
    elif not word.endswith('s'):
        return orig_word  # doesn't look like a regular plural
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        singular = word[:-3] + 'y'  # activities -> activity
    elif word.endswith('es') and word[-3] == 's':
        singular = word[:-2]  # glasses -> glass
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)
217
218
def pluralize(word):
    """Semi-intelligently convert an English *word* from singular form
    to plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word, word = word, word.strip().lower()
    # already-plural irregulars pass through untouched
    if not word or word in _IRR_P2S:
        return orig_word
    irr = _IRR_S2P.get(word)
    if irr:
        plural = irr
    elif word.endswith('y') and word[-2:-1] not in 'aeiou':
        plural = word[:-1] + 'ies'  # enemy -> enemies
    elif word[-1] == 's' or word.endswith(('ch', 'sh')):
        plural = word if word.endswith('es') else word + 'es'
    else:
        plural = word + 's'
    return _match_case(orig_word, plural)
243
244
245 def _match_case(master, disciple):
246 if not master.strip():
247 return disciple
248 if master.lower() == master:
249 return disciple.lower()
250 elif master.upper() == master:
251 return disciple.upper()
252 elif master.title() == master:
253 return disciple.title()
254 return disciple
255
256
257 # Singular to plural map of irregular pluralizations
258 _IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
259 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
260 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
261 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
262 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
263 'calf': 'calves', 'child': 'children', 'corps': 'corps',
264 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
265 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
266 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
267 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
268 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
269 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
270 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
271 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
272 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
273 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
274 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
275 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
276 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
277 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
278 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
279 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
280 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
281 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
282 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
283 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
284 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
285 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
286 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
287 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
288 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
289 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
290 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
291 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
292 'wolf': 'wolves', 'woman': 'women'}
293
294
# Reverse index of the above: plural form -> singular form
_IRR_P2S = {plural: singular for singular, plural in _IRR_S2P.items()}

# Matches an ASCII (#) or full-width (＃) hashmark preceded by start of
# string or whitespace, capturing the tag text itself.
HASHTAG_RE = re.compile(r"(?:^|\s)[#＃]{1}(\w+)", re.UNICODE)
299
300
def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    Args:
        string (str): The text to scan. (Note: this parameter name
            shadows the stdlib ``string`` module inside this function.)

    Returns:
        list: The text of each hashtag found, hashmark removed.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """

    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
316
317
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # strings shorter than 3 chars have no middle to abbreviate
    if len(string) < 3:
        return string
    middle_len = len(string) - 2
    return '%s%s%s' % (string[0], middle_len, string[-1])
335
336
# The two-character CSI (Control Sequence Introducer) prefix that opens
# an ANSI escape sequence.
ANSI_ESCAPE_BEGIN = '\x1b['
# Final characters that terminate an ANSI control sequence; used by
# strip_ansi() below to detect the end of an escape.
ANSI_TERMINATORS = ('H', 'f', 'A', 'B', 'C', 'D', 'R', 's', 'u', 'J',
                    'K', 'h', 'l', 'p', 'm')
340
341
def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m\xdc')
    'art'

    The test above is an excerpt from ANSI art on
    `sixteencolors.net`_. This function does not interpret or render
    ANSI art, but you can do so with `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py
    # Scans *text* with a two-state machine: "keep" mode copies plain
    # text up to the next escape introducer; non-keep mode skips chars
    # until an ANSI terminator character is seen.
    nansi, keep, i, text_len = [], True, 0, len(text)
    while i < text_len:
        if not keep and text[i] in ANSI_TERMINATORS:
            # terminator ends the escape sequence; resume keeping text
            keep = True
        elif keep:
            keep_end_i = text.find(ANSI_ESCAPE_BEGIN, i)
            if keep_end_i < 0:
                # no further escapes; stop scanning.
                # NOTE(review): any plain text remaining after the last
                # escape sequence is dropped here (see the doctest above,
                # where the trailing '\xdc' does not appear) -- confirm
                # this is the intended contract before changing it.
                break
            else:
                nansi.append(text[i:keep_end_i])
                i, keep = keep_end_i, False
        i += 1
    if not nansi:
        # no escape sequences were found; return the input untouched
        return text
    return type(text)().join(nansi)  # attempted unicode + str support
374
375
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    Returns:
        bytes: An ascii-only encoding of *text*.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            # fast path: the text is already pure ascii
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # non-ascii characters remain: deaccent via DEACCENT_MAP, then
        # NFKD-decompose and force-encode, replacing (or ignoring)
        # whatever still can't be represented
        mode = 'replace'
        if ignore:
            mode = 'ignore'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        ret = transd.encode('ascii', mode)
        return ret
407
408
def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
            return True
        except UnicodeDecodeError:
            return False
    raise ValueError('expected text or bytes, not %r' % type(text))
434
435
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is an int codepoint (this dict is used via str.translate);
        # an explicit mapping, if present, wins
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # try to find a base character via Unicode decomposition,
            # e.g. 'é' decomposes to 'e' + a combining accent
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # 0x308 is the combining-diaeresis mark; presumably left
                # unmapped so the explicit two-letter entries (ae/oe/ue)
                # in the base map handle those -- TODO confirm
                ch = self.get(key)
            else:
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no usable decomposition; map the codepoint to itself
            ch = self.get(key, key)
        # cache the computed result for subsequent lookups
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
466
467
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Maps int codepoints to ascii replacement strings; consumed through
# DeaccenterDict (via str.translate) in asciify() above.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae", # æ LATIN SMALL LETTER AE
    0xf0: u"d", # ð LATIN SMALL LETTER ETH
    0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th", # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
    }
521
522
# The map used by asciify(); DeaccenterDict adds caching and on-demand
# decomposition-based lookups on top of the explicit base entries.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)


# Unit tables for bytes2human(): each symbol is 1024x the previous one.
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
# Pairs each unit with the next-larger one, for threshold comparisons.
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
529
530
def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    # find the largest unit whose upper bound still contains the value;
    # if none matches, the last (largest) unit is left bound from the loop
    for (size, symbol), (next_size, _next_symbol) in _SIZE_RANGES:
        if abs_bytes <= next_size:
            break
    scaled = float(nbytes) / size
    return '%.*f%s' % (ndigits, scaled, symbol)
551
552
class HTMLTextExtractor(HTMLParser):
    # Collects only the textual content of an HTML document, discarding
    # all markup. Used by html2text() below.
    def __init__(self):
        # NOTE(review): HTMLParser.__init__ is not called directly;
        # reset() appears to perform the base parser initialization here
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        # accumulated text fragments, joined by get_text()
        self.result = []

    def handle_data(self, d):
        # plain text found between tags
        self.result.append(d)

    def handle_charref(self, number):
        # numeric character reference, decimal (&#955;) or hex (&#x3bb;)
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        # named entity, e.g. &amp;; unknown names pass through verbatim
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        """Return all text collected so far as a single string."""
        return u''.join(self.result)
580
581
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
594
595
# Reference gzip streams (of b'' and of b'bytesahoy!' plus trailing
# whitespace) used by the gunzip_bytes doctests below.
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
598
599
def gunzip_bytes(bytestring):
    """Decompress a gzip stream held entirely in memory.

    The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes? StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
    return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS)
613
614
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    gz = GzipFile(fileobj=buf, mode='wb', compresslevel=level)
    gz.write(bytestring)
    # close() flushes the gzip trailer into the buffer
    gz.close()
    return buf.getvalue()
635
636
637
638 _line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
639 re.UNICODE)
640
641
def iter_splitlines(text):
    r"""Lazily yield the lines of *text*, like :meth:`str.splitlines`
    but as an iterator instead of a list. Also similar to
    :meth:`file.next`, as that also lazily reads and yields lines from
    a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end = 0
    total_len = len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            # the text between the previous ending and this one
            yield text[prev_end:start]
        if end == total_len:
            # text ends with a line ending: emit the trailing empty line
            yield ''
        prev_end = end
    remainder = text[prev_end:]
    if remainder:
        yield remainder
672
673
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
            indent it. Default: :class:`bool`, to ensure that empty lines do
            not get whitespace added.
    """
    out = []
    for line in iter_splitlines(text):
        out.append(margin + line if key(line) else line)
    return newline.join(out)
688
689
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    candidate = obj
    if not isinstance(candidate, uuid.UUID):
        try:
            candidate = uuid.UUID(candidate)
        except (TypeError, ValueError, AttributeError):
            # not parseable as a UUID at all
            return False
    # version=0 (or any falsy value) skips the version check entirely
    return not version or candidate.version == int(version)
712
713
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
            ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
            respectively. If *style* is ``None``, then it is picked
            according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        # pick a default escaping style based on the current platform
        style = 'cmd' if sys.platform == 'win32' else 'sh'
    if style == 'sh':
        return args2sh(args, sep=sep)
    if style == 'cmd':
        return args2cmd(args, sep=sep)
    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
738
739
# Finds the first character that is NOT safe to leave unquoted in
# sh-style shells; None result means the argument needs no quoting.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): The separator used to join the escaped arguments.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # empty args must be passed as explicit empty quotes
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # bug fix: *sep* was previously ignored in favor of a hard-coded
    # single space, despite being documented
    return sep.join(ret_list)
775
776
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): The separator used to join the escaped arguments.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within.  A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash.  If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        # buffer of pending backslashes; doubled only before a quote
        bs_buf = []

        # Add the separator before every argument after the first.
        # bug fix: *sep* was previously ignored in favor of a
        # hard-coded single space, despite being documented
        if result:
            result.append(sep)

        # args containing whitespace (or empty args) must be quoted
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes (rule 5), then escape the quote.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char: pending backslashes were literal.
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # trailing backslashes must be doubled before the closing
            # quote so it isn't escaped (rule 5)
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
854
855
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    output = []
    for piece in range_string.strip().split(delim):
        if not piece:
            # tolerate stray/duplicated delimiters
            continue
        if range_delim in piece:
            # a contiguous range, expanded inclusively; min/max keeps
            # reversed ranges like '8-6' working
            bounds = [int(v) for v in piece.split(range_delim)]
            output.extend(range(min(bounds), max(bounds) + 1))
        else:
            output.append(int(piece))
    return sorted(output)
891
892
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
            into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
            space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # contig_range holds the run of contiguous values currently being
    # accumulated; it is flushed to output whenever a gap is found
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the completed run as a 'min-max' range string
                range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                      range_delim,
                                                      max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the lone previous value as a bare integer
                output.append('{0:d}'.format(contig_range.popleft()))
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value.
    # NOTE: this is a for/else -- the else clause runs once after the
    # loop completes, flushing whatever remains in contig_range.
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append('{0:d}'.format(contig_range.popleft()))
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                  range_delim,
                                                  max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
981
982
class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from lrmslib import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from lrmslib import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'


    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            # Each search key gets a uniquely-named group so a match can
            # be traced back to its replacement value in _get_value().
            group_name = 'group{0}'.format(idx)
            if isinstance(vals[0], basestring):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                # Assumed to be a pre-compiled pattern object; reuse its source.
                exp = vals[0].pattern

            regex_values.append('(?P<{0}>{1})'.format(
                group_name,
                exp
            ))
            self.group_map[group_name] = vals[1]

        # Alternation tries the branches left-to-right, so earlier entries
        # in sub_map win when two patterns could match at the same spot.
        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        # Exactly one of our named groups participated in this match.
        # Test against None (not truthiness): a pattern that matches the
        # empty string produces '', which is falsy, and the old truthy
        # check would raise IndexError on such zero-width matches.
        key = [x for x in group_dict if group_dict[x] is not None][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
1089
1090
def multi_replace(text, sub_map, **kwargs):
    """One-shot convenience wrapper around :class:`MultiReplace`.

    Builds a temporary :class:`MultiReplace` from *sub_map* (forwarding
    any keyword arguments) and applies it to *text*, returning the
    substituted string.
    """
    return MultiReplace(sub_map, **kwargs).sub(text)
1095
1096
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.

    """
    paragraphs = []
    buf = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: close out the current paragraph (possibly empty,
            # matching the original behavior for consecutive blank lines).
            paragraphs.append(' '.join(buf))
            buf = []
        else:
            buf.append(stripped)
    if buf:
        paragraphs.append(' '.join(buf))
    return paragraphs if ending is None else ending.join(paragraphs)