Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/boltons/strutils.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
# -*- coding: utf-8 -*-
# boltons/strutils.py, reconstructed from the collapsed removal diff
# (--- a/env/lib/python3.7/site-packages/boltons/strutils.py, +++ /dev/null).
"""So much practical programming involves string manipulation, which
Python readily accommodates. Still, there are dozens of basic and
common capabilities missing from the standard library, several of them
provided by ``strutils``.
"""

from __future__ import print_function

import re
import sys
import uuid
import zlib
import string
import unicodedata
import collections
from gzip import GzipFile

try:
    # The module exposes ``StringIO``; there is no ``cStringIO.cStringIO``.
    from cStringIO import StringIO
except ImportError:
    from io import BytesIO as StringIO

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

try:
    unicode, str, bytes, basestring = unicode, str, str, basestring
    from HTMLParser import HTMLParser
    import htmlentitydefs
except NameError:  # basestring not defined in Python 3
    # NOTE: inside this module ``str`` is deliberately rebound to ``bytes``
    # on Python 3; use ``unicode`` whenever a text type is needed.
    unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
    unichr = chr
    from html.parser import HTMLParser
    from html import entities as htmlentitydefs


__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'unwrap_text', 'MultiReplace', 'multi_replace']


_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    return _camel2under_re.sub(r'_\1', camel_string).lower()


def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    # An empty segment (from a doubled/leading underscore) is kept as '_'.
    return ''.join(w.capitalize() or '_' for w in under_string.split('_'))


def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs (the result is then a bytestring).

    >>> slugify('First post! Hi!!!!~1 ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    # Empty input gives ''; input made only of separators gives *delim*.
    ret = (delim.join(split_punct_ws(text)) or delim) if text else ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret


def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This is used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1 ')
    ['First', 'post', 'Hi', '1']
    """
    return [w for w in _punct_re.split(text) if w]


def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    units = cardinalize(unit_noun, count)
    if count:
        return u'%s %s' % (count, units)
    return u'No %s' % (units,)


_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turns *number* into its ordinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr, ext = unicode(number), ''
    if numstr and numstr[-1] in string.digits:
        # The slice is '' for single-digit input, so no IndexError handling
        # is needed to tell the teens (11th, 12th, 13th) from the rest.
        if numstr[-2:-1] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext


def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    if count == 1:
        return unit_noun
    return pluralize(unit_noun)


def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    orig_word, word = word, word.strip().lower()
    # Already-singular irregulars (and blank input) are returned untouched.
    if not word or word in _IRR_S2P:
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)


def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word, word = word, word.strip().lower()
    # Already-plural irregulars (and blank input) are returned untouched.
    if not word or word in _IRR_P2S:
        return orig_word
    irr_plural = _IRR_S2P.get(word)
    if irr_plural:
        plural = irr_plural
    elif word.endswith('y') and word[-2:-1] not in 'aeiou':
        plural = word[:-1] + 'ies'
    elif word.endswith(('s', 'ch', 'sh')):
        plural = word if word.endswith('es') else word + 'es'
    else:
        plural = word + 's'
    return _match_case(orig_word, plural)


def _match_case(master, disciple):
    """Return *disciple* re-cased to follow the lower/upper/title pattern
    of *master*; any other pattern leaves *disciple* unchanged."""
    if not master.strip():
        return disciple
    if master.lower() == master:
        return disciple.lower()
    elif master.upper() == master:
        return disciple.upper()
    elif master.title() == master:
        return disciple.title()
    return disciple


# Singular to plural map of irregular pluralizations
_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
            'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
            'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
            'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
            'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
            'calf': 'calves', 'child': 'children', 'corps': 'corps',
            'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
            'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
            'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
            'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
            'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
            'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
            # 'formula' used to be listed twice ('formulae' then 'formulas');
            # only the last entry ever took effect, so that one is kept.
            'foot': 'feet', 'formula': 'formulas',
            'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
            'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
            'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
            'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
            'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
            'matrix': 'matrices', 'means': 'means', 'medium': 'media',
            'memorandum': 'memoranda', 'millennium': 'millennia',
            'moose': 'moose',
            'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
            'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
            'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
            'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
            'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
            'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
            'self': 'selves', 'sense': 'senses', 'series': 'series',
            'sheep': 'sheep', 'shelf': 'shelves', 'species': 'species',
            'stimulus': 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi',
            'symposium': 'symposia', 'synopsis': 'synopses',
            'synthesis': 'syntheses',
            'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
            'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes',
            'tooth': 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae',
            'veto': 'vetoes', 'vita': 'vitae', 'watch': 'watches',
            'wife': 'wives', 'wolf': 'wolves', 'woman': 'women'}


# Reverse index of the above, plus the alternate plural of 'formula' that a
# one-to-one forward map cannot express.
_IRR_P2S = {plural: singular for singular, plural in _IRR_S2P.items()}
_IRR_P2S['formulae'] = 'formula'

# The character class holds the ASCII hashmark and the full-width one
# (U+FF03). A non-raw unicode literal is used so the escape is resolved by
# the string parser on both Python 2 and 3.
HASHTAG_RE = re.compile(u"(?:^|\\s)[#\uff03]{1}(\\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """

    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)


def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    if len(string) < 3:
        return string
    return '%s%s%s' % (string[0], len(string[1:-1]), string[-1])


ANSI_ESCAPE_BEGIN = '\x1b['
ANSI_TERMINATORS = ('H', 'f', 'A', 'B', 'C', 'D', 'R', 's', 'u', 'J',
                    'K', 'h', 'l', 'p', 'm')


def strip_ansi(text):
    r"""Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'
    >>> strip_ansi('plain \x1b[1mbold\x1b[0m text')
    'plain bold text'

    The first test above is an excerpt from ANSI art on
    `sixteencolors.net`_. This function does not interpret or render
    ANSI art, but you can do so with `ansi2img`_ or `escapes.js`_.

    Text following the final escape sequence is preserved. An escape
    sequence that is never terminated swallows the rest of the string.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py
    kept, pos, text_len = [], 0, len(text)
    while pos < text_len:
        esc_start = text.find(ANSI_ESCAPE_BEGIN, pos)
        if esc_start < 0:
            # No more escapes: keep the tail (previously dropped by mistake).
            kept.append(text[pos:])
            break
        kept.append(text[pos:esc_start])
        # Skip the introducer, then everything up to and including the
        # terminating letter of the sequence.
        pos = esc_start + len(ANSI_ESCAPE_BEGIN)
        while pos < text_len and text[pos] not in ANSI_TERMINATORS:
            pos += 1
        pos += 1
    return type(text)().join(kept)  # attempted unicode + str support


def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified. Bytestrings
            are decoded as UTF-8 first.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    Raises:
        UnicodeDecodeError: if *text* is a bytestring that is not valid
            UTF-8.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    if isinstance(text, bytes):
        # Python 3 bytes have no .encode(); on Python 2 this replaces the
        # implicit ascii decode that .encode() performed on a byte str.
        text = text.decode('utf-8')
    try:
        return text.encode('ascii')
    except UnicodeEncodeError:
        mode = 'ignore' if ignore else 'replace'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        return transd.encode('ascii', mode)


def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
    elif isinstance(text, bytes):
        try:
            text.decode('ascii')
        except UnicodeDecodeError:
            return False
    else:
        raise ValueError('expected text or bytes, not %r' % type(text))
    return True


class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is a code point; the computed replacement is cached in the
        # dict so each code point is only decomposed once.
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # NOTE(review): for a combining diaeresis this stores None
                # when the code point is not in the base map, which
                # str.translate treats as "delete" -- confirm intended.
                ch = self.get(key)
            else:
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # No usable two-part decomposition: map the code point to itself.
            ch = self.get(key, key)
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict


# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE",  # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D",   # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae',  # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe',  # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue',  # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A",   # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A",   # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A",   # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C",   # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E",   # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E",   # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E",   # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I",   # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I",   # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O",   # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O",   # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O",   # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U",   # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U",   # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss",  # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # æ LATIN SMALL LETTER AE
    0xf0: u"d",   # ð LATIN SMALL LETTER ETH
    0xf8: u"oe",  # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae',  # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe',  # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue',  # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a",   # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a",   # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a",   # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c",   # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e",   # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e",   # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e",   # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i",   # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i",   # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o",   # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o",   # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o",   # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u",   # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u",   # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'",  # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'",  # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"',  # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"',  # ” RIGHT DOUBLE QUOTATION MARK
    }


DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)


_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))


def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    The largest unit whose size does not exceed ``abs(nbytes)`` is used,
    so exact powers of 1024 roll over to the next symbol.

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    >>> bytes2human(1024)
    '1K'
    """
    abs_bytes = abs(nbytes)
    # Walk from the largest unit down; if nothing matches (values below 1)
    # the loop variables are left on the smallest unit, 'B'.
    for size, symbol in reversed(_SIZE_BOUNDS):
        if abs_bytes >= size:
            break
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)


class HTMLTextExtractor(HTMLParser):
    """An :class:`HTMLParser` that collects text content and decoded
    character/entity references, discarding all markup."""

    def __init__(self):
        # Run the real base initializer so every attribute the parser
        # expects exists; older parsers do not take *convert_charrefs*.
        try:
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            HTMLParser.__init__(self)
            self.convert_charrefs = True
        self.strict = False
        self.result = []

    def handle_data(self, d):
        self.result.append(d)

    def handle_charref(self, number):
        # Numeric reference: 'x'/'X' prefix means hexadecimal.
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            # Unknown entity: emit it verbatim.
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        return u''.join(self.result)


def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like ``&nbsp;``.

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    s = HTMLTextExtractor()
    s.feed(html)
    # Flush buffered input: the parser holds back trailing text that
    # contains an unterminated '&' (e.g. 'AT&T') until it is closed.
    s.close()
    return s.get_text()


_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'


def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes. StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS)


def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
          speed/compression. 1 is fastest, least compressed, 9 is
          slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    out = StringIO()
    # The context manager closes the GzipFile (writing the trailer) even if
    # the write raises.
    with GzipFile(fileobj=out, mode='wb', compresslevel=level) as f:
        f.write(bytestring)
    return out.getvalue()


# A non-raw unicode literal, so that \u2028 / \u2029 really are the LINE and
# PARAGRAPH SEPARATOR characters (the old r'\x2028' meant "space, then 28").
_line_ending_re = re.compile(u'(\r\n|\n|\x0b|\f|\r|\x85|\u2028|\u2029)',
                             re.UNICODE)


def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            yield text[prev_end:start]
        if end == len_text:
            # Unlike str.splitlines, a trailing line ending yields a final ''.
            yield ''
        prev_end = end
    tail = text[prev_end:]
    if tail:
        yield tail
    return


def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    indented_lines = [(margin + line if key(line) else line)
                      for line in iter_splitlines(text)]
    return newline.join(indented_lines)


def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            return False
    if version and obj.version != int(version):
        return False
    return True


def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    Raises:
        ValueError: if *style* is neither ``cmd`` nor ``sh``.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        style = 'cmd' if sys.platform == 'win32' else 'sh'

    if style == 'sh':
        return args2sh(args, sep=sep)
    elif style == 'cmd':
        return args2cmd(args, sep=sep)

    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)


_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # Honor the documented *sep* (a hard-coded space was used before).
    return sep.join(ret_list)


def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within.  A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash.  If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []

        # Separate this argument from the previous one with *sep*
        # (a hard-coded space was used before).
        if result:
            result.append(sep)

        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # Trailing backslashes are doubled so they do not escape the
            # closing quote.
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)


def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    output = []

    for x in range_string.strip().split(delim):

        # Range
        if range_delim in x:
            range_limits = list(map(int, x.split(range_delim)))
            output += list(range(min(range_limits), max(range_limits)+1))

        # Empty String
        elif not x:
            continue

        # Integer
        else:
            output.append(int(x))

    return sorted(output)


def _format_int_run(start, end, range_delim):
    """Format one run of contiguous integers as ``start`` or ``start-end``."""
    if start == end:
        return '{0:d}'.format(start)
    return '{0:d}{1}{2:d}'.format(start, range_delim, end)


def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    runs = []  # [first, last] pairs of contiguous values

    for x in sorted(int_list):
        if runs:
            delta = x - runs[-1][1]
            if delta == 1:
                # Current value extends the open run.
                runs[-1][1] = x
                continue
            if delta <= 1:
                # Current value repeated.
                continue
        # First value, or a gap: start a new run.
        runs.append([x, x])

    output = [_format_int_run(first, last, range_delim)
              for first, last in runs]

    if delim_space:
        return (delim + ' ').join(output)
    return delim.join(output)


class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import strutils
        s = strutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import strutils
        s = strutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'


    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            group_name = 'group{0}'.format(idx)
            if isinstance(vals[0], basestring):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                # Anything else is taken to be a compiled pattern object.
                exp = vals[0].pattern

            # Each search key becomes its own named alternative so the
            # replacement can be looked up from whichever group matched.
            regex_values.append('(?P<{0}>{1})'.format(
                group_name,
                exp
            ))
            self.group_map[group_name] = vals[1]

        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        # Only our own groups are consulted, and participation is tested
        # with ``is not None`` so an empty-string match still resolves.
        for group_name, replacement in self.group_map.items():
            if match.group(group_name) is not None:
                return replacement
        # No substitution group took part (e.g. an empty map): leave the
        # matched text untouched.
        return match.group(0)

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)


def multi_replace(text, sub_map, **kwargs):
    """Shortcut function to invoke multi-replace in a single command."""
    m = MultiReplace(sub_map, **kwargs)
    return m.sub(text)


def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines  \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
       text: A string to unwrap.
       ending (str): The string to join all unwrapped paragraphs
          by. Pass ``None`` to get the list. Defaults to '\n\n' for
          compatibility with Markdown and RST.

    """
    all_grafs = []
    cur_graf = []
    for line in text.splitlines():
        line = line.strip()
        if line:
            cur_graf.append(line)
        else:
            # A blank line closes the current paragraph.
            all_grafs.append(' '.join(cur_graf))
            cur_graf = []
    if cur_graf:
        all_grafs.append(' '.join(cur_graf))
    if ending is None:
        return all_grafs
    return ending.join(all_grafs)