Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/webencodings/__init__.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/env/lib/python3.7/site-packages/webencodings/__init__.py Sat May 02 07:14:21 2020 -0400 @@ -0,0 +1,342 @@ +# coding: utf-8 +""" + + webencodings + ~~~~~~~~~~~~ + + This is a Python implementation of the `WHATWG Encoding standard + <http://encoding.spec.whatwg.org/>`. See README for details. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +from __future__ import unicode_literals + +import codecs + +from .labels import LABELS + + +VERSION = '0.5.1' + + +# Some names in Encoding are not valid Python aliases. Remap these. +PYTHON_NAMES = { + 'iso-8859-8-i': 'iso-8859-8', + 'x-mac-cyrillic': 'mac-cyrillic', + 'macintosh': 'mac-roman', + 'windows-874': 'cp874'} + +CACHE = {} + + +def ascii_lower(string): + r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. + + :param string: An Unicode string. + :returns: A new Unicode string. + + This is used for `ASCII case-insensitive + <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_ + matching of encoding labels. + The same matching is also used, among other things, + for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_. + + This is different from the :meth:`~py:str.lower` method of Unicode strings + which also affect non-ASCII characters, + sometimes mapping them into the ASCII range: + + >>> keyword = u'Bac\N{KELVIN SIGN}ground' + >>> assert keyword.lower() == u'background' + >>> assert ascii_lower(keyword) != keyword.lower() + >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' + + """ + # This turns out to be faster than unicode.translate() + return string.encode('utf8').lower().decode('utf8') + + +def lookup(label): + """ + Look for an encoding by its label. + This is the spec’s `get an encoding + <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm. + Supported labels are listed there. + + :param label: A string. + :returns: + An :class:`Encoding` object, or :obj:`None` for an unknown label. + + """ + # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. + label = ascii_lower(label.strip('\t\n\f\r ')) + name = LABELS.get(label) + if name is None: + return None + encoding = CACHE.get(name) + if encoding is None: + if name == 'x-user-defined': + from .x_user_defined import codec_info + else: + python_name = PYTHON_NAMES.get(name, name) + # Any python_name value that gets to here should be valid. + codec_info = codecs.lookup(python_name) + encoding = Encoding(name, codec_info) + CACHE[name] = encoding + return encoding + + +def _get_encoding(encoding_or_label): + """ + Accept either an encoding object or label. + + :param encoding: An :class:`Encoding` object or a label string. + :returns: An :class:`Encoding` object. + :raises: :exc:`~exceptions.LookupError` for an unknown label. + + """ + if hasattr(encoding_or_label, 'codec_info'): + return encoding_or_label + + encoding = lookup(encoding_or_label) + if encoding is None: + raise LookupError('Unknown encoding label: %r' % encoding_or_label) + return encoding + + +class Encoding(object): + """Reresents a character encoding such as UTF-8, + that can be used for decoding or encoding. + + .. attribute:: name + + Canonical name of the encoding + + .. attribute:: codec_info + + The actual implementation of the encoding, + a stdlib :class:`~codecs.CodecInfo` object. + See :func:`codecs.register`. + + """ + def __init__(self, name, codec_info): + self.name = name + self.codec_info = codec_info + + def __repr__(self): + return '<Encoding %s>' % self.name + + +#: The UTF-8 encoding. Should be used for new content and formats. +UTF8 = lookup('utf-8') + +_UTF16LE = lookup('utf-16le') +_UTF16BE = lookup('utf-16be') + + +def decode(input, fallback_encoding, errors='replace'): + """ + Decode a single string. + + :param input: A byte string + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: + A ``(output, encoding)`` tuple of an Unicode string + and an :obj:`Encoding`. + + """ + # Fail early if `encoding` is an invalid label. + fallback_encoding = _get_encoding(fallback_encoding) + bom_encoding, input = _detect_bom(input) + encoding = bom_encoding or fallback_encoding + return encoding.codec_info.decode(input, errors)[0], encoding + + +def _detect_bom(input): + """Return (bom_encoding, input), with any BOM removed from the input.""" + if input.startswith(b'\xFF\xFE'): + return _UTF16LE, input[2:] + if input.startswith(b'\xFE\xFF'): + return _UTF16BE, input[2:] + if input.startswith(b'\xEF\xBB\xBF'): + return UTF8, input[3:] + return None, input + + +def encode(input, encoding=UTF8, errors='strict'): + """ + Encode a single string. + + :param input: An Unicode string. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: A byte string. + + """ + return _get_encoding(encoding).codec_info.encode(input, errors)[0] + + +def iter_decode(input, fallback_encoding, errors='replace'): + """ + "Pull"-based decoder. + + :param input: + An iterable of byte strings. + + The input is first consumed just enough to determine the encoding + based on the precense of a BOM, + then consumed on demand when the return value is. + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: + An ``(output, encoding)`` tuple. + :obj:`output` is an iterable of Unicode strings, + :obj:`encoding` is the :obj:`Encoding` that is being used. + + """ + + decoder = IncrementalDecoder(fallback_encoding, errors) + generator = _iter_decode_generator(input, decoder) + encoding = next(generator) + return generator, encoding + + +def _iter_decode_generator(input, decoder): + """Return a generator that first yields the :obj:`Encoding`, + then yields output chukns as Unicode strings. + + """ + decode = decoder.decode + input = iter(input) + for chunck in input: + output = decode(chunck) + if output: + assert decoder.encoding is not None + yield decoder.encoding + yield output + break + else: + # Input exhausted without determining the encoding + output = decode(b'', final=True) + assert decoder.encoding is not None + yield decoder.encoding + if output: + yield output + return + + for chunck in input: + output = decode(chunck) + if output: + yield output + output = decode(b'', final=True) + if output: + yield output + + +def iter_encode(input, encoding=UTF8, errors='strict'): + """ + “Pull”-based encoder. + + :param input: An iterable of Unicode strings. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: An iterable of byte strings. + + """ + # Fail early if `encoding` is an invalid label. + encode = IncrementalEncoder(encoding, errors).encode + return _iter_encode_generator(input, encode) + + +def _iter_encode_generator(input, encode): + for chunck in input: + output = encode(chunck) + if output: + yield output + output = encode('', final=True) + if output: + yield output + + +class IncrementalDecoder(object): + """ + “Push”-based decoder. + + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + """ + def __init__(self, fallback_encoding, errors='replace'): + # Fail early if `encoding` is an invalid label. + self._fallback_encoding = _get_encoding(fallback_encoding) + self._errors = errors + self._buffer = b'' + self._decoder = None + #: The actual :class:`Encoding` that is being used, + #: or :obj:`None` if that is not determined yet. + #: (Ie. if there is not enough input yet to determine + #: if there is a BOM.) + self.encoding = None # Not known yet. + + def decode(self, input, final=False): + """Decode one chunk of the input. + + :param input: A byte string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: An Unicode string. + + """ + decoder = self._decoder + if decoder is not None: + return decoder(input, final) + + input = self._buffer + input + encoding, input = _detect_bom(input) + if encoding is None: + if len(input) < 3 and not final: # Not enough data yet. + self._buffer = input + return '' + else: # No BOM + encoding = self._fallback_encoding + decoder = encoding.codec_info.incrementaldecoder(self._errors).decode + self._decoder = decoder + self.encoding = encoding + return decoder(input, final) + + +class IncrementalEncoder(object): + """ + “Push”-based encoder. + + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + .. method:: encode(input, final=False) + + :param input: An Unicode string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: A byte string. + + """ + def __init__(self, encoding=UTF8, errors='strict'): + encoding = _get_encoding(encoding) + self.encode = encoding.codec_info.incrementalencoder(errors).encode