Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/webencodings/__init__.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:d30785e31577 | 1:56ad4e20f292 | 
|---|---|
| 1 # coding: utf-8 | |
| 2 """ | |
| 3 | |
| 4 webencodings | |
| 5 ~~~~~~~~~~~~ | |
| 6 | |
| 7 This is a Python implementation of the `WHATWG Encoding standard | |
| 8 <http://encoding.spec.whatwg.org/>`_. See README for details. | |
| 9 | |
| 10 :copyright: Copyright 2012 by Simon Sapin | |
| 11 :license: BSD, see LICENSE for details. | |
| 12 | |
| 13 """ | |
| 14 | |
| 15 from __future__ import unicode_literals | |
| 16 | |
| 17 import codecs | |
| 18 | |
| 19 from .labels import LABELS | |
| 20 | |
| 21 | |
| 22 VERSION = '0.5.1' | |
| 23 | |
| 24 | |
| 25 # Some names in Encoding are not valid Python aliases. Remap these. | |
| 26 PYTHON_NAMES = { | |
| 27 'iso-8859-8-i': 'iso-8859-8', | |
| 28 'x-mac-cyrillic': 'mac-cyrillic', | |
| 29 'macintosh': 'mac-roman', | |
| 30 'windows-874': 'cp874'} | |
| 31 | |
| 32 CACHE = {} | |
| 33 | |
| 34 | |
| 35 def ascii_lower(string): | |
| 36 r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. | |
| 37 | |
| 38 :param string: A Unicode string. | |
| 39 :returns: A new Unicode string. | |
| 40 | |
| 41 This is used for `ASCII case-insensitive | |
| 42 <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_ | |
| 43 matching of encoding labels. | |
| 44 The same matching is also used, among other things, | |
| 45 for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_. | |
| 46 | |
| 47 This is different from the :meth:`~py:str.lower` method of Unicode strings | |
| 48 which also affects non-ASCII characters, | |
| 49 sometimes mapping them into the ASCII range: | |
| 50 | |
| 51 >>> keyword = u'Bac\N{KELVIN SIGN}ground' | |
| 52 >>> assert keyword.lower() == u'background' | |
| 53 >>> assert ascii_lower(keyword) != keyword.lower() | |
| 54 >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' | |
| 55 | |
| 56 """ | |
| 57 # This turns out to be faster than unicode.translate() | |
| 58 return string.encode('utf8').lower().decode('utf8') | |
| 59 | |
| 60 | |
| 61 def lookup(label): | |
| 62 """ | |
| 63 Look for an encoding by its label. | |
| 64 This is the spec’s `get an encoding | |
| 65 <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm. | |
| 66 Supported labels are listed there. | |
| 67 | |
| 68 :param label: A string. | |
| 69 :returns: | |
| 70 An :class:`Encoding` object, or :obj:`None` for an unknown label. | |
| 71 | |
| 72 """ | |
| 73 # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. | |
| 74 label = ascii_lower(label.strip('\t\n\f\r ')) | |
| 75 name = LABELS.get(label) | |
| 76 if name is None: | |
| 77 return None | |
| 78 encoding = CACHE.get(name) | |
| 79 if encoding is None: | |
| 80 if name == 'x-user-defined': | |
| 81 from .x_user_defined import codec_info | |
| 82 else: | |
| 83 python_name = PYTHON_NAMES.get(name, name) | |
| 84 # Any python_name value that gets to here should be valid. | |
| 85 codec_info = codecs.lookup(python_name) | |
| 86 encoding = Encoding(name, codec_info) | |
| 87 CACHE[name] = encoding | |
| 88 return encoding | |
| 89 | |
| 90 | |
| 91 def _get_encoding(encoding_or_label): | |
| 92 """ | |
| 93 Accept either an encoding object or label. | |
| 94 | |
| 95 :param encoding_or_label: An :class:`Encoding` object or a label string. | |
| 96 :returns: An :class:`Encoding` object. | |
| 97 :raises: :exc:`~exceptions.LookupError` for an unknown label. | |
| 98 | |
| 99 """ | |
| 100 if hasattr(encoding_or_label, 'codec_info'): | |
| 101 return encoding_or_label | |
| 102 | |
| 103 encoding = lookup(encoding_or_label) | |
| 104 if encoding is None: | |
| 105 raise LookupError('Unknown encoding label: %r' % encoding_or_label) | |
| 106 return encoding | |
| 107 | |
| 108 | |
| 109 class Encoding(object): | |
| 110 """Represents a character encoding such as UTF-8, | |
| 111 that can be used for decoding or encoding. | |
| 112 | |
| 113 .. attribute:: name | |
| 114 | |
| 115 Canonical name of the encoding | |
| 116 | |
| 117 .. attribute:: codec_info | |
| 118 | |
| 119 The actual implementation of the encoding, | |
| 120 a stdlib :class:`~codecs.CodecInfo` object. | |
| 121 See :func:`codecs.register`. | |
| 122 | |
| 123 """ | |
| 124 def __init__(self, name, codec_info): | |
| 125 self.name = name | |
| 126 self.codec_info = codec_info | |
| 127 | |
| 128 def __repr__(self): | |
| 129 return '<Encoding %s>' % self.name | |
| 130 | |
| 131 | |
| 132 #: The UTF-8 encoding. Should be used for new content and formats. | |
| 133 UTF8 = lookup('utf-8') | |
| 134 | |
| 135 _UTF16LE = lookup('utf-16le') | |
| 136 _UTF16BE = lookup('utf-16be') | |
| 137 | |
| 138 | |
| 139 def decode(input, fallback_encoding, errors='replace'): | |
| 140 """ | |
| 141 Decode a single string. | |
| 142 | |
| 143 :param input: A byte string | |
| 144 :param fallback_encoding: | |
| 145 An :class:`Encoding` object or a label string. | |
| 146 The encoding to use if :obj:`input` does not have a BOM. | |
| 147 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 148 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 149 :return: | |
| 150 A ``(output, encoding)`` tuple of a Unicode string | |
| 151 and an :obj:`Encoding`. | |
| 152 | |
| 153 """ | |
| 154 # Fail early if `fallback_encoding` is an invalid label. | |
| 155 fallback_encoding = _get_encoding(fallback_encoding) | |
| 156 bom_encoding, input = _detect_bom(input) | |
| 157 encoding = bom_encoding or fallback_encoding | |
| 158 return encoding.codec_info.decode(input, errors)[0], encoding | |
| 159 | |
| 160 | |
| 161 def _detect_bom(input): | |
| 162 """Return (bom_encoding, input), with any BOM removed from the input.""" | |
| 163 if input.startswith(b'\xFF\xFE'): | |
| 164 return _UTF16LE, input[2:] | |
| 165 if input.startswith(b'\xFE\xFF'): | |
| 166 return _UTF16BE, input[2:] | |
| 167 if input.startswith(b'\xEF\xBB\xBF'): | |
| 168 return UTF8, input[3:] | |
| 169 return None, input | |
| 170 | |
| 171 | |
| 172 def encode(input, encoding=UTF8, errors='strict'): | |
| 173 """ | |
| 174 Encode a single string. | |
| 175 | |
| 176 :param input: A Unicode string. | |
| 177 :param encoding: An :class:`Encoding` object or a label string. | |
| 178 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 179 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 180 :return: A byte string. | |
| 181 | |
| 182 """ | |
| 183 return _get_encoding(encoding).codec_info.encode(input, errors)[0] | |
| 184 | |
| 185 | |
| 186 def iter_decode(input, fallback_encoding, errors='replace'): | |
| 187 """ | |
| 188 "Pull"-based decoder. | |
| 189 | |
| 190 :param input: | |
| 191 An iterable of byte strings. | |
| 192 | |
| 193 The input is first consumed just enough to determine the encoding | |
| 194 based on the presence of a BOM, | |
| 195 then consumed on demand when the return value is. | |
| 196 :param fallback_encoding: | |
| 197 An :class:`Encoding` object or a label string. | |
| 198 The encoding to use if :obj:`input` does not have a BOM. | |
| 199 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 200 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 201 :returns: | |
| 202 An ``(output, encoding)`` tuple. | |
| 203 :obj:`output` is an iterable of Unicode strings, | |
| 204 :obj:`encoding` is the :obj:`Encoding` that is being used. | |
| 205 | |
| 206 """ | |
| 207 | |
| 208 decoder = IncrementalDecoder(fallback_encoding, errors) | |
| 209 generator = _iter_decode_generator(input, decoder) | |
| 210 encoding = next(generator) | |
| 211 return generator, encoding | |
| 212 | |
| 213 | |
| 214 def _iter_decode_generator(input, decoder): | |
| 215 """Return a generator that first yields the :obj:`Encoding`, | |
| 216 then yields output chunks as Unicode strings. | |
| 217 | |
| 218 """ | |
| 219 decode = decoder.decode | |
| 220 input = iter(input) | |
| 221 for chunck in input: | |
| 222 output = decode(chunck) | |
| 223 if output: | |
| 224 assert decoder.encoding is not None | |
| 225 yield decoder.encoding | |
| 226 yield output | |
| 227 break | |
| 228 else: | |
| 229 # Input exhausted without determining the encoding | |
| 230 output = decode(b'', final=True) | |
| 231 assert decoder.encoding is not None | |
| 232 yield decoder.encoding | |
| 233 if output: | |
| 234 yield output | |
| 235 return | |
| 236 | |
| 237 for chunck in input: | |
| 238 output = decode(chunck) | |
| 239 if output: | |
| 240 yield output | |
| 241 output = decode(b'', final=True) | |
| 242 if output: | |
| 243 yield output | |
| 244 | |
| 245 | |
| 246 def iter_encode(input, encoding=UTF8, errors='strict'): | |
| 247 """ | |
| 248 “Pull”-based encoder. | |
| 249 | |
| 250 :param input: An iterable of Unicode strings. | |
| 251 :param encoding: An :class:`Encoding` object or a label string. | |
| 252 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 253 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 254 :returns: An iterable of byte strings. | |
| 255 | |
| 256 """ | |
| 257 # Fail early if `encoding` is an invalid label. | |
| 258 encode = IncrementalEncoder(encoding, errors).encode | |
| 259 return _iter_encode_generator(input, encode) | |
| 260 | |
| 261 | |
| 262 def _iter_encode_generator(input, encode): | |
| 263 for chunck in input: | |
| 264 output = encode(chunck) | |
| 265 if output: | |
| 266 yield output | |
| 267 output = encode('', final=True) | |
| 268 if output: | |
| 269 yield output | |
| 270 | |
| 271 | |
| 272 class IncrementalDecoder(object): | |
| 273 """ | |
| 274 “Push”-based decoder. | |
| 275 | |
| 276 :param fallback_encoding: | |
| 277 An :class:`Encoding` object or a label string. | |
| 278 The encoding to use if :obj:`input` does not have a BOM. | |
| 279 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 280 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 281 | |
| 282 """ | |
| 283 def __init__(self, fallback_encoding, errors='replace'): | |
| 284 # Fail early if `fallback_encoding` is an invalid label. | |
| 285 self._fallback_encoding = _get_encoding(fallback_encoding) | |
| 286 self._errors = errors | |
| 287 self._buffer = b'' | |
| 288 self._decoder = None | |
| 289 #: The actual :class:`Encoding` that is being used, | |
| 290 #: or :obj:`None` if that is not determined yet. | |
| 291 #: (I.e. if there is not enough input yet to determine | |
| 292 #: if there is a BOM.) | |
| 293 self.encoding = None # Not known yet. | |
| 294 | |
| 295 def decode(self, input, final=False): | |
| 296 """Decode one chunk of the input. | |
| 297 | |
| 298 :param input: A byte string. | |
| 299 :param final: | |
| 300 Indicate that no more input is available. | |
| 301 Must be :obj:`True` if this is the last call. | |
| 302 :returns: A Unicode string. | |
| 303 | |
| 304 """ | |
| 305 decoder = self._decoder | |
| 306 if decoder is not None: | |
| 307 return decoder(input, final) | |
| 308 | |
| 309 input = self._buffer + input | |
| 310 encoding, input = _detect_bom(input) | |
| 311 if encoding is None: | |
| 312 if len(input) < 3 and not final: # Not enough data yet. | |
| 313 self._buffer = input | |
| 314 return '' | |
| 315 else: # No BOM | |
| 316 encoding = self._fallback_encoding | |
| 317 decoder = encoding.codec_info.incrementaldecoder(self._errors).decode | |
| 318 self._decoder = decoder | |
| 319 self.encoding = encoding | |
| 320 return decoder(input, final) | |
| 321 | |
| 322 | |
| 323 class IncrementalEncoder(object): | |
| 324 """ | |
| 325 “Push”-based encoder. | |
| 326 | |
| 327 :param encoding: An :class:`Encoding` object or a label string. | |
| 328 :param errors: Type of error handling. See :func:`codecs.register`. | |
| 329 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. | |
| 330 | |
| 331 .. method:: encode(input, final=False) | |
| 332 | |
| 333 :param input: A Unicode string. | |
| 334 :param final: | |
| 335 Indicate that no more input is available. | |
| 336 Must be :obj:`True` if this is the last call. | |
| 337 :returns: A byte string. | |
| 338 | |
| 339 """ | |
| 340 def __init__(self, encoding=UTF8, errors='strict'): | |
| 341 encoding = _get_encoding(encoding) | |
| 342 self.encode = encoding.codec_info.incrementalencoder(errors).encode | 
