guppy_basecaller: env/lib/python3.7/site-packages/future/utils/surrogateescape.py comparison

comparison env/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

author	shellac
date	Sat, 02 May 2020 07:14:21 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:26e78fe6e8c4
+"""
+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
+handler of Python 3.
+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
+"""
+# This code is released under the Python license and the BSD 2-clause license
+import codecs
+import sys
+from future import utils
# Name under which the error handler is registered with the codecs module.
# An alternative name once considered for Python 2/3 compatibility was
# 'my_surrogateescape'; it is not used.
FS_ERRORS = 'surrogateescape'
def u(text):
    """
    Return ``text`` as a unicode string.

    When ``utils.PY3`` is true the argument is returned unchanged; otherwise
    it is decoded with the 'unicode_escape' codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
def b(data):
    """
    Return ``data`` as a byte string.

    When ``utils.PY3`` is true the argument is encoded with 'latin1';
    otherwise it is returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data
# Pick the "code point -> unicode character" and "integer -> single byte"
# constructors that match the running interpreter.
if not utils.PY3:
    _unichr = unichr  # noqa: F821 -- builtin that only exists on Python 2
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        """Return a bytes object of length one holding the value ``code``."""
        return bytes((code,))
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error
    handler of Python 3.

    On decoding, undecodable bytes are replaced by a Unicode character
    U+DCxx; on encoding, such characters are translated back.

    ``exc`` is the UnicodeDecodeError or UnicodeEncodeError passed in by the
    codec machinery. Returns a ``(replacement, exc.end)`` tuple. Any other
    exception type, or a span that cannot be translated, re-raises ``exc``.
    """
    fragment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The fragment is a byte string in this case.
        translate = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note from the original author: for
        # u'\udcc3'.encode('ascii', 'this_surrogateescape_handler') both
        # Python 2.x and 3.x still raise after this handler returns; it
        # seems the strict encoder is then applied to the unicode string
        # returned here.
        translate = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = translate(fragment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for untranslatable input."""
def replace_surrogate_encode(mystring):
    """
    Map each U+DCxx character of ``mystring`` back to the character U+00xx.

    Returns a (unicode) string, not the more logical bytes, because the
    codecs register_error functionality expects this.

    Raises NotASurrogateError if any character lies outside U+DC00..U+DCFF.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00..U+DCFF can be mapped back to a value in 0x00..0xFF.
        # The previous range check also let U+D800..U+DBFF through, and for
        # those `code - 0xDC00` is negative, so _unichr() raised ValueError
        # instead of the NotASurrogateError the caller catches.
        if not 0xDC00 <= code <= 0xDCFF:
            # Not an escapable surrogate. Fail with the original exception.
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
def replace_surrogate_decode(mybytes):
    """
    Translate a byte sequence into a (unicode) string.

    Values up to 0x7F become the character with that code point; values
    0x80..0xFF become the character U+DC80..U+DCFF. Anything larger raises
    NotASurrogateError.
    """
    chars = []
    for item in mybytes:
        # Iterating newbytes yields ints; iterating a native Py2 str yields
        # one-character strings.
        point = item if isinstance(item, int) else ord(item)
        if point <= 0x7F:
            chars.append(_unichr(point))
        elif point <= 0xFF:
            chars.append(_unichr(0xDC00 + point))
        else:
            raise NotASurrogateError
    return str().join(chars)
def encodefilename(fn):
    """
    Encode the unicode filename ``fn`` to bytes using FS_ENCODING.

    Characters U+DC80..U+DCFF are written out as the single bytes
    0x80..0xFF. The 'ascii' and 'utf-8' encodings are handled by hand; any
    other encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for a character that cannot be encoded in the
    hand-written branches.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape
        # error handler has to return bytes in 0x80-0xFF range.
        pieces = []
        for pos, char in enumerate(fn):
            point = ord(char)
            if 0xDC80 <= point <= 0xDCFF:
                pieces.append(bytes_chr(point - 0xDC00))
            elif point < 128:
                pieces.append(bytes_chr(point))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'ordinal not in range(128)')
        return bytes().join(pieces)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler.
        pieces = []
        for pos, char in enumerate(fn):
            point = ord(char)
            if 0xDC80 <= point <= 0xDCFF:
                pieces.append(bytes_chr(point - 0xDC00))
            elif 0xD800 <= point <= 0xDFFF:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'surrogates not allowed')
            else:
                pieces.append(char.encode('utf-8'))
        return bytes().join(pieces)

    return fn.encode(FS_ENCODING, FS_ERRORS)
def decodefilename(fn):
    """Decode ``fn`` with FS_ENCODING, using FS_ERRORS as the error handler."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
# Sample data for the currently selected encoding: a raw byte filename and
# the unicode string it is expected to decode to.
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative configurations:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')

# Store the normalized filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup('ascii').name
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing when ``utils.PY3`` is true, or when a handler is already
    registered under the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler registered under that name yet: install ours.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
if __name__ == '__main__':
    # Running the module as a script does nothing. The round-trip check
    # below is kept for reference but is disabled:
    #
    #     register_surrogateescape()
    #     b = decodefilename(fn)
    #     assert b == encoded, "%r != %r" % (b, encoded)
    #     c = encodefilename(b)
    #     assert c == fn, '%r != %r' % (c, fn)
    #     # print("ok")
    pass

Mercurial > repos > shellac > guppy_basecaller

comparison env/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 0:26e78fe6e8c4 draft