comparison env/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 """
2 This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
3 handler of Python 3.
4
5 Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
6 """
7
8 # This code is released under the Python license and the BSD 2-clause license
9
10 import codecs
11 import sys
12
13 from future import utils
14
15
# Name of the codec error handler used by this module when encoding and
# decoding filenames, and the name under which register_surrogateescape()
# installs the pure-Python handler.
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
20
def u(text):
    """
    Return ``text`` as a text string.

    When ``utils.PY3`` is true the argument is returned unchanged; otherwise
    it is decoded with the 'unicode_escape' codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
26
def b(data):
    """
    Return ``data`` as a byte string.

    When ``utils.PY3`` is true the argument is encoded with 'latin1';
    otherwise it is returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data
32
# Pick the interpreter-specific primitives:
#   _unichr(code)   -> one-character text string for a code point
#   bytes_chr(code) -> one-element byte string for a byte value
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
39
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error
    handler of Python 3.

    On decoding, the failing bytes are converted by replace_surrogate_decode();
    on encoding, the failing characters are converted by
    replace_surrogate_encode().

    Returns a ``(replacement, exc.end)`` tuple, as expected from a codec
    error handler. Re-raises ``exc`` when it is neither a UnicodeDecodeError
    nor a UnicodeEncodeError, or when the failing span cannot be converted.
    """
    failing = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The failing span is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I think
        # it's doing what it should. It seems that the strict encoder is
        # called to encode the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(failing)
    except NotASurrogateError:
        # The span is not something this handler can escape: fail with the
        # original codec error.
        raise exc
    return (replacement, exc.end)
65
66
class NotASurrogateError(Exception):
    """Raised when a byte or character cannot be handled by surrogateescape."""
69
70
def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to the characters whose
    ordinals are the original byte values.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Every character of ``mystring`` must lie in U+DC00..U+DCFF; it is mapped
    to the character with ordinal ``code - 0xDC00`` (U+0000..U+00FF).

    Raises NotASurrogateError if any character is outside that range, so the
    caller can fail with the original codec exception.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF can be un-escaped. Everything else must be
        # rejected here -- in particular the high surrogates U+D800..U+DBFF,
        # for which ``code - 0xDC00`` would be negative and make _unichr()
        # raise ValueError instead of the expected NotASurrogateError.
        # NOTE(review): PEP 383 only defines U+DC80..U+DCFF as escapes; the
        # wider U+DC00..U+DC7F range is kept for backward compatibility.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
98
99
def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string built from the undecodable bytes ``mybytes``.

    Values up to 0x7F become the character with the same ordinal; values in
    0x80..0xFF become the escape character U+DC80..U+DCFF. Any larger value
    raises NotASurrogateError.
    """
    escaped = []
    for item in mybytes:
        # We may be parsing newbytes (in which case each item is an int) or a
        # native str on Py2 (in which case each item is a 1-char string).
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            escaped.append(_unichr(value))
        elif value <= 0xFF:
            escaped.append(_unichr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(escaped)
123
124
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using FS_ENCODING, turning the
    escape characters U+DC80..U+DCFF back into the bytes 0x80..0xFF.

    The 'ascii' and 'utf-8' encodings are handled by hand; any other encoding
    is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for a character that cannot be encoded.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    # Why the two encodings below are done manually:
    # - ASCII encoder of Python 2 expects that the error handler returns a
    #   Unicode string encodable to ASCII, whereas our surrogateescape error
    #   handler has to return bytes in 0x80-0xFF range.
    # - UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
    #   doesn't go through our error handler.
    ascii_only = FS_ENCODING == 'ascii'
    pieces = []
    for pos, char in enumerate(fn):
        point = ord(char)
        if 0xDC80 <= point <= 0xDCFF:
            # Escaped byte: restore the original value.
            pieces.append(bytes_chr(point - 0xDC00))
        elif ascii_only:
            if point >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            pieces.append(bytes_chr(point))
        elif 0xD800 <= point <= 0xDFFF:
            raise UnicodeEncodeError(
                FS_ENCODING,
                fn, pos, pos + 1, 'surrogates not allowed')
        else:
            pieces.append(char.encode('utf-8'))
    return bytes().join(pieces)
163
def decodefilename(fn):
    """
    Decode the byte filename ``fn`` with FS_ENCODING, using the FS_ERRORS
    error handler for bytes that cannot be decoded.
    """
    text = fn.decode(FS_ENCODING, FS_ERRORS)
    return text
166
# Filesystem encoding used by encodefilename()/decodefilename(), together
# with a sample byte filename and its expected decoded form (used by the
# disabled self-test at the bottom of this file).
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
175
176
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing when ``utils.PY3`` is true, or when an error handler named
    FS_ERRORS is already registered.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler under that name yet: install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
187
188
if __name__ == '__main__':
    # Nothing is executed when run as a script; the round-trip self-test
    # below is kept for reference but disabled.
    pass
    # # Tests:
    # register_surrogateescape()

    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
198 # # print("ok")