Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/future/utils/surrogateescape.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 """ | |
2 This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error | |
3 handler of Python 3. | |
4 | |
5 Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc | |
6 """ | |
7 | |
8 # This code is released under the Python license and the BSD 2-clause license | |
9 | |
10 import codecs | |
11 import sys | |
12 | |
13 from future import utils | |
14 | |
15 | |
# Name of the codec error handler used by encodefilename()/decodefilename()
# and registered by register_surrogateescape().
FS_ERRORS = "surrogateescape"
17 | |
18 # # -- Python 2/3 compatibility ------------------------------------- | |
19 # FS_ERRORS = 'my_surrogateescape' | |
20 | |
def u(text):
    """
    Return ``text`` as a text string.

    On Python 3 the argument is returned untouched; otherwise it is decoded
    with the 'unicode_escape' codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
26 | |
def b(data):
    """
    Return ``data`` as a byte string.

    On Python 3 the argument is encoded with 'latin1'; otherwise it is
    returned untouched.
    """
    return data.encode('latin1') if utils.PY3 else data
32 | |
# Pick the "code point -> text character" and "int -> single byte" helpers
# matching the running interpreter.
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        # bytes(n) would build n zero bytes; wrap the value in a tuple to
        # get a one-byte string holding ``code`` instead.
        return bytes((code,))
39 | |
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error
    handler of Python 3.

    On decoding, undecodable bytes are replaced by a Unicode character
    U+DCxx; on encoding, those characters are translated back.

    Returns a ``(replacement, resume_position)`` tuple, where the resume
    position is ``exc.end``. Re-raises ``exc`` if it is neither a
    UnicodeDecodeError nor a UnicodeEncodeError, or if the offending slice
    cannot be converted.
    """
    segment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I
        # think it's doing what it should. It seems that the strict encoder
        # is called to encode the unicode string that this function
        # returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(segment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
65 | |
66 | |
class NotASurrogateError(Exception):
    """Raised when input cannot be handled by the surrogateescape helpers."""


def replace_surrogate_encode(mystring):
    """
    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Every character of ``mystring`` must lie in U+DC00..U+DCFF; it is mapped
    to the character whose code point is the original one minus 0xDC00.

    Raises NotASurrogateError for any character outside that range, so that
    the caller can fail with the original codec exception.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF can be mapped back. The previous guard let
        # U+D800..U+DBFF through as well, which then crashed with a
        # ValueError from _unichr() on a negative argument instead of
        # reporting "not a surrogate".
        # NOTE(review): PEP 383 only defines U+DC80..U+DCFF; the
        # U+DC00..U+DC7F -> ASCII mapping is kept for backward compatibility.
        if not 0xDC00 <= code <= 0xDCFF:
            # Not an escapable surrogate. Fail with the original exception.
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
98 | |
99 | |
def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string.

    Values 0x80..0xFF become the characters U+DC80..U+DCFF, values up to 0x7F
    become the character with that same code point, and anything above 0xFF
    raises NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating newbytes yields ints, whereas a native Py2 str yields
        # one-character strings.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            # Shift the undecodable byte into the U+DC80..U+DCFF range.
            value += 0xDC00
        pieces.append(_unichr(value))
    return str().join(pieces)
123 | |
124 | |
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using FS_ENCODING.

    The 'ascii' and 'utf-8' encodings are handled by hand, turning
    U+DC80..U+DCFF back into the bytes 0x80..0xFF; every other encoding is
    delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for characters the hand-written paths cannot
    represent.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape
        # error handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            cp = ord(char)
            if cp < 128:
                chunks.append(bytes_chr(cp))
            elif 0xDC80 <= cp <= 0xDCFF:
                chunks.append(bytes_chr(cp - 0xDC00))
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler.
        chunks = []
        for pos, char in enumerate(fn):
            cp = ord(char)
            if not 0xD800 <= cp <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= cp <= 0xDCFF:
                chunks.append(bytes_chr(cp - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
163 | |
def decodefilename(fn):
    """
    Decode the byte filename ``fn`` using FS_ENCODING with the FS_ERRORS
    error handler and return the result.
    """
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
166 | |
# Active encoding plus a sample byte filename and its expected decoded form.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name to the codec registry's spelling.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
175 | |
176 | |
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing on Python 3, or when a handler is already registered under
    the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            # Probe first so an existing handler is never replaced.
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
187 | |
188 | |
if __name__ == "__main__":
    # Running the module as a script is a no-op. The round-trip self-test
    # below is kept disabled:
    #
    # register_surrogateescape()
    #
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
    pass