Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/idna/core.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 from . import idnadata | |
2 import bisect | |
3 import unicodedata | |
4 import re | |
5 import sys | |
6 from .intranges import intranges_contain | |
7 | |
# Combining-class value that valid_contextj() compares the preceding
# character's combining class against.
_virama_combining_class = 9
# Byte prefix that alabel() prepends to Punycode output and that ulabel()
# strips before Punycode-decoding.
_alabel_prefix = b'xn--'
# Matches any one of the four code points U+002E, U+3002, U+FF0E, U+FF61;
# encode()/decode() split on it when not running in strict mode.
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')

# On Python 3 and later, provide the names ``unicode`` and ``unichr`` that the
# rest of this module uses, as aliases of ``str`` and ``chr``.
if sys.version_info[0] >= 3:
    unicode = str
    unichr = chr
15 | |
class IDNAError(UnicodeError):
    """Root of this module's exception hierarchy for IDNA-related failures."""
19 | |
20 | |
class IDNABidiError(IDNAError):
    """Raised when a label fails the bidirectional requirements."""
24 | |
25 | |
class InvalidCodepoint(IDNAError):
    """Raised when a disallowed or unallocated codepoint is encountered."""
29 | |
30 | |
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
34 | |
35 | |
def _combining_class(cp):
    """Return the canonical combining class of the integer code point *cp*.

    Raises ValueError when the combining class is 0 and ``unicodedata`` has
    no name for the character.
    """
    char = unichr(cp)
    combining = unicodedata.combining(char)
    # A class of 0 alone does not tell whether the character is known, so the
    # name lookup is used as a second probe.
    if combining == 0 and not unicodedata.name(char):
        raise ValueError("Unknown character in unicodedata")
    return combining
42 | |
def _is_script(cp, script):
    """Tell whether character *cp* lies in the ranges ``idnadata.scripts[script]``."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)
45 | |
46 def _punycode(s): | |
47 return s.encode('punycode') | |
48 | |
49 def _unot(s): | |
50 return 'U+{0:04X}'.format(s) | |
51 | |
52 | |
def valid_label_length(label):
    """Return True when *label* is at most 63 items long, False otherwise."""
    return len(label) <= 63
58 | |
59 | |
def valid_string_length(label, trailing_dot):
    """Return True when *label* fits the length limit.

    The limit is 254 when *trailing_dot* is true and 253 otherwise.
    """
    size = len(label)
    limit = 254 if trailing_dot else 253
    return size <= limit
65 | |
66 | |
def check_bidi(label, check_ltr=False):
    """Check *label* against the Bidi rules numbered in the comments below.

    The rules are enforced only when the label contains a character whose
    bidirectional class is R, AL or AN, or when *check_ltr* is true;
    otherwise the label is accepted immediately.

    Returns True on success and raises IDNABidiError on any violation.
    """
    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1
    leading_class = unicodedata.bidirectional(label[0])
    if leading_class == 'L':
        rtl = False
    elif leading_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    ends_validly = False
    numeral_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if not rtl:
            # Bidi rule 5
            if bidi_class not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(position))
            # Bidi rule 6
            if bidi_class in ('L', 'EN'):
                ends_validly = True
            elif bidi_class != 'NSM':
                ends_validly = False
            continue

        # Bidi rule 2
        if bidi_class not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
            raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(position))
        # Bidi rule 3
        if bidi_class in ('R', 'AL', 'EN', 'AN'):
            ends_validly = True
        elif bidi_class != 'NSM':
            ends_validly = False
        # Bidi rule 4
        if bidi_class in ('AN', 'EN'):
            if not numeral_class:
                # Remember the first numeral class seen in this label.
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_validly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
125 | |
126 | |
def check_initial_combiner(label):
    """Reject labels whose first character has a general category starting with 'M'.

    Returns True when the label passes; raises IDNAError otherwise.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
132 | |
133 | |
def check_hyphen_ok(label):
    """Validate hyphen placement in *label*.

    Raises IDNAError when positions 3 and 4 are both hyphens, or when the
    label starts or ends with a hyphen.  Returns True otherwise.
    """
    middle = label[2:4]
    if middle == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    first, last = label[0], label[-1]
    if first == '-' or last == '-':
        raise IDNAError('Label must not start or end with a hyphen')
    return True
141 | |
142 | |
def check_nfc(label):
    """Raise IDNAError unless *label* equals its own NFC normalization."""
    normalized = unicodedata.normalize('NFC', label)
    if normalized != label:
        raise IDNAError('Label must be in Normalization Form C')
147 | |
148 | |
def valid_contextj(label, pos):
    """Decide whether the joiner at ``label[pos]`` is allowed in its context.

    Rules implemented:

    * U+200C: allowed when the preceding character has the virama combining
      class, or when it is preceded by a joining-type L or D character and
      followed by a joining-type R or D character, with only joining-type T
      characters permitted in between on either side.
    * U+200D: allowed only when the preceding character has the virama
      combining class.
    * Any other code point: not allowed.

    Returns True or False.  ``_combining_class`` may raise ValueError for a
    preceding character unknown to ``unicodedata``.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x200c:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True

        # Walk backwards over joining-type T characters; the first character
        # that is not T must be L or D.  Any other character ends the search
        # (previously the scan skipped over it and kept looking, which
        # accepted labels the rule does not allow).
        ok = False
        for i in range(pos-1, -1, -1):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            if joining_type in [ord('L'), ord('D')]:
                ok = True
            break

        if not ok:
            return False

        # Same walk forwards: the first non-T character must be R or D.
        ok = False
        for i in range(pos+1, len(label)):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            if joining_type in [ord('R'), ord('D')]:
                ok = True
            break
        return ok

    if cp_value == 0x200d:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True
        return False

    else:

        return False
191 | |
192 | |
def valid_contexto(label, pos, exception=False):
    """Decide whether the character at ``label[pos]`` is allowed in its context.

    Rules implemented, keyed on the code point at *pos*:

    * U+00B7: must sit directly between two U+006C characters.
    * U+0375: the following character must be in the 'Greek' script.
    * U+05F3 / U+05F4: the preceding character must be in the 'Hebrew' script.
    * U+30FB: the label must contain at least one character in the
      'Hiragana', 'Katakana' or 'Han' script.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

    Any other code point has no rule here and yields False (the original
    fell off the end and returned None in that case).

    *exception* is not used by this function; it is kept so existing callers
    that pass it keep working.

    Always returns a bool.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        if 0 < pos < len(label)-1:
            if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                return True
        return False

    elif cp_value == 0x0375:
        if pos < len(label)-1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    elif cp_value == 0x05f3 or cp_value == 0x05f4:
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    elif cp_value == 0x30fb:
        for cp in label:
            if cp == u'\u30fb':
                continue
            if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                return True
        return False

    elif 0x660 <= cp_value <= 0x669:
        # The two digit ranges must not be mixed within one label.
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    elif 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    # No contextual rule is defined here for this code point.
    return False
232 | |
233 | |
def check_label(label):
    """Validate a single label, raising on the first problem found.

    *label* may be text or ``bytes``/``bytearray`` (decoded as UTF-8 first).
    The checks run in this order: non-empty, NFC, hyphen placement, initial
    combiner, per-codepoint class (PVALID / CONTEXTJ / CONTEXTO), and finally
    the Bidi rules.

    Raises:
        IDNAError: empty label, failure of one of the structural checks, or
            an unknown character next to a joiner.
        InvalidCodepointContext: a CONTEXTJ or CONTEXTO codepoint is not
            allowed where it appears.
        InvalidCodepoint: a codepoint is in none of the permitted classes.
        IDNABidiError: raised by ``check_bidi``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the lookup itself is guarded.  InvalidCodepointContext
            # derives from ValueError (via IDNAError -> UnicodeError), so
            # raising it inside the ``try`` would be swallowed by the
            # ``except ValueError`` and replaced with the wrong message.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)
264 | |
265 | |
def alabel(label):
    """Convert a single label to its ASCII byte form.

    A label that encodes cleanly as ASCII is validated through ``ulabel``
    and returned as bytes.  Any other label is validated with
    ``check_label``, Punycode-encoded and prefixed with ``_alabel_prefix``.

    Raises IDNAError when the input is empty or the result is too long, in
    addition to whatever the validation helpers raise.
    """
    try:
        label = label.encode('ascii')
        ulabel(label)
        if valid_label_length(label):
            return label
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path below.
        pass

    if not label:
        raise IDNAError('No Input')

    text = unicode(label)
    check_label(text)
    encoded = _alabel_prefix + _punycode(text)

    if valid_label_length(encoded):
        return encoded
    raise IDNAError('Label too long')
289 | |
290 | |
def ulabel(label):
    """Convert a single label to its text form.

    * Text that cannot be encoded as ASCII is validated with ``check_label``
      and returned unchanged.
    * ASCII input (text or bytes) is lower-cased.  Without the
      ``_alabel_prefix`` it is validated and returned as text; with the
      prefix, the remainder is Punycode-decoded, validated and returned.

    Raises IDNAError when the part after the prefix is empty, ends with a
    hyphen, or is not valid Punycode, in addition to whatever
    ``check_label`` raises.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label = label.encode('ascii')
        except UnicodeEncodeError:
            # Contains non-ASCII text: validate and hand it back as given.
            check_label(label)
            return label

    label = label.lower()
    if label.startswith(_alabel_prefix):
        label = label[len(_alabel_prefix):]
        if not label:
            raise IDNAError('Malformed A-label, no Punycode eligible content found')
        if label.decode('ascii')[-1] == '-':
            raise IDNAError('A-label must not end with a hyphen')
    else:
        check_label(label)
        return label.decode('ascii')

    # Report malformed Punycode through this module's own exception type
    # rather than leaking the codec's raw UnicodeError.  IDNAError is itself
    # a UnicodeError, so callers catching UnicodeError are unaffected.
    try:
        label = label.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label
314 | |
315 | |
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data``.  Depending on the row's
    status and the two flags, the character is kept, replaced by the row's
    replacement, dropped (status "I"), or rejected.  The result is returned
    NFC-normalized.

    Raises InvalidCodepoint when a character is rejected or the table lookup
    fails with IndexError.
    """
    from .uts46data import uts46data
    pieces = []
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            if code_point < 256:
                # The first 256 rows are indexed directly by code point.
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
            uts46row = uts46data[row_index]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None

            keep_as_is = (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            )
            if keep_as_is:
                pieces.append(char)
                continue

            use_replacement = replacement is not None and (
                status == "M"
                or (status == "3" and not std3_rules)
                or (status == "D" and transitional)
            )
            if use_replacement:
                pieces.append(replacement)
                continue

            if status != "I":
                # Reuse the IndexError handler below to report the codepoint.
                raise IndexError()
        return unicodedata.normalize("NFC", u"".join(pieces))
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
                _unot(code_point), pos + 1, repr(domain)))
342 | |
343 | |
def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
    """Encode a domain name to its ASCII byte form, label by label.

    ``bytes``/``bytearray`` input is first decoded as ASCII.  With *uts46*
    the input is passed through ``uts46_remap`` (forwarding *std3_rules* and
    *transitional*).  With *strict* only ``.`` separates labels; otherwise
    every character matched by ``_unicode_dots_re`` does.  A single trailing
    separator is preserved in the output.

    Raises IDNAError for an empty domain, an empty label or an over-long
    result, in addition to whatever ``alabel`` raises.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    trailing_dot = labels[-1] == ''
    if trailing_dot:
        del labels[-1]

    result = []
    for label in labels:
        encoded_label = alabel(label)
        if not encoded_label:
            raise IDNAError('Empty label')
        result.append(encoded_label)
    if trailing_dot:
        # An empty final element makes the join end with a dot.
        result.append(b'')

    encoded = b'.'.join(result)
    if not valid_string_length(encoded, trailing_dot):
        raise IDNAError('Domain too long')
    return encoded
373 | |
374 | |
def decode(s, strict=False, uts46=False, std3_rules=False):
    """Decode a domain name to its text form, label by label.

    ``bytes``/``bytearray`` input is first decoded as ASCII.  With *uts46*
    the input is passed through ``uts46_remap`` (forwarding *std3_rules*,
    with transitional processing off).  With *strict* only ``.`` separates
    labels; otherwise every character matched by ``_unicode_dots_re`` does.
    A single trailing separator is preserved in the output.

    Raises IDNAError for an empty domain or an empty label, in addition to
    whatever ``ulabel`` raises.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(u'.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    result = []
    for label in labels:
        decoded_label = ulabel(label)
        if not decoded_label:
            raise IDNAError('Empty label')
        result.append(decoded_label)
    if trailing_dot:
        # An empty final element makes the join end with a dot.
        result.append(u'')
    return u'.'.join(result)