Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/idna/core.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:d30785e31577 | 1:56ad4e20f292 | 
|---|---|
| 1 from . import idnadata | |
| 2 import bisect | |
| 3 import unicodedata | |
| 4 import re | |
| 5 import sys | |
| 6 from .intranges import intranges_contain | |
| 7 | |
| 8 _virama_combining_class = 9 | |
| 9 _alabel_prefix = b'xn--' | |
| 10 _unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]') | |
| 11 | |
| 12 if sys.version_info[0] >= 3: | |
| 13 unicode = str | |
| 14 unichr = chr | |
| 15 | |
class IDNAError(UnicodeError):
    """Root of the exception hierarchy for every IDNA-encoding problem."""
| 19 | |
| 20 | |
class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional text requirements."""
| 24 | |
| 25 | |
class InvalidCodepoint(IDNAError):
    """Raised when a label uses a disallowed or unallocated codepoint."""
| 29 | |
| 30 | |
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
| 34 | |
| 35 | |
def _combining_class(cp):
    """Return the canonical combining class of the code point *cp*.

    Raises ValueError when the class is 0 and ``unicodedata`` has no name
    for the character, i.e. the character is unknown to the database.
    """
    combining = unicodedata.combining(unichr(cp))
    # A class of 0 is also what unknown characters report, so confirm the
    # character is actually known before trusting the value.
    if combining == 0 and not unicodedata.name(unichr(cp)):
        raise ValueError("Unknown character in unicodedata")
    return combining
| 42 | |
def _is_script(cp, script):
    """Return whether character *cp* falls in the ranges stored for *script*."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)
| 45 | |
| 46 def _punycode(s): | |
| 47 return s.encode('punycode') | |
| 48 | |
| 49 def _unot(s): | |
| 50 return 'U+{0:04X}'.format(s) | |
| 51 | |
| 52 | |
def valid_label_length(label):
    """Return True when *label* is at most 63 items long, else False."""
    return len(label) <= 63
| 58 | |
| 59 | |
def valid_string_length(label, trailing_dot):
    """Return True when the whole name fits the length limit.

    The limit is 254 when *trailing_dot* is truthy and 253 otherwise.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit
| 65 | |
| 66 | |
def check_bidi(label, check_ltr=False):
    """Validate *label* against the Bidi rules and return True on success.

    The rules are skipped (returning True immediately) when the label holds
    no character of directionality R, AL or AN, unless *check_ltr* is set.

    Raises IDNABidiError when a character has no known directionality or
    when any of the Bidi rules 1-6 is violated.
    """
    # Bidi rules should only be applied if the string contains RTL characters.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the label's direction.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        rtl = False
    elif first_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    ends_ok = False
    numeral_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if rtl:
            # Bidi rule 2
            if bidi_class not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(position))
            # Bidi rule 3: trailing NSM characters do not affect the ending.
            if bidi_class in ('R', 'AL', 'EN', 'AN'):
                ends_ok = True
            elif bidi_class != 'NSM':
                ends_ok = False
            # Bidi rule 4: only one kind of numeral (AN or EN) per label.
            if bidi_class in ('AN', 'EN'):
                if not numeral_class:
                    numeral_class = bidi_class
                elif numeral_class != bidi_class:
                    raise IDNABidiError('Can not mix numeral types in a right-to-left label')
        else:
            # Bidi rule 5
            if bidi_class not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(position))
            # Bidi rule 6: trailing NSM characters do not affect the ending.
            if bidi_class in ('L', 'EN'):
                ends_ok = True
            elif bidi_class != 'NSM':
                ends_ok = False

    if not ends_ok:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
| 125 | |
| 126 | |
def check_initial_combiner(label):
    """Return True unless *label* starts with a combining mark.

    Raises IDNAError when the general category of the first character
    begins with ``M``.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
| 132 | |
| 133 | |
def check_hyphen_ok(label):
    """Return True when the hyphens in *label* are acceptably placed.

    Raises IDNAError when the 3rd and 4th characters are both hyphens, or
    when the label starts or ends with a hyphen.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    first, last = label[0], label[-1]
    if '-' in (first, last):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
| 141 | |
| 142 | |
def check_nfc(label):
    """Raise IDNAError unless *label* is already in Normalization Form C."""
    normalized = unicodedata.normalize('NFC', label)
    if label != normalized:
        raise IDNAError('Label must be in Normalization Form C')
| 147 | |
| 148 | |
def valid_contextj(label, pos):
    """Return whether the joiner at ``label[pos]`` satisfies its CONTEXTJ rule.

    U+200C (ZERO WIDTH NON-JOINER) is accepted when it directly follows a
    character whose combining class equals ``_virama_combining_class``, or
    when the nearest non-transparent character before it has joining type
    L or D and the nearest non-transparent character after it has joining
    type R or D.  U+200D (ZERO WIDTH JOINER) is accepted only directly after
    such a Virama character.  Any other code point yields False.

    ``_combining_class`` may raise ValueError for a preceding character that
    is unknown to ``unicodedata``; that exception is not handled here.
    """
    cp_value = ord(label[pos])
    if cp_value not in (0x200c, 0x200d):
        return False

    # Both joiners are allowed immediately after a Virama.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if cp_value == 0x200d:
        return False

    transparent = ord('T')

    # Look left: skip transparent characters, then the first other character
    # decides.  Stopping there (instead of scanning the rest of the label)
    # keeps the rule faithful to the "(L|D) T* ZWNJ T* (R|D)" pattern and
    # avoids a full-label scan for every joiner.
    left_ok = False
    for i in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        left_ok = joining_type in (ord('L'), ord('D'))
        break
    if not left_ok:
        return False

    # Look right with the same "first non-transparent character decides" rule.
    for i in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        return joining_type in (ord('R'), ord('D'))
    return False
| 191 | |
| 192 | |
def valid_contexto(label, pos, exception=False):
    """Return whether the character at ``label[pos]`` satisfies its CONTEXTO rule.

    Rules implemented, keyed on the code point at *pos*:

    * U+00B7: must sit between two U+006C characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: the label must contain at least one Hiragana, Katakana or Han
      character.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

    Any other code point has no rule here and yields False.  The *exception*
    parameter is accepted for interface compatibility and is not used.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        if 0 < pos < len(label) - 1:
            if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                return True
        return False

    elif cp_value == 0x0375:
        # Needs a following character to inspect.
        if pos < len(label) - 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    elif cp_value == 0x05f3 or cp_value == 0x05f4:
        # Needs a preceding character to inspect.
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    elif cp_value == 0x30fb:
        for cp in label:
            if cp == u'\u30fb':
                continue
            if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                return True
        return False

    elif 0x660 <= cp_value <= 0x669:
        for cp in label:
            if 0x6f0 <= ord(cp) <= 0x06f9:
                return False
        return True

    elif 0x6f0 <= cp_value <= 0x6f9:
        for cp in label:
            if 0x660 <= ord(cp) <= 0x0669:
                return False
        return True

    # No contextual rule matched: report the character as not allowed
    # explicitly instead of falling off the end and returning None.
    return False
| 232 | |
| 233 | |
def check_label(label):
    """Validate a single label, raising on the first problem found.

    Byte input is decoded as UTF-8 first.  The label must be non-empty, in
    NFC, have acceptable hyphen placement, not start with a combining mark,
    consist only of PVALID code points or CONTEXTJ/CONTEXTO code points
    whose contextual rule holds, and finally pass ``check_bidi``.

    Raises:
        IDNAError: empty label, failed NFC/hyphen/combiner check, or an
            unknown character next to a joiner.
        InvalidCodepointContext: a CONTEXTJ/CONTEXTO code point whose
            contextual rule is not satisfied.
        InvalidCodepoint: a code point in none of the accepted classes.
        IDNABidiError: raised by ``check_bidi``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the rule evaluation belongs in the try block.  The
            # exception classes of this module derive from UnicodeError,
            # which is itself a ValueError, so raising
            # InvalidCodepointContext inside the try would be caught below
            # and misreported as an unknown-codepoint error.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)
| 264 | |
| 265 | |
def alabel(label):
    """Return the A-label (ASCII byte string) for *label*.

    A label that encodes to ASCII is validated through ``ulabel`` and
    returned as those ASCII bytes.  Otherwise the label is validated with
    ``check_label``, Punycode-encoded and prefixed with ``_alabel_prefix``.

    Raises IDNAError when the label is empty, fails validation, or the
    result is longer than ``valid_label_length`` allows.
    """
    try:
        label = label.encode('ascii')
        # Called for its validation side effect; the return value is unused.
        ulabel(label)
        if valid_label_length(label):
            return label
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path below.
        pass

    if not label:
        raise IDNAError('No Input')

    label = unicode(label)
    check_label(label)
    encoded = _alabel_prefix + _punycode(label)

    if not valid_label_length(encoded):
        raise IDNAError('Label too long')

    return encoded
| 289 | |
| 290 | |
def ulabel(label):
    """Return the U-label (text form) for *label*.

    Text that cannot be encoded as ASCII is validated with ``check_label``
    and returned unchanged.  ASCII input is lower-cased; if it does not
    start with ``_alabel_prefix`` it is validated and returned as text,
    otherwise the prefix is stripped, the remainder is Punycode-decoded,
    validated and returned.

    Raises IDNAError when the A-label has no content after the prefix, ends
    with a hyphen, is not decodable Punycode, or fails ``check_label``.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    label = label.lower()
    if not label.startswith(_alabel_prefix):
        check_label(label)
        return label.decode('ascii')

    label = label[len(_alabel_prefix):]
    if not label:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if label.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    # Report malformed Punycode through the module's own exception type so
    # callers that catch IDNAError see it; IDNAError is still a UnicodeError,
    # so existing ``except UnicodeError`` handlers keep working.
    try:
        label = label.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label
| 314 | |
| 315 | |
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data``; depending on the row's
    status and the *std3_rules* / *transitional* flags it is kept, replaced
    by the row's replacement text, dropped (status ``"I"``), or rejected.
    The remapped text is returned in Normalization Form C.

    Raises InvalidCodepoint for a character that is neither kept, replaced
    nor dropped, or whose table lookup fails with IndexError.
    """
    from .uts46data import uts46data
    # Collect the pieces and join once rather than growing a string with +=.
    pieces = []
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            # Code points below 256 index the table directly; others are
            # located by bisecting for the last row starting at or below
            # the code point.
            uts46row = uts46data[code_point if code_point < 256 else
                                 bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and not std3_rules and replacement is None)):
                pieces.append(char)
            elif replacement is not None and (status == "M" or
                                              (status == "3" and not std3_rules) or
                                              (status == "D" and transitional)):
                pieces.append(replacement)
            elif status != "I":
                # Handled by the except clause below, which reports the
                # offending code point and position.
                raise IndexError()
        return unicodedata.normalize("NFC", u"".join(pieces))
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
                _unot(code_point), pos + 1, repr(domain)))
| 342 | |
| 343 | |
def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
    """Encode the domain name *s* to its ASCII byte-string form.

    Byte input is decoded as ASCII first.  With *uts46* set the name is
    passed through ``uts46_remap`` (using *std3_rules* and *transitional*).
    With *strict* only ``'.'`` separates labels; otherwise every separator
    matched by ``_unicode_dots_re`` does.  Each label is converted with
    ``alabel`` and the results are joined with ``b'.'``; a trailing
    separator in the input is preserved.

    Raises IDNAError for an empty domain, an empty label, or a result that
    fails ``valid_string_length``; errors from ``alabel`` propagate.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final piece means the input ended with a separator.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b'')

    joined = b'.'.join(encoded_labels)
    if not valid_string_length(joined, trailing_dot):
        raise IDNAError('Domain too long')
    return joined
| 373 | |
| 374 | |
def decode(s, strict=False, uts46=False, std3_rules=False):
    """Decode the domain name *s* to its text form.

    Byte input is decoded as ASCII first.  With *uts46* set the name is
    passed through ``uts46_remap`` (non-transitional).  With *strict* only
    ``'.'`` separates labels; otherwise every separator matched by
    ``_unicode_dots_re`` does.  Each label is converted with ``ulabel`` and
    the results are joined with ``'.'``; a trailing separator in the input
    is preserved.

    Raises IDNAError for an empty domain or an empty label; errors from
    ``ulabel`` propagate.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(u'.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final piece means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append(u'')
    return u'.'.join(decoded_labels)
