Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/bs4/dammit.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """Beautiful Soup bonus library: Unicode, Dammit | |
3 | |
4 This library converts a bytestream to Unicode through any means | |
5 necessary. It is heavily based on code from Mark Pilgrim's Universal | |
6 Feed Parser. It works best on XML and HTML, but it does not rewrite the | |
7 XML or HTML to reflect a new encoding; that's the tree builder's job. | |
8 """ | |
9 # Use of this source code is governed by the MIT license. | |
10 __license__ = "MIT" | |
11 | |
12 import codecs | |
13 from html.entities import codepoint2name | |
14 import re | |
15 import logging | |
16 import string | |
17 | |
# Pick an encoding autodetection backend. Whichever branch wins defines
# chardet_dammit(s), which returns the name of the guessed encoding for
# a bytestring, or None when no guess can be made.
chardet_type = None
try:
    # Preferred backend: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # Neither backend is installed, so there is never a guess.
        def chardet_dammit(s):
            """Return None: no detection library is available."""
            return None
    else:
        def chardet_dammit(s):
            """Return chardet's guess for `s`, or None if `s` is already text."""
            return None if isinstance(s, str) else chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Return cchardet's guess for `s`, or None if `s` is already text."""
        return None if isinstance(s, str) else cchardet.detect(s)['encoding']
44 | |
45 # Available from http://cjkpython.i18n.org/. | |
46 # | |
47 # TODO: This doesn't work anymore and the closest thing, iconv_codecs, | |
48 # is GPL-licensed. Check whether this is still necessary. | |
49 try: | |
50 import iconv_codec | |
51 except ImportError: | |
52 pass | |
53 | |
# Patterns that locate an encoding declared inside an XML or HTML
# document. Each pattern is compiled twice, once for bytestrings and
# once for Unicode strings, and stored under the markup's type.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
67 | |
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the HTML entity tables used by this class.

        Called once, while the class body executes.

        :return: A 3-tuple (character -> entity name,
            entity name -> character, compiled regular expression
            matching any single character that has a named HTML entity).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not start a
    # numeric (&#123;), hexadecimal (&#x1F;) or named (&name;) entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character.

        :param matchobj: A match whose full text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The entity reference, e.g. "&amp;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotation marks.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: `s` with every character that has a named HTML entity
            (other than the two quotation marks) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
223 | |
224 | |
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        # Stored lower-cased so that _usable() can compare case-insensitively.
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if `encoding` is neither excluded nor already tried.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        The declared and chardet-detected encodings are computed on
        first use and cached on the instance.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
        # (FF FE), so the UTF-16 branches must reject data whose next two
        # bytes are zero. The comparison has to be bytes-to-bytes: a
        # bytes object never equals a str in Python 3.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lower-cased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Use the patterns compiled for the same type as the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
388 | |
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
           before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # None stands for "no encodings"; EncodingDetector treats a
        # missing list as empty.
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Only a genuine failure (None) triggers the lossy retry. An
        # empty document decodes to '' on the first pass, and that is a
        # success, not a reason to warn about replacement characters.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A bytes regular expression match whose group 1
            is a single byte in the range 0x80-0x9f.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, self.markup becomes the decoded string and
        self.original_encoding becomes the codec that worked.

        :param proposed: The name of a character encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded markup, or None if this (encoding, errors)
            pair is unknown, was already tried, or failed to decode.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means "this encoding
            # is not the right one", and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: A bytestring.
        :param encoding: The name of an encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded Unicode string.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        Tries, in order: the alias table, the name with hyphens
        removed, the name with hyphens turned into underscores, and
        finally the name itself.

        :param charset: The name of a character set.
        :return: The name of a codec, lower-cased, or None if `charset`
            is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point); plain
    # string values are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty
                # code here would produce the invalid reference "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # Every value in this table is a plain string, because
        # _sub_ms_char() calls .encode() on whatever it finds here.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are byte values
    # (ints); values are the UTF-8 encoding of the same character.
    #
    # The table is derived from Python's own cp1252 codec rather than
    # typed out by hand, so no entry can be mistyped.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252 and are therefore left out. The range stops at 0xfe:
    # 0xff has never been part of this table, and it stays unmapped so
    # that detwingle() output does not change.
    WINDOWS_1252_TO_UTF8 = {
        byte: bytes([byte]).decode("windows-1252").encode("utf-8")
        for byte in range(0x80, 0xff)
        if byte not in (0x81, 0x8d, 0x8f, 0x90, 0x9d)
    }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding is not one of
            the supported ones.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                # NOTE(review): the following bytes are not checked to be
                # UTF-8 continuation bytes, so a lone Windows-1252 byte
                # in the range C2-F4 is skipped rather than translated.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
939 |