Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/bs4/dammit.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
rev | line source |
---|---|
0
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
1 # -*- coding: utf-8 -*- |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
2 """Beautiful Soup bonus library: Unicode, Dammit |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
3 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
4 This library converts a bytestream to Unicode through any means |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
5 necessary. It is heavily based on code from Mark Pilgrim's Universal |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
6 Feed Parser. It works best on XML and HTML, but it does not rewrite the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
7 XML or HTML to reflect a new encoding; that's the tree builder's job. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
8 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
9 # Use of this source code is governed by the MIT license. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
10 __license__ = "MIT" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
11 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
12 import codecs |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
13 from html.entities import codepoint2name |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
14 import re |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
15 import logging |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
16 import string |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
17 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
18 # Import a library to autodetect character encodings. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
19 chardet_type = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
try:
    # Preferred detector: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Otherwise fall back to the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stand-in used when no detection library can be imported.

            :param s: Ignored.
            :return: Always None.
            """
            return None
    else:
        def chardet_dammit(s):
            """Guess the encoding of a bytestring using chardet.

            :param s: The data to inspect.
            :return: None when `s` is already a str; otherwise the
                'encoding' entry of the result of chardet.detect().
            """
            already_unicode = isinstance(s, str)
            return None if already_unicode else chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
else:
    def chardet_dammit(s):
        """Guess the encoding of a bytestring using cchardet.

        :param s: The data to inspect.
        :return: None when `s` is already a str; otherwise the
            'encoding' entry of the result of cchardet.detect().
        """
        already_unicode = isinstance(s, str)
        return None if already_unicode else cchardet.detect(s)['encoding']
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
44 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
45 # Available from http://cjkpython.i18n.org/. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
46 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
47 # TODO: This doesn't work anymore and the closest thing, iconv_codecs, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
48 # is GPL-licensed. Check whether this is still necessary. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
49 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
50 import iconv_codec |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
51 except ImportError: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
52 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
53 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
54 # Build bytestring and Unicode versions of regular expressions for finding |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
55 # a declared encoding inside an XML or HTML document. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
# Patterns that locate an encoding declaration inside a document: the
# XML declaration's encoding attribute, and the charset given in an
# HTML <meta> tag. Group 1 of either pattern is the declared encoding.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# Compiled, case-insensitive versions of both patterns, keyed first by
# the type of markup being searched (bytes or str) and then by the
# kind of document ('html' or 'xml').
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
67 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables used by this class.

        Called once, while the class body is being executed.

        :return: A 3-tuple of (dict mapping a character to its HTML
            entity name, dict mapping an entity name to its character,
            compiled regular expression matching any single character
            present in the first dict).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos; is an XHTML entity and an HTML 5 entity, but not an
        # HTML 4 entity. We don't want to use it, but we want to
        # recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have a predefined XML entity.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin
    # something shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    # Matches every angle bracket and every ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character.

        :param matchobj: A match whose group(0) is the character to replace.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character.

        :param matchobj: A match whose group(0) is the character to replace.
        :return: The entity reference, e.g. "&lt;".
        :raises KeyError: If the matched text has no entry in
            CHARACTER_TO_XML_ENTITY.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has an entry in
            CHARACTER_TO_HTML_ENTITY replaced by its entity reference.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
223 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
224 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark found at the very
       start of the bytestring.

    3. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """

    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML.
            Otherwise it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        # Exclusions are matched case-insensitively (see _usable), so
        # store them lowercased.
        self.exclude_encodings = {
            name.lower() for name in (exclude_encodings or [])
        }
        # Both of these are filled in lazily the first time the
        # `encodings` generator reaches the corresponding step.
        self.chardet_encoding = None
        self.declared_encoding = None
        self.is_html = is_html

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: Set of lowercased encoding names that have already
            been tried. This will be modified as a side effect.
        :return: True if `encoding` is not None, is not excluded, and has
            not been tried before; False otherwise.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding in tried:
            return False
        tried.add(encoding)
        return True

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        Each candidate is yielded at most once (compared
        case-insensitively), and excluded encodings are never yielded.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the
        encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding). The
            encoding is None when `data` is a Unicode string or does not
            start with a recognized byte-order mark.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding

        # A two-byte UTF-16 mark followed by two NUL bytes is not treated
        # as UTF-16; b'\xff\xfe\x00\x00' in particular is the UTF-32LE
        # mark and is handled further down. The comparison must be
        # against a bytes literal: a bytes slice never equals a str, so
        # comparing with '\x00\x00' would make this guard always pass.
        maybe_utf16 = (len(data) >= 4) and (data[2:4] != b'\x00\x00')
        if maybe_utf16 and data[:2] == b'\xfe\xff':
            encoding = 'utf-16be'
            data = data[2:]
        elif maybe_utf16 and data[:2] == b'\xff\xfe':
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False,
                               search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup, as bytes or as a Unicode string.
        :param is_html: If True, this markup is considered to be HTML.
            Otherwise it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to
            be declared near the beginning of the document, most of the
            time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire
            document.
        :return: The declared encoding name, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Search the first 1024 units for an XML declaration, and
            # the first 2048 units or 5% of the document (whichever is
            # larger) for an HTML <meta> declaration.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the set of compiled patterns that matches the type of
        # the markup (bytes patterns cannot search str and vice versa).
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for even in HTML documents; the
        # <meta> search is only a fallback when is_html is set.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
388 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
389 class UnicodeDammit: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
390 """A class for detecting the encoding of a *ML document and |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
391 converting it to a Unicode string. If the source encoding is |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
392 windows-1252, can replace MS smart quotes with their HTML or XML |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
393 equivalents.""" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
394 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
395 # This dictionary maps commonly seen values for "charset" in HTML |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
396 # meta tags to the corresponding Python codec names. It only covers |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
397 # values that aren't in Python's aliases and can't be determined |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
398 # by the heuristics in find_codec. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
399 CHARSET_ALIASES = {"macintosh": "mac-roman", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
400 "x-sjis": "shift-jis"} |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
401 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
402 ENCODINGS_WITH_SMART_QUOTES = [ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
403 "windows-1252", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
404 "iso-8859-1", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
405 "iso-8859-2", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
406 ] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
407 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
408 def __init__(self, markup, override_encodings=[], |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
409 smart_quotes_to=None, is_html=False, exclude_encodings=[]): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
410 """Constructor. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
411 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
412 :param markup: A bytestring representing markup in an unknown encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
413 :param override_encodings: These encodings will be tried first, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
414 before any sniffing code is run. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
415 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
416 :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
417 to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
418 Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
419 will convert them to HTML entity references. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
420 :param is_html: If True, this markup is considered to be HTML. Otherwise |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
421 it's assumed to be XML. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
422 :param exclude_encodings: These encodings will not be considered, even |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
423 if the sniffing code thinks they might make sense. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
424 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
425 self.smart_quotes_to = smart_quotes_to |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
426 self.tried_encodings = [] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
427 self.contains_replacement_characters = False |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
428 self.is_html = is_html |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
429 self.log = logging.getLogger(__name__) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
430 self.detector = EncodingDetector( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
431 markup, override_encodings, is_html, exclude_encodings) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
432 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
433 # Short-circuit if the data is in Unicode to begin with. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
434 if isinstance(markup, str) or markup == '': |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
435 self.markup = markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
436 self.unicode_markup = str(markup) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
437 self.original_encoding = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
438 return |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
439 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
440 # The encoding detector may have stripped a byte-order mark. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
441 # Use the stripped markup from this point on. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
442 self.markup = self.detector.markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
443 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
444 u = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
445 for encoding in self.detector.encodings: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
446 markup = self.detector.markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
447 u = self._convert_from(encoding) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
448 if u is not None: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
449 break |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
450 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
451 if not u: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
452 # None of the encodings worked. As an absolute last resort, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
453 # try them again with character replacement. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
454 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
455 for encoding in self.detector.encodings: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
456 if encoding != "ascii": |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
457 u = self._convert_from(encoding, "replace") |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
458 if u is not None: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
459 self.log.warning( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
460 "Some characters could not be decoded, and were " |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
461 "replaced with REPLACEMENT CHARACTER." |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
462 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
463 self.contains_replacement_characters = True |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
464 break |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
465 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
466 # If none of that worked, we could at this point force it to |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
467 # ASCII, but that would destroy so much data that I think |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
468 # giving up is better. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
469 self.unicode_markup = u |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
470 if not u: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
471 self.original_encoding = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
472 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
473 def _sub_ms_char(self, match): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
474 """Changes a MS smart quote character to an XML or HTML |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
475 entity, or an ASCII character.""" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
476 orig = match.group(1) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
477 if self.smart_quotes_to == 'ascii': |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
478 sub = self.MS_CHARS_TO_ASCII.get(orig).encode() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
479 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
480 sub = self.MS_CHARS.get(orig) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
481 if type(sub) == tuple: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
482 if self.smart_quotes_to == 'xml': |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
483 sub = '&#x'.encode() + sub[1].encode() + ';'.encode() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
484 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
485 sub = '&'.encode() + sub[0].encode() + ';'.encode() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
486 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
487 sub = sub.encode() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
488 return sub |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
489 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
490 def _convert_from(self, proposed, errors="strict"): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
491 """Attempt to convert the markup to the proposed encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
492 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
493 :param proposed: The name of a character encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
494 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
495 proposed = self.find_codec(proposed) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
496 if not proposed or (proposed, errors) in self.tried_encodings: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
497 return None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
498 self.tried_encodings.append((proposed, errors)) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
499 markup = self.markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
500 # Convert smart quotes to HTML if coming from an encoding |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
501 # that might have them. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
502 if (self.smart_quotes_to is not None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
503 and proposed in self.ENCODINGS_WITH_SMART_QUOTES): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
504 smart_quotes_re = b"([\x80-\x9f])" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
505 smart_quotes_compiled = re.compile(smart_quotes_re) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
506 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
507 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
508 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
509 #print("Trying to convert document to %s (errors=%s)" % ( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
510 # proposed, errors)) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
511 u = self._to_unicode(markup, proposed, errors) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
512 self.markup = u |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
513 self.original_encoding = proposed |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
514 except Exception as e: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
515 #print("That didn't work!") |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
516 #print(e) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
517 return None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
518 #print("Correct encoding: %s" % proposed) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
519 return self.markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
520 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
521 def _to_unicode(self, data, encoding, errors="strict"): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
522 """Given a string and its encoding, decodes the string into Unicode. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
523 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
524 :param encoding: The name of an encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
525 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
526 return str(data, encoding, errors) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
527 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
@property
def declared_html_encoding(self):
    """The encoding declared _within_ the document, for HTML markup.

    :return: The detector's declared encoding when the markup is
        treated as HTML; None otherwise.
    """
    return self.detector.declared_encoding if self.is_html else None
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
536 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
def find_codec(self, charset):
    """Convert the name of a character set to a codec name.

    Candidates are checked with `_codec` in this order: the name
    after alias resolution, the name with hyphens removed, and the
    name with hyphens turned into underscores. If none is accepted,
    the charset name itself is used.

    :param charset: The name of a character set.
    :return: The chosen name in lowercase, or None when nothing
        usable was found (for instance an empty or missing charset).
    """
    found = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not found:
        if not charset:
            return None
        for separator in ("", "_"):
            found = self._codec(charset.replace("-", separator))
            if found:
                break
        else:
            # No registered codec matched; fall back on the name as given.
            found = charset.lower() or charset
    return found.lower() if found else None
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
552 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
553 def _codec(self, charset): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
554 if not charset: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
555 return charset |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
556 codec = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
557 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
558 codecs.lookup(charset) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
559 codec = charset |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
560 except (LookupError, ValueError): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
561 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
562 return codec |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
563 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
564 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
# Maps the Windows-1252 bytes 0x80-0x9f to their replacements. A tuple
# entry is (HTML entity name, hexadecimal Unicode code point), used to
# build either "&name;" or "&#xHEX;". A plain string entry is a literal
# placeholder for a byte that has no entity pair in this table.
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS; an empty
            # code point here would produce the invalid reference "&#x;".
            b'\x9f': ('Yuml', '178'),}
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
598 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
599 # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
600 # horrors like stripping diacritical marks to turn á into a, but also |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
601 # contains non-horrors like turning “ into ". |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
602 MS_CHARS_TO_ASCII = { |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
603 b'\x80' : 'EUR', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
604 b'\x81' : ' ', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
605 b'\x82' : ',', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
606 b'\x83' : 'f', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
607 b'\x84' : ',,', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
608 b'\x85' : '...', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
609 b'\x86' : '+', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
610 b'\x87' : '++', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
611 b'\x88' : '^', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
612 b'\x89' : '%', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
613 b'\x8a' : 'S', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
614 b'\x8b' : '<', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
615 b'\x8c' : 'OE', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
616 b'\x8d' : '?', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
617 b'\x8e' : 'Z', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
618 b'\x8f' : '?', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
619 b'\x90' : '?', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
620 b'\x91' : "'", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
621 b'\x92' : "'", |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
622 b'\x93' : '"', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
623 b'\x94' : '"', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
624 b'\x95' : '*', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
625 b'\x96' : '-', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
626 b'\x97' : '--', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
627 b'\x98' : '~', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
628 b'\x99' : '(TM)', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
629 b'\x9a' : 's', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
630 b'\x9b' : '>', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
631 b'\x9c' : 'oe', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
632 b'\x9d' : '?', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
633 b'\x9e' : 'z', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
634 b'\x9f' : 'Y', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
635 b'\xa0' : ' ', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
636 b'\xa1' : '!', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
637 b'\xa2' : 'c', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
638 b'\xa3' : 'GBP', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
639 b'\xa4' : '$', #This approximation is especially parochial--this is the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
640 #generic currency symbol. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
641 b'\xa5' : 'YEN', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
642 b'\xa6' : '|', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
643 b'\xa7' : 'S', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
644 b'\xa8' : '..', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
645 b'\xa9' : '', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
646 b'\xaa' : '(th)', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
647 b'\xab' : '<<', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
648 b'\xac' : '!', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
649 b'\xad' : ' ', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
650 b'\xae' : '(R)', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
651 b'\xaf' : '-', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
652 b'\xb0' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
653 b'\xb1' : '+-', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
654 b'\xb2' : '2', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
655 b'\xb3' : '3', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
656 b'\xb4' : ("'", 'acute'), |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
657 b'\xb5' : 'u', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
658 b'\xb6' : 'P', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
659 b'\xb7' : '*', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
660 b'\xb8' : ',', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
661 b'\xb9' : '1', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
662 b'\xba' : '(th)', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
663 b'\xbb' : '>>', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
664 b'\xbc' : '1/4', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
665 b'\xbd' : '1/2', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
666 b'\xbe' : '3/4', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
667 b'\xbf' : '?', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
668 b'\xc0' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
669 b'\xc1' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
670 b'\xc2' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
671 b'\xc3' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
672 b'\xc4' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
673 b'\xc5' : 'A', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
674 b'\xc6' : 'AE', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
675 b'\xc7' : 'C', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
676 b'\xc8' : 'E', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
677 b'\xc9' : 'E', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
678 b'\xca' : 'E', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
679 b'\xcb' : 'E', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
680 b'\xcc' : 'I', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
681 b'\xcd' : 'I', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
682 b'\xce' : 'I', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
683 b'\xcf' : 'I', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
684 b'\xd0' : 'D', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
685 b'\xd1' : 'N', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
686 b'\xd2' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
687 b'\xd3' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
688 b'\xd4' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
689 b'\xd5' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
690 b'\xd6' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
691 b'\xd7' : '*', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
692 b'\xd8' : 'O', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
693 b'\xd9' : 'U', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
694 b'\xda' : 'U', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
695 b'\xdb' : 'U', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
696 b'\xdc' : 'U', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
697 b'\xdd' : 'Y', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
698 b'\xde' : 'b', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
699 b'\xdf' : 'B', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
700 b'\xe0' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
701 b'\xe1' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
702 b'\xe2' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
703 b'\xe3' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
704 b'\xe4' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
705 b'\xe5' : 'a', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
706 b'\xe6' : 'ae', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
707 b'\xe7' : 'c', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
708 b'\xe8' : 'e', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
709 b'\xe9' : 'e', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
710 b'\xea' : 'e', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
711 b'\xeb' : 'e', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
712 b'\xec' : 'i', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
713 b'\xed' : 'i', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
714 b'\xee' : 'i', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
715 b'\xef' : 'i', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
716 b'\xf0' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
717 b'\xf1' : 'n', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
718 b'\xf2' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
719 b'\xf3' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
720 b'\xf4' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
721 b'\xf5' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
722 b'\xf6' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
723 b'\xf7' : '/', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
724 b'\xf8' : 'o', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
725 b'\xf9' : 'u', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
726 b'\xfa' : 'u', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
727 b'\xfb' : 'u', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
728 b'\xfc' : 'u', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
729 b'\xfd' : 'y', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
730 b'\xfe' : 'b', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
731 b'\xff' : 'y', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
732 } |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
733 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
734 # A map used when removing rogue Windows-1252/ISO-8859-1 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
735 # characters in otherwise UTF-8 documents. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
736 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
737 # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
738 # Windows-1252. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
739 WINDOWS_1252_TO_UTF8 = { |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
740 0x80 : b'\xe2\x82\xac', # € |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
741 0x82 : b'\xe2\x80\x9a', # ‚ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
742 0x83 : b'\xc6\x92', # ƒ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
743 0x84 : b'\xe2\x80\x9e', # „ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
744 0x85 : b'\xe2\x80\xa6', # … |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
745 0x86 : b'\xe2\x80\xa0', # † |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
746 0x87 : b'\xe2\x80\xa1', # ‡ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
747 0x88 : b'\xcb\x86', # ˆ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
748 0x89 : b'\xe2\x80\xb0', # ‰ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
749 0x8a : b'\xc5\xa0', # Š |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
750 0x8b : b'\xe2\x80\xb9', # ‹ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
751 0x8c : b'\xc5\x92', # Œ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
752 0x8e : b'\xc5\xbd', # Ž |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
753 0x91 : b'\xe2\x80\x98', # ‘ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
754 0x92 : b'\xe2\x80\x99', # ’ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
755 0x93 : b'\xe2\x80\x9c', # “ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
756 0x94 : b'\xe2\x80\x9d', # ” |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
757 0x95 : b'\xe2\x80\xa2', # • |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
758 0x96 : b'\xe2\x80\x93', # – |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
759 0x97 : b'\xe2\x80\x94', # — |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
760 0x98 : b'\xcb\x9c', # ˜ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
761 0x99 : b'\xe2\x84\xa2', # ™ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
762 0x9a : b'\xc5\xa1', # š |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
763 0x9b : b'\xe2\x80\xba', # › |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
764 0x9c : b'\xc5\x93', # œ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
765 0x9e : b'\xc5\xbe', # ž |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
766 0x9f : b'\xc5\xb8', # Ÿ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
767 0xa0 : b'\xc2\xa0', # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
768 0xa1 : b'\xc2\xa1', # ¡ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
769 0xa2 : b'\xc2\xa2', # ¢ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
770 0xa3 : b'\xc2\xa3', # £ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
771 0xa4 : b'\xc2\xa4', # ¤ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
772 0xa5 : b'\xc2\xa5', # ¥ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
773 0xa6 : b'\xc2\xa6', # ¦ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
774 0xa7 : b'\xc2\xa7', # § |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
775 0xa8 : b'\xc2\xa8', # ¨ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
776 0xa9 : b'\xc2\xa9', # © |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
777 0xaa : b'\xc2\xaa', # ª |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
778 0xab : b'\xc2\xab', # « |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
779 0xac : b'\xc2\xac', # ¬ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
780 0xad : b'\xc2\xad', # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
781 0xae : b'\xc2\xae', # ® |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
782 0xaf : b'\xc2\xaf', # ¯ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
783 0xb0 : b'\xc2\xb0', # ° |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
784 0xb1 : b'\xc2\xb1', # ± |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
785 0xb2 : b'\xc2\xb2', # ² |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
786 0xb3 : b'\xc2\xb3', # ³ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
787 0xb4 : b'\xc2\xb4', # ´ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
788 0xb5 : b'\xc2\xb5', # µ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
789 0xb6 : b'\xc2\xb6', # ¶ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
790 0xb7 : b'\xc2\xb7', # · |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
791 0xb8 : b'\xc2\xb8', # ¸ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
792 0xb9 : b'\xc2\xb9', # ¹ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
793 0xba : b'\xc2\xba', # º |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
794 0xbb : b'\xc2\xbb', # » |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
795 0xbc : b'\xc2\xbc', # ¼ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
796 0xbd : b'\xc2\xbd', # ½ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
797 0xbe : b'\xc2\xbe', # ¾ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
798 0xbf : b'\xc2\xbf', # ¿ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
799 0xc0 : b'\xc3\x80', # À |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
800 0xc1 : b'\xc3\x81', # Á |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
801 0xc2 : b'\xc3\x82', # Â |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
802 0xc3 : b'\xc3\x83', # Ã |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
803 0xc4 : b'\xc3\x84', # Ä |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
804 0xc5 : b'\xc3\x85', # Å |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
805 0xc6 : b'\xc3\x86', # Æ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
806 0xc7 : b'\xc3\x87', # Ç |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
807 0xc8 : b'\xc3\x88', # È |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
808 0xc9 : b'\xc3\x89', # É |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
809 0xca : b'\xc3\x8a', # Ê |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
810 0xcb : b'\xc3\x8b', # Ë |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
811 0xcc : b'\xc3\x8c', # Ì |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
812 0xcd : b'\xc3\x8d', # Í |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
813 0xce : b'\xc3\x8e', # Î |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
814 0xcf : b'\xc3\x8f', # Ï |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
815 0xd0 : b'\xc3\x90', # Ð |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
816 0xd1 : b'\xc3\x91', # Ñ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
817 0xd2 : b'\xc3\x92', # Ò |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
818 0xd3 : b'\xc3\x93', # Ó |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
819 0xd4 : b'\xc3\x94', # Ô |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
820 0xd5 : b'\xc3\x95', # Õ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
821 0xd6 : b'\xc3\x96', # Ö |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
822 0xd7 : b'\xc3\x97', # × |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
823 0xd8 : b'\xc3\x98', # Ø |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
824 0xd9 : b'\xc3\x99', # Ù |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
825 0xda : b'\xc3\x9a', # Ú |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
826 0xdb : b'\xc3\x9b', # Û |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
827 0xdc : b'\xc3\x9c', # Ü |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
828 0xdd : b'\xc3\x9d', # Ý |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
829 0xde : b'\xc3\x9e', # Þ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
830 0xdf : b'\xc3\x9f', # ß |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
831 0xe0 : b'\xc3\xa0', # à |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
832 0xe1 : b'\xc3\xa1', # á |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
833 0xe2 : b'\xc3\xa2', # â |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
834 0xe3 : b'\xc3\xa3', # ã |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
835 0xe4 : b'\xc3\xa4', # ä |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
836 0xe5 : b'\xc3\xa5', # å |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
837 0xe6 : b'\xc3\xa6', # æ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
838 0xe7 : b'\xc3\xa7', # ç |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
839 0xe8 : b'\xc3\xa8', # è |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
840 0xe9 : b'\xc3\xa9', # é |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
841 0xea : b'\xc3\xaa', # ê |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
842 0xeb : b'\xc3\xab', # ë |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
843 0xec : b'\xc3\xac', # ì |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
844 0xed : b'\xc3\xad', # í |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
845 0xee : b'\xc3\xae', # î |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
846 0xef : b'\xc3\xaf', # ï |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
847 0xf0 : b'\xc3\xb0', # ð |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
848 0xf1 : b'\xc3\xb1', # ñ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
849 0xf2 : b'\xc3\xb2', # ò |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
850 0xf3 : b'\xc3\xb3', # ó |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
851 0xf4 : b'\xc3\xb4', # ô |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
852 0xf5 : b'\xc3\xb5', # õ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
853 0xf6 : b'\xc3\xb6', # ö |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
854 0xf7 : b'\xc3\xb7', # ÷ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
855 0xf8 : b'\xc3\xb8', # ø |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
856 0xf9 : b'\xc3\xb9', # ù |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
857 0xfa : b'\xc3\xba', # ú |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
858 0xfb : b'\xc3\xbb', # û |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
859 0xfc : b'\xc3\xbc', # ü |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
860 0xfd : b'\xc3\xbd', # ý |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
861 0xfe : b'\xc3\xbe', # þ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
862 } |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
863 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
864 MULTIBYTE_MARKERS_AND_SIZES = [ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
865 (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
866 (0xe0, 0xef, 3), # 3-byte characters start with E0-EF |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
867 (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
868 ] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
869 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
870 FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
871 LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
872 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte in the UTF-8 lead-byte range is only treated as the start
    of a multibyte character when every byte that follows it inside
    the expected sequence is a UTF-8 continuation byte (0x80-0xBF).
    Otherwise it is handled like any other high byte and translated
    from Windows-1252 if a translation exists. A sequence that is cut
    off by the end of the input is left untouched.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this _must_
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of `in_bytes`.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents. If nothing needed converting, `in_bytes`
        itself is returned.
    :raise NotImplementedError: If `main_encoding` is not UTF-8 or
        `embedded_encoding` is neither Windows-1252 nor ISO-8859-1.
    """
    # Underscores are normalised to hyphens before the comparison, so
    # only the hyphenated spellings need to be listed here.
    embedded = embedded_encoding.replace('_', '-').lower()
    if embedded not in ('windows-1252', 'iso-8859-1'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    def is_continuation(item):
        # Indexing/iterating a bytestring yields ints on Python 3 and
        # one-character strings on Python 2.x.
        if not isinstance(item, int):
            item = ord(item)
        return 0x80 <= item <= 0xBF

    byte_chunks = []

    chunk_start = 0
    pos = 0
    total = len(in_bytes)
    while pos < total:
        byte = in_bytes[pos]
        if not isinstance(byte, int):
            # Python 2.x
            byte = ord(byte)

        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This may be the start of a UTF-8 multibyte character.
            # Find out how long the character claims to be.
            sequence_size = None
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    sequence_size = size
                    break
            if sequence_size is not None:
                tail = in_bytes[pos + 1:pos + sequence_size]
                # An empty or short tail means the input ended in the
                # middle of the character; leave such a sequence alone
                # rather than guess.
                if all(is_continuation(item) for item in tail):
                    # Well-formed so far: skip to the end.
                    pos += sequence_size
                    continue
            # Not a real UTF-8 sequence (or no size is known for this
            # byte): fall through and treat it as an ordinary byte so
            # the loop always makes progress.

        if byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if not byte_chunks:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
939 |