Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/bs4/tests/test_soup.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """Tests of Beautiful Soup as a whole.""" | |
3 | |
4 from pdb import set_trace | |
5 import logging | |
6 import unittest | |
7 import sys | |
8 import tempfile | |
9 | |
10 from bs4 import ( | |
11 BeautifulSoup, | |
12 BeautifulStoneSoup, | |
13 GuessedAtParserWarning, | |
14 MarkupResemblesLocatorWarning, | |
15 ) | |
16 from bs4.builder import ( | |
17 TreeBuilder, | |
18 ParserRejectedMarkup, | |
19 ) | |
20 from bs4.element import ( | |
21 CharsetMetaAttributeValue, | |
22 Comment, | |
23 ContentMetaAttributeValue, | |
24 SoupStrainer, | |
25 NamespacedAttribute, | |
26 Tag, | |
27 NavigableString, | |
28 ) | |
29 | |
30 import bs4.dammit | |
31 from bs4.dammit import ( | |
32 EntitySubstitution, | |
33 UnicodeDammit, | |
34 EncodingDetector, | |
35 ) | |
36 from bs4.testing import ( | |
37 default_builder, | |
38 SoupTest, | |
39 skipIf, | |
40 ) | |
41 import warnings | |
42 | |
# lxml is an optional dependency; record whether it is importable so that
# tests requiring it can be skipped cleanly.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

# True on Python 3.0/3.1, whose bundled HTMLParser mishandles some markup;
# used below to skip a test on those interpreters.
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor and its keyword arguments."""

    def test_short_unicode_input(self):
        # A short Unicode document survives parsing intact.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        # Embedded NUL characters are preserved, not treated as terminators.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        # Ruling out UTF-8 forces the encoding detector to fall back
        # to its next-best guess for these bytes.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            # Minimal stand-in for a TreeBuilder: records its constructor
            # kwargs (in `called_with`) and the markup fed to it (in `fed`).
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
            def initialize_soup(self, soup):
                pass
            def feed(self, markup):
                self.fed = markup
            def reset(self):
                pass
            def ignore(self, ignore):
                pass
            # These builder hooks are no-ops for the purposes of this test.
            set_up_substitutions = can_be_empty_element = ignore
            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        # The deprecated kwarg was stripped before the builder saw it.
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                # NOTE(review): `markup` is not defined in this scope; this
                # looks like it should be args[0] — confirm against upstream.
                yield markup, None, None, False
                yield markup, None, None, False

        import re
        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes = {
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers = {
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)
225 | |
226 | |
class TestWarnings(SoupTest):
    """Tests of the warnings Beautiful Soup issues about questionable input."""

    def _assert_warning(self, warnings, cls):
        """Return the first recorded warning whose message is an instance
        of `cls`, or raise if no such warning was recorded."""
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # Fixed: the %-format arguments must be a tuple. The original code,
        # `"... %s ... %r" % cls, warnings`, applied % to `cls` alone (a
        # TypeError at format time) and passed `warnings` as a second,
        # unrelated Exception argument.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        # The warning message is compared by prefix because the full
        # NO_PARSER_SPECIFIED_WARNING has parser names interpolated into it.
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" could refer to several parsers, so it's treated the same
        # as not specifying a parser at all.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The BS3-era name 'parseOnlyThese' still works, but warns.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The BS3-era name 'fromEncoding' still works, but warns.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # Unknown keyword arguments are an error, not silently ignored.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # Markup that names an existing file triggers a warning that the
        # caller probably meant to open the file.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertTrue("looks like a filename" in str(warning.message))
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))
325 | |
326 | |
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and everything inside them) should survive.
        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        parsed = self.soup(document, parse_only=only_b_tags)
        self.assertEqual(parsed.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
334 | |
335 | |
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class.

    Fixed throughout: several expected strings in this copy of the file
    had their HTML entity references unescaped (e.g. "&amp;" shown as
    "&"), which made the assertions vacuous or wrong. The literal entity
    text is restored below.
    """
    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # (Method name typo "converstion" kept to preserve the test id.)
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        # The embedded double quotes are escaped as &quot; because the
        # quoted value must be wrapped in double quotes.
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that begins
        # an existing entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves &Aacute; alone but
        # still escapes the bare ampersand.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
403 | |
404 | |
class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        # The same document in Unicode and UTF-8 form, shared by every test.
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Restore the module-level chardet hook and logging state no
            # matter how the test exits.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # A non-ASCII attribute name round-trips through parse + encode.
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
464 | |
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # Unicode input passes through untouched.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become the corresponding
        # Unicode quotation characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        # Fixed: the expected value is the literal numeric XML entities;
        # this copy of the file had been HTML-unescaped in transit.
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201c;&#x201d;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        # Fixed: the expected value is the literal named HTML entities
        # (same transit corruption as above).
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # These bytes look nothing like smart quotes once decoded as UTF-8.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested codec that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Nonsense codec names are skipped rather than raising.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # Every quoting style for a <meta charset> value is recognized.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # Parameter renamed from `str` to avoid shadowing the builtin.
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore chardet and logging no matter how the test exits.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
        ):
            # Renamed from `input` to avoid shadowing the builtin.
            input_bytes = tricky_unicode_char.encode("utf8")
            self.assertTrue(input_bytes.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input_bytes)
            self.assertEqual(output, input_bytes)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
684 | |
class TestNamedspacedAttribute(SoupTest):
    """Tests of the NamespacedAttribute string subclass."""

    def test_name_may_be_none_or_missing(self):
        # With no local name, the attribute collapses to just its prefix.
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual(attr, "xmlns")

        attr = NamespacedAttribute("xmlns")
        self.assertEqual(attr, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        # prefix + name compare equal to the "prefix:name" string form.
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        first = NamespacedAttribute("a", "b", "c")
        second = NamespacedAttribute("a", "b", "c")
        self.assertEqual(first, second)

        # The actual namespace is not considered.
        without_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(first, without_namespace)

        # But name and prefix are important.
        different_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(first, different_name)

        different_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(first, different_prefix)
713 | |
714 | |
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes whose encode() substitutes
    the actual output encoding for the originally declared charset."""

    def test_charset_meta_attribute_value(self):
        # Fixed: this method was a duplicate `test_content_meta_attribute_value`,
        # so the definition below shadowed it and this test never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # Encoding the value substitutes the target encoding's name.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Only the charset= portion is rewritten on encode.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))