comparison env/lib/python3.7/site-packages/bs4/tests/test_soup.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # -*- coding: utf-8 -*-
2 """Tests of Beautiful Soup as a whole."""
3
4 from pdb import set_trace
5 import logging
6 import unittest
7 import sys
8 import tempfile
9
10 from bs4 import (
11 BeautifulSoup,
12 BeautifulStoneSoup,
13 )
14 from bs4.builder import (
15 TreeBuilder,
16 ParserRejectedMarkup,
17 )
18 from bs4.element import (
19 CharsetMetaAttributeValue,
20 Comment,
21 ContentMetaAttributeValue,
22 SoupStrainer,
23 NamespacedAttribute,
24 Tag,
25 NavigableString,
26 )
27
28 import bs4.dammit
29 from bs4.dammit import (
30 EntitySubstitution,
31 UnicodeDammit,
32 EncodingDetector,
33 )
34 from bs4.testing import (
35 default_builder,
36 SoupTest,
37 skipIf,
38 )
39 import warnings
40
# lxml is an optional dependency. Record whether it is importable so that
# tests requiring it can be skipped cleanly via skipIf.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

# True only on Python 3.0/3.1, whose bundled HTMLParser mishandles some
# markup (used below to skip a test of non-ASCII attribute names).
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
48
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor: markup handling, custom
    tree builders, and per-parse configuration options.
    """

    def test_short_unicode_input(self):
        # A short Unicode document survives the parse unchanged.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        # Embedded NUL characters are preserved, not stripped.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        # Ruling out UTF-8 forces the encoding detector to fall back to
        # its next guess, Windows-1252.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            # Minimal stand-in for a TreeBuilder: records the kwargs it
            # was constructed with and the markup fed to it.
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
            def initialize_soup(self, soup):
                pass
            def feed(self, markup):
                self.fed = markup
            def reset(self):
                pass
            def ignore(self, ignore):
                pass
            set_up_substitutions = can_be_empty_element = ignore
            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        # The deprecated argument was stripped before the builder saw it.
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                # NOTE(review): `markup` is not defined in this scope; if
                # these yields were actually evaluated they would raise
                # NameError rather than yield a prepared-markup tuple.
                # Confirm against the upstream bs4 test suite.
                yield markup, None, None, False
                yield markup, None, None, False

        import re
        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class', have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes = {
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers = {
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)
223
224
class TestWarnings(SoupTest):
    """Tests of warnings issued (or not) by the BeautifulSoup constructor."""

    def _assert_no_parser_specified(self, s, is_there=True):
        """Assert that *s* is the no-parser-specified warning message.

        BUG FIX: this helper was defined as ``_no_parser_specified`` but
        every call site below invokes ``self._assert_no_parser_specified``,
        which raised AttributeError. The definition is renamed to match
        its callers. (The mismatch went unnoticed because a second class
        named TestWarnings later in this module shadows this one.)
        """
        # Compare only the first 80 characters: the full warning is
        # interpolated with markup-specific details.
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        # Omitting the parser argument entirely triggers the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" names a markup type, not a specific parser, so the
        # warning is still issued.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        # Naming a concrete parser suppresses the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The BS3 keyword argument 'parseOnlyThese' still works, but
        # issues a deprecation warning naming its replacement.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # Likewise for the BS3 keyword argument 'fromEncoding'.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # An unknown keyword argument is an error, not a warning.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
268
class TestWarnings(SoupTest):
    # NOTE(review): this is the SECOND class named TestWarnings in this
    # module. It rebinds the name at module level, so the earlier
    # TestWarnings class is shadowed and its tests are never collected
    # by unittest. One of the two classes should be renamed.
    """Tests of the "your markup looks like a filename/URL" warnings."""

    def test_disk_file_warning(self):
        # Markup that is the name of an existing file triggers a warning.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)
        finally:
            # Closing the NamedTemporaryFile deletes the file.
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        # Be aware this isn't the only warning that can be raised during
        # execution..
        self.assertTrue(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        self.assertTrue(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_bytes_and_space(self):
        # A URL followed by more text is treated as markup, not a URL.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))
314
315
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and everything inside them) should survive.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        restricted = self.soup(markup, parse_only=only_b_tags)
        self.assertEqual(restricted.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
323
324
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Only characters that map to named HTML entities are replaced;
        # everything else (e.g. the snowman) passes through untouched.
        original = "foo\u2200\N{SNOWMAN}\u00f5bar"
        substituted = self.sub.substitute_html(original)
        self.assertEqual("foo&forall;\N{SNOWMAN}&otilde;bar", substituted)

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so they
        # get a dedicated test.
        smart_quoted = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(smart_quoted)
        self.assertEqual(
            "&lsquo;&rsquo;foo&ldquo;&rdquo;",
            self.sub.substitute_html(dammit.markup))

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # (The "converstion" typo in the method name is preserved from
        # the original; renaming it would change which test runs.)
        unquoted = 'Welcome to "my bar"'
        self.assertEqual(unquoted, self.sub.substitute_xml(unquoted, False))

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(
            '"Welcome"', self.sub.substitute_xml("Welcome", True))
        self.assertEqual(
            '"Bob\'s Bar"', self.sub.substitute_xml("Bob's Bar", True))

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        has_double_quotes = 'Welcome to "my bar"'
        self.assertEqual(
            "'Welcome to \"my bar\"'",
            self.sub.substitute_xml(has_double_quotes, True))

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both quote styles present, double quotes win and the
        # embedded double quotes are escaped as &quot;.
        has_both = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            '"Welcome to &quot;Bob\'s Bar&quot;"',
            self.sub.substitute_xml(has_both, True))

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(quoted, self.sub.substitute_xml(quoted))

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            "foo&lt;bar&gt;", self.sub.substitute_xml("foo<bar>"))

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual("AT&amp;T", self.sub.substitute_xml("AT&T"))

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, entity or not.
        self.assertEqual(
            "&amp;Aacute;T&amp;T",
            self.sub.substitute_xml("&Aacute;T&T"))

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # The _containing_entities variant leaves entity ampersands alone.
        self.assertEqual(
            "&Aacute;T&amp;T",
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"))

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(text, self.sub.substitute_html(text))
392
393
class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        """Prepare the same document in both str and UTF-8 bytes form."""
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        #
        # chardet is disabled by monkeypatching the module-level hook in
        # bs4.dammit; the try/finally restores it no matter what.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Restore the patched module-level state.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
453
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # str input passes through untouched.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become their Unicode
        # equivalents.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # smart_quotes_to="xml" yields numeric character references.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        # smart_quotes_to="html" yields named HTML entities.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        # smart_quotes_to="ascii" degrades to plain ASCII quote marks.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # Valid UTF-8 bytes are detected as such and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')


    def test_convert_hebrew(self):
        # A caller-suggested encoding (iso-8859-8) is tried and used.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Multibyte UTF-8 sequences must not be misread as Windows-1252
        # smart quotes.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested encoding names that aren't real codecs are skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        # A non-ASCII byte inside the declared encoding name becomes
        # U+FFFD rather than crashing the detector.
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # All four quoting styles of <meta charset=...> are recognized.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore the patched module-level state.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
673
class TestNamedspacedAttribute(SoupTest):
    """Tests of the NamespacedAttribute string subclass.

    (The "Namedspaced" typo in the class name is preserved from the
    original; renaming it would change which tests unittest collects.)
    """

    def test_name_may_be_none_or_missing(self):
        # With no name, the attribute is just its prefix.
        prefix_only = NamespacedAttribute("xmlns", None)
        self.assertEqual(prefix_only, "xmlns")

        name_omitted = NamespacedAttribute("xmlns")
        self.assertEqual(name_omitted, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        first = NamespacedAttribute("a", "b", "c")
        second = NamespacedAttribute("a", "b", "c")
        self.assertEqual(first, second)

        # Equality ignores the actual namespace...
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(first, no_namespace)

        # ...but a different name or prefix breaks it.
        different_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(first, different_name)

        different_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(first, different_prefix)
702
703
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Certain attribute values are rewritten when the tree is encoded."""

    def test_charset_meta_attribute_value(self):
        # BUG FIX: this test was also named test_content_meta_attribute_value,
        # so the duplicate definition below shadowed it and it never ran.
        # Renaming it to match what it tests restores it.
        #
        # A <meta charset> value compares equal to its string form,
        # remembers the original value, and substitutes the target
        # encoding when the document is encoded.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))


    def test_content_meta_attribute_value(self):
        # A <meta content> value substitutes only the charset portion,
        # leaving the MIME type alone.
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))