Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/bs4/tests/test_soup.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """Tests of Beautiful Soup as a whole.""" | |
| 3 | |
| 4 from pdb import set_trace | |
| 5 import logging | |
| 6 import unittest | |
| 7 import sys | |
| 8 import tempfile | |
| 9 | |
| 10 from bs4 import ( | |
| 11 BeautifulSoup, | |
| 12 BeautifulStoneSoup, | |
| 13 ) | |
| 14 from bs4.builder import ( | |
| 15 TreeBuilder, | |
| 16 ParserRejectedMarkup, | |
| 17 ) | |
| 18 from bs4.element import ( | |
| 19 CharsetMetaAttributeValue, | |
| 20 Comment, | |
| 21 ContentMetaAttributeValue, | |
| 22 SoupStrainer, | |
| 23 NamespacedAttribute, | |
| 24 Tag, | |
| 25 NavigableString, | |
| 26 ) | |
| 27 | |
| 28 import bs4.dammit | |
| 29 from bs4.dammit import ( | |
| 30 EntitySubstitution, | |
| 31 UnicodeDammit, | |
| 32 EncodingDetector, | |
| 33 ) | |
| 34 from bs4.testing import ( | |
| 35 default_builder, | |
| 36 SoupTest, | |
| 37 skipIf, | |
| 38 ) | |
| 39 import warnings | |
| 40 | |
| 41 try: | |
| 42 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | |
| 43 LXML_PRESENT = True | |
| 44 except ImportError as e: | |
| 45 LXML_PRESENT = False | |
| 46 | |
| 47 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) | |
| 48 | |
| 49 class TestConstructor(SoupTest): | |
| 50 | |
| 51 def test_short_unicode_input(self): | |
| 52 data = "<h1>éé</h1>" | |
| 53 soup = self.soup(data) | |
| 54 self.assertEqual("éé", soup.h1.string) | |
| 55 | |
| 56 def test_embedded_null(self): | |
| 57 data = "<h1>foo\0bar</h1>" | |
| 58 soup = self.soup(data) | |
| 59 self.assertEqual("foo\0bar", soup.h1.string) | |
| 60 | |
| 61 def test_exclude_encodings(self): | |
| 62 utf8_data = "Räksmörgås".encode("utf-8") | |
| 63 soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) | |
| 64 self.assertEqual("windows-1252", soup.original_encoding) | |
| 65 | |
| 66 def test_custom_builder_class(self): | |
| 67 # Verify that you can pass in a custom Builder class and | |
| 68 # it'll be instantiated with the appropriate keyword arguments. | |
| 69 class Mock(object): | |
| 70 def __init__(self, **kwargs): | |
| 71 self.called_with = kwargs | |
| 72 self.is_xml = True | |
| 73 self.store_line_numbers = False | |
| 74 self.cdata_list_attributes = [] | |
| 75 self.preserve_whitespace_tags = [] | |
| 76 self.string_containers = {} | |
| 77 def initialize_soup(self, soup): | |
| 78 pass | |
| 79 def feed(self, markup): | |
| 80 self.fed = markup | |
| 81 def reset(self): | |
| 82 pass | |
| 83 def ignore(self, ignore): | |
| 84 pass | |
| 85 set_up_substitutions = can_be_empty_element = ignore | |
| 86 def prepare_markup(self, *args, **kwargs): | |
| 87 yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters" | |
| 88 | |
| 89 kwargs = dict( | |
| 90 var="value", | |
| 91 # This is a deprecated BS3-era keyword argument, which | |
| 92 # will be stripped out. | |
| 93 convertEntities=True, | |
| 94 ) | |
| 95 with warnings.catch_warnings(record=True): | |
| 96 soup = BeautifulSoup('', builder=Mock, **kwargs) | |
| 97 assert isinstance(soup.builder, Mock) | |
| 98 self.assertEqual(dict(var="value"), soup.builder.called_with) | |
| 99 self.assertEqual("prepared markup", soup.builder.fed) | |
| 100 | |
| 101 # You can also instantiate the TreeBuilder yourself. In this | |
| 102 # case, that specific object is used and any keyword arguments | |
| 103 # to the BeautifulSoup constructor are ignored. | |
| 104 builder = Mock(**kwargs) | |
| 105 with warnings.catch_warnings(record=True) as w: | |
| 106 soup = BeautifulSoup( | |
| 107 '', builder=builder, ignored_value=True, | |
| 108 ) | |
| 109 msg = str(w[0].message) | |
| 110 assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.") | |
| 111 self.assertEqual(builder, soup.builder) | |
| 112 self.assertEqual(kwargs, builder.called_with) | |
| 113 | |
| 114 def test_parser_markup_rejection(self): | |
| 115 # If markup is completely rejected by the parser, an | |
| 116 # explanatory ParserRejectedMarkup exception is raised. | |
| 117 class Mock(TreeBuilder): | |
| 118 def feed(self, *args, **kwargs): | |
| 119 raise ParserRejectedMarkup("Nope.") | |
| 120 | |
| 121 def prepare_markup(self, *args, **kwargs): | |
| 122 # We're going to try two different ways of preparing this markup, | |
| 123 # but feed() will reject both of them. | |
| 124 yield markup, None, None, False | |
| 125 yield markup, None, None, False | |
| 126 | |
| 127 import re | |
| 128 self.assertRaisesRegex( | |
| 129 ParserRejectedMarkup, | |
| 130 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.", | |
| 131 BeautifulSoup, '', builder=Mock, | |
| 132 ) | |
| 133 | |
| 134 def test_cdata_list_attributes(self): | |
| 135 # Most attribute values are represented as scalars, but the | |
| 136 # HTML standard says that some attributes, like 'class' have | |
| 137 # space-separated lists as values. | |
| 138 markup = '<a id=" an id " class=" a class "></a>' | |
| 139 soup = self.soup(markup) | |
| 140 | |
| 141 # Note that the spaces are stripped for 'class' but not for 'id'. | |
| 142 a = soup.a | |
| 143 self.assertEqual(" an id ", a['id']) | |
| 144 self.assertEqual(["a", "class"], a['class']) | |
| 145 | |
| 146 # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets | |
| 147 # you customize or disable this. As always, you can customize the TreeBuilder | |
| 148 # by passing in a keyword argument to the BeautifulSoup constructor. | |
| 149 soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) | |
| 150 self.assertEqual(" a class ", soup.a['class']) | |
| 151 | |
| 152 # Here are two ways of saying that `id` is a multi-valued | |
| 153 # attribute in this context, but 'class' is not. | |
| 154 for switcheroo in ({'*': 'id'}, {'a': 'id'}): | |
| 155 with warnings.catch_warnings(record=True) as w: | |
| 156 # This will create a warning about not explicitly | |
| 157 # specifying a parser, but we'll ignore it. | |
| 158 soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo) | |
| 159 a = soup.a | |
| 160 self.assertEqual(["an", "id"], a['id']) | |
| 161 self.assertEqual(" a class ", a['class']) | |
| 162 | |
| 163 def test_replacement_classes(self): | |
| 164 # Test the ability to pass in replacements for element classes | |
| 165 # which will be used when building the tree. | |
| 166 class TagPlus(Tag): | |
| 167 pass | |
| 168 | |
| 169 class StringPlus(NavigableString): | |
| 170 pass | |
| 171 | |
| 172 class CommentPlus(Comment): | |
| 173 pass | |
| 174 | |
| 175 soup = self.soup( | |
| 176 "<a><b>foo</b>bar</a><!--whee-->", | |
| 177 element_classes = { | |
| 178 Tag: TagPlus, | |
| 179 NavigableString: StringPlus, | |
| 180 Comment: CommentPlus, | |
| 181 } | |
| 182 ) | |
| 183 | |
| 184 # The tree was built with TagPlus, StringPlus, and CommentPlus objects, | |
| 185 # rather than Tag, String, and Comment objects. | |
| 186 assert all( | |
| 187 isinstance(x, (TagPlus, StringPlus, CommentPlus)) | |
| 188 for x in soup.recursiveChildGenerator() | |
| 189 ) | |
| 190 | |
| 191 def test_alternate_string_containers(self): | |
| 192 # Test the ability to customize the string containers for | |
| 193 # different types of tags. | |
| 194 class PString(NavigableString): | |
| 195 pass | |
| 196 | |
| 197 class BString(NavigableString): | |
| 198 pass | |
| 199 | |
| 200 soup = self.soup( | |
| 201 "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text", | |
| 202 string_containers = { | |
| 203 'b': BString, | |
| 204 'p': PString, | |
| 205 } | |
| 206 ) | |
| 207 | |
| 208 # The string before the <p> tag is a regular NavigableString. | |
| 209 assert isinstance(soup.div.contents[0], NavigableString) | |
| 210 | |
| 211 # The string inside the <p> tag, but not inside the <i> tag, | |
| 212 # is a PString. | |
| 213 assert isinstance(soup.p.contents[0], PString) | |
| 214 | |
| 215 # Every string inside the <b> tag is a BString, even the one that | |
| 216 # was also inside an <i> tag. | |
| 217 for s in soup.b.strings: | |
| 218 assert isinstance(s, BString) | |
| 219 | |
| 220 # Now that parsing was complete, the string_container_stack | |
| 221 # (where this information was kept) has been cleared out. | |
| 222 self.assertEqual([], soup.string_container_stack) | |
| 223 | |
| 224 | |
| 225 class TestWarnings(SoupTest): | |
| 226 | |
| 227 def _no_parser_specified(self, s, is_there=True): | |
| 228 v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) | |
| 229 self.assertTrue(v) | |
| 230 | |
| 231 def test_warning_if_no_parser_specified(self): | |
| 232 with warnings.catch_warnings(record=True) as w: | |
| 233 soup = self.soup("<a><b></b></a>") | |
| 234 msg = str(w[0].message) | |
| 235 self._assert_no_parser_specified(msg) | |
| 236 | |
| 237 def test_warning_if_parser_specified_too_vague(self): | |
| 238 with warnings.catch_warnings(record=True) as w: | |
| 239 soup = self.soup("<a><b></b></a>", "html") | |
| 240 msg = str(w[0].message) | |
| 241 self._assert_no_parser_specified(msg) | |
| 242 | |
| 243 def test_no_warning_if_explicit_parser_specified(self): | |
| 244 with warnings.catch_warnings(record=True) as w: | |
| 245 soup = self.soup("<a><b></b></a>", "html.parser") | |
| 246 self.assertEqual([], w) | |
| 247 | |
| 248 def test_parseOnlyThese_renamed_to_parse_only(self): | |
| 249 with warnings.catch_warnings(record=True) as w: | |
| 250 soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b")) | |
| 251 msg = str(w[0].message) | |
| 252 self.assertTrue("parseOnlyThese" in msg) | |
| 253 self.assertTrue("parse_only" in msg) | |
| 254 self.assertEqual(b"<b></b>", soup.encode()) | |
| 255 | |
| 256 def test_fromEncoding_renamed_to_from_encoding(self): | |
| 257 with warnings.catch_warnings(record=True) as w: | |
| 258 utf8 = b"\xc3\xa9" | |
| 259 soup = self.soup(utf8, fromEncoding="utf8") | |
| 260 msg = str(w[0].message) | |
| 261 self.assertTrue("fromEncoding" in msg) | |
| 262 self.assertTrue("from_encoding" in msg) | |
| 263 self.assertEqual("utf8", soup.original_encoding) | |
| 264 | |
| 265 def test_unrecognized_keyword_argument(self): | |
| 266 self.assertRaises( | |
| 267 TypeError, self.soup, "<a>", no_such_argument=True) | |
| 268 | |
| 269 class TestWarnings(SoupTest): | |
| 270 | |
| 271 def test_disk_file_warning(self): | |
| 272 filehandle = tempfile.NamedTemporaryFile() | |
| 273 filename = filehandle.name | |
| 274 try: | |
| 275 with warnings.catch_warnings(record=True) as w: | |
| 276 soup = self.soup(filename) | |
| 277 msg = str(w[0].message) | |
| 278 self.assertTrue("looks like a filename" in msg) | |
| 279 finally: | |
| 280 filehandle.close() | |
| 281 | |
| 282 # The file no longer exists, so Beautiful Soup will no longer issue the warning. | |
| 283 with warnings.catch_warnings(record=True) as w: | |
| 284 soup = self.soup(filename) | |
| 285 self.assertEqual(0, len(w)) | |
| 286 | |
| 287 def test_url_warning_with_bytes_url(self): | |
| 288 with warnings.catch_warnings(record=True) as warning_list: | |
| 289 soup = self.soup(b"http://www.crummybytes.com/") | |
| 290 # Be aware this isn't the only warning that can be raised during | |
| 291 # execution.. | |
| 292 self.assertTrue(any("looks like a URL" in str(w.message) | |
| 293 for w in warning_list)) | |
| 294 | |
| 295 def test_url_warning_with_unicode_url(self): | |
| 296 with warnings.catch_warnings(record=True) as warning_list: | |
| 297 # note - this url must differ from the bytes one otherwise | |
| 298 # python's warnings system swallows the second warning | |
| 299 soup = self.soup("http://www.crummyunicode.com/") | |
| 300 self.assertTrue(any("looks like a URL" in str(w.message) | |
| 301 for w in warning_list)) | |
| 302 | |
| 303 def test_url_warning_with_bytes_and_space(self): | |
| 304 with warnings.catch_warnings(record=True) as warning_list: | |
| 305 soup = self.soup(b"http://www.crummybytes.com/ is great") | |
| 306 self.assertFalse(any("looks like a URL" in str(w.message) | |
| 307 for w in warning_list)) | |
| 308 | |
| 309 def test_url_warning_with_unicode_and_space(self): | |
| 310 with warnings.catch_warnings(record=True) as warning_list: | |
| 311 soup = self.soup("http://www.crummyuncode.com/ is great") | |
| 312 self.assertFalse(any("looks like a URL" in str(w.message) | |
| 313 for w in warning_list)) | |
| 314 | |
| 315 | |
| 316 class TestSelectiveParsing(SoupTest): | |
| 317 | |
| 318 def test_parse_with_soupstrainer(self): | |
| 319 markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" | |
| 320 strainer = SoupStrainer("b") | |
| 321 soup = self.soup(markup, parse_only=strainer) | |
| 322 self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>") | |
| 323 | |
| 324 | |
| 325 class TestEntitySubstitution(unittest.TestCase): | |
| 326 """Standalone tests of the EntitySubstitution class.""" | |
| 327 def setUp(self): | |
| 328 self.sub = EntitySubstitution | |
| 329 | |
| 330 def test_simple_html_substitution(self): | |
| 331 # Unicode characters corresponding to named HTML entites | |
| 332 # are substituted, and no others. | |
| 333 s = "foo\u2200\N{SNOWMAN}\u00f5bar" | |
| 334 self.assertEqual(self.sub.substitute_html(s), | |
| 335 "foo∀\N{SNOWMAN}õbar") | |
| 336 | |
| 337 def test_smart_quote_substitution(self): | |
| 338 # MS smart quotes are a common source of frustration, so we | |
| 339 # give them a special test. | |
| 340 quotes = b"\x91\x92foo\x93\x94" | |
| 341 dammit = UnicodeDammit(quotes) | |
| 342 self.assertEqual(self.sub.substitute_html(dammit.markup), | |
| 343 "‘’foo“”") | |
| 344 | |
| 345 def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): | |
| 346 s = 'Welcome to "my bar"' | |
| 347 self.assertEqual(self.sub.substitute_xml(s, False), s) | |
| 348 | |
| 349 def test_xml_attribute_quoting_normally_uses_double_quotes(self): | |
| 350 self.assertEqual(self.sub.substitute_xml("Welcome", True), | |
| 351 '"Welcome"') | |
| 352 self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), | |
| 353 '"Bob\'s Bar"') | |
| 354 | |
| 355 def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): | |
| 356 s = 'Welcome to "my bar"' | |
| 357 self.assertEqual(self.sub.substitute_xml(s, True), | |
| 358 "'Welcome to \"my bar\"'") | |
| 359 | |
| 360 def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): | |
| 361 s = 'Welcome to "Bob\'s Bar"' | |
| 362 self.assertEqual( | |
| 363 self.sub.substitute_xml(s, True), | |
| 364 '"Welcome to "Bob\'s Bar""') | |
| 365 | |
| 366 def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): | |
| 367 quoted = 'Welcome to "Bob\'s Bar"' | |
| 368 self.assertEqual(self.sub.substitute_xml(quoted), quoted) | |
| 369 | |
| 370 def test_xml_quoting_handles_angle_brackets(self): | |
| 371 self.assertEqual( | |
| 372 self.sub.substitute_xml("foo<bar>"), | |
| 373 "foo<bar>") | |
| 374 | |
| 375 def test_xml_quoting_handles_ampersands(self): | |
| 376 self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") | |
| 377 | |
| 378 def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): | |
| 379 self.assertEqual( | |
| 380 self.sub.substitute_xml("ÁT&T"), | |
| 381 "&Aacute;T&T") | |
| 382 | |
| 383 def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): | |
| 384 self.assertEqual( | |
| 385 self.sub.substitute_xml_containing_entities("ÁT&T"), | |
| 386 "ÁT&T") | |
| 387 | |
| 388 def test_quotes_not_html_substituted(self): | |
| 389 """There's no need to do this except inside attribute values.""" | |
| 390 text = 'Bob\'s "bar"' | |
| 391 self.assertEqual(self.sub.substitute_html(text), text) | |
| 392 | |
| 393 | |
| 394 class TestEncodingConversion(SoupTest): | |
| 395 # Test Beautiful Soup's ability to decode and encode from various | |
| 396 # encodings. | |
| 397 | |
| 398 def setUp(self): | |
| 399 super(TestEncodingConversion, self).setUp() | |
| 400 self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' | |
| 401 self.utf8_data = self.unicode_data.encode("utf-8") | |
| 402 # Just so you know what it looks like. | |
| 403 self.assertEqual( | |
| 404 self.utf8_data, | |
| 405 b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>') | |
| 406 | |
| 407 def test_ascii_in_unicode_out(self): | |
| 408 # ASCII input is converted to Unicode. The original_encoding | |
| 409 # attribute is set to 'utf-8', a superset of ASCII. | |
| 410 chardet = bs4.dammit.chardet_dammit | |
| 411 logging.disable(logging.WARNING) | |
| 412 try: | |
| 413 def noop(str): | |
| 414 return None | |
| 415 # Disable chardet, which will realize that the ASCII is ASCII. | |
| 416 bs4.dammit.chardet_dammit = noop | |
| 417 ascii = b"<foo>a</foo>" | |
| 418 soup_from_ascii = self.soup(ascii) | |
| 419 unicode_output = soup_from_ascii.decode() | |
| 420 self.assertTrue(isinstance(unicode_output, str)) | |
| 421 self.assertEqual(unicode_output, self.document_for(ascii.decode())) | |
| 422 self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") | |
| 423 finally: | |
| 424 logging.disable(logging.NOTSET) | |
| 425 bs4.dammit.chardet_dammit = chardet | |
| 426 | |
| 427 def test_unicode_in_unicode_out(self): | |
| 428 # Unicode input is left alone. The original_encoding attribute | |
| 429 # is not set. | |
| 430 soup_from_unicode = self.soup(self.unicode_data) | |
| 431 self.assertEqual(soup_from_unicode.decode(), self.unicode_data) | |
| 432 self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') | |
| 433 self.assertEqual(soup_from_unicode.original_encoding, None) | |
| 434 | |
| 435 def test_utf8_in_unicode_out(self): | |
| 436 # UTF-8 input is converted to Unicode. The original_encoding | |
| 437 # attribute is set. | |
| 438 soup_from_utf8 = self.soup(self.utf8_data) | |
| 439 self.assertEqual(soup_from_utf8.decode(), self.unicode_data) | |
| 440 self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') | |
| 441 | |
| 442 def test_utf8_out(self): | |
| 443 # The internal data structures can be encoded as UTF-8. | |
| 444 soup_from_unicode = self.soup(self.unicode_data) | |
| 445 self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) | |
| 446 | |
| 447 @skipIf( | |
| 448 PYTHON_3_PRE_3_2, | |
| 449 "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") | |
| 450 def test_attribute_name_containing_unicode_characters(self): | |
| 451 markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' | |
| 452 self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) | |
| 453 | |
| 454 class TestUnicodeDammit(unittest.TestCase): | |
| 455 """Standalone tests of UnicodeDammit.""" | |
| 456 | |
| 457 def test_unicode_input(self): | |
| 458 markup = "I'm already Unicode! \N{SNOWMAN}" | |
| 459 dammit = UnicodeDammit(markup) | |
| 460 self.assertEqual(dammit.unicode_markup, markup) | |
| 461 | |
| 462 def test_smart_quotes_to_unicode(self): | |
| 463 markup = b"<foo>\x91\x92\x93\x94</foo>" | |
| 464 dammit = UnicodeDammit(markup) | |
| 465 self.assertEqual( | |
| 466 dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") | |
| 467 | |
| 468 def test_smart_quotes_to_xml_entities(self): | |
| 469 markup = b"<foo>\x91\x92\x93\x94</foo>" | |
| 470 dammit = UnicodeDammit(markup, smart_quotes_to="xml") | |
| 471 self.assertEqual( | |
| 472 dammit.unicode_markup, "<foo>‘’“”</foo>") | |
| 473 | |
| 474 def test_smart_quotes_to_html_entities(self): | |
| 475 markup = b"<foo>\x91\x92\x93\x94</foo>" | |
| 476 dammit = UnicodeDammit(markup, smart_quotes_to="html") | |
| 477 self.assertEqual( | |
| 478 dammit.unicode_markup, "<foo>‘’“”</foo>") | |
| 479 | |
| 480 def test_smart_quotes_to_ascii(self): | |
| 481 markup = b"<foo>\x91\x92\x93\x94</foo>" | |
| 482 dammit = UnicodeDammit(markup, smart_quotes_to="ascii") | |
| 483 self.assertEqual( | |
| 484 dammit.unicode_markup, """<foo>''""</foo>""") | |
| 485 | |
| 486 def test_detect_utf8(self): | |
| 487 utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" | |
| 488 dammit = UnicodeDammit(utf8) | |
| 489 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | |
| 490 self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') | |
| 491 | |
| 492 | |
| 493 def test_convert_hebrew(self): | |
| 494 hebrew = b"\xed\xe5\xec\xf9" | |
| 495 dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) | |
| 496 self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') | |
| 497 self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') | |
| 498 | |
| 499 def test_dont_see_smart_quotes_where_there_are_none(self): | |
| 500 utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" | |
| 501 dammit = UnicodeDammit(utf_8) | |
| 502 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | |
| 503 self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) | |
| 504 | |
| 505 def test_ignore_inappropriate_codecs(self): | |
| 506 utf8_data = "Räksmörgås".encode("utf-8") | |
| 507 dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) | |
| 508 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | |
| 509 | |
| 510 def test_ignore_invalid_codecs(self): | |
| 511 utf8_data = "Räksmörgås".encode("utf-8") | |
| 512 for bad_encoding in ['.utf8', '...', 'utF---16.!']: | |
| 513 dammit = UnicodeDammit(utf8_data, [bad_encoding]) | |
| 514 self.assertEqual(dammit.original_encoding.lower(), 'utf-8') | |
| 515 | |
| 516 def test_exclude_encodings(self): | |
| 517 # This is UTF-8. | |
| 518 utf8_data = "Räksmörgås".encode("utf-8") | |
| 519 | |
| 520 # But if we exclude UTF-8 from consideration, the guess is | |
| 521 # Windows-1252. | |
| 522 dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) | |
| 523 self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') | |
| 524 | |
| 525 # And if we exclude that, there is no valid guess at all. | |
| 526 dammit = UnicodeDammit( | |
| 527 utf8_data, exclude_encodings=["utf-8", "windows-1252"]) | |
| 528 self.assertEqual(dammit.original_encoding, None) | |
| 529 | |
| 530 def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): | |
| 531 detected = EncodingDetector( | |
| 532 b'<?xml version="1.0" encoding="UTF-\xdb" ?>') | |
| 533 encodings = list(detected.encodings) | |
| 534 assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings | |
| 535 | |
| 536 def test_detect_html5_style_meta_tag(self): | |
| 537 | |
| 538 for data in ( | |
| 539 b'<html><meta charset="euc-jp" /></html>', | |
| 540 b"<html><meta charset='euc-jp' /></html>", | |
| 541 b"<html><meta charset=euc-jp /></html>", | |
| 542 b"<html><meta charset=euc-jp/></html>"): | |
| 543 dammit = UnicodeDammit(data, is_html=True) | |
| 544 self.assertEqual( | |
| 545 "euc-jp", dammit.original_encoding) | |
| 546 | |
| 547 def test_last_ditch_entity_replacement(self): | |
| 548 # This is a UTF-8 document that contains bytestrings | |
| 549 # completely incompatible with UTF-8 (ie. encoded with some other | |
| 550 # encoding). | |
| 551 # | |
| 552 # Since there is no consistent encoding for the document, | |
| 553 # Unicode, Dammit will eventually encode the document as UTF-8 | |
| 554 # and encode the incompatible characters as REPLACEMENT | |
| 555 # CHARACTER. | |
| 556 # | |
| 557 # If chardet is installed, it will detect that the document | |
| 558 # can be converted into ISO-8859-1 without errors. This happens | |
| 559 # to be the wrong encoding, but it is a consistent encoding, so the | |
| 560 # code we're testing here won't run. | |
| 561 # | |
| 562 # So we temporarily disable chardet if it's present. | |
| 563 doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> | |
| 564 <html><b>\330\250\330\252\330\261</b> | |
| 565 <i>\310\322\321\220\312\321\355\344</i></html>""" | |
| 566 chardet = bs4.dammit.chardet_dammit | |
| 567 logging.disable(logging.WARNING) | |
| 568 try: | |
| 569 def noop(str): | |
| 570 return None | |
| 571 bs4.dammit.chardet_dammit = noop | |
| 572 dammit = UnicodeDammit(doc) | |
| 573 self.assertEqual(True, dammit.contains_replacement_characters) | |
| 574 self.assertTrue("\ufffd" in dammit.unicode_markup) | |
| 575 | |
| 576 soup = BeautifulSoup(doc, "html.parser") | |
| 577 self.assertTrue(soup.contains_replacement_characters) | |
| 578 finally: | |
| 579 logging.disable(logging.NOTSET) | |
| 580 bs4.dammit.chardet_dammit = chardet | |
| 581 | |
| 582 def test_byte_order_mark_removed(self): | |
| 583 # A document written in UTF-16LE will have its byte order marker stripped. | |
| 584 data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' | |
| 585 dammit = UnicodeDammit(data) | |
| 586 self.assertEqual("<a>áé</a>", dammit.unicode_markup) | |
| 587 self.assertEqual("utf-16le", dammit.original_encoding) | |
| 588 | |
| 589 def test_detwingle(self): | |
| 590 # Here's a UTF8 document. | |
| 591 utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") | |
| 592 | |
| 593 # Here's a Windows-1252 document. | |
| 594 windows_1252 = ( | |
| 595 "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" | |
| 596 "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") | |
| 597 | |
| 598 # Through some unholy alchemy, they've been stuck together. | |
| 599 doc = utf8 + windows_1252 + utf8 | |
| 600 | |
| 601 # The document can't be turned into UTF-8: | |
| 602 self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") | |
| 603 | |
| 604 # Unicode, Dammit thinks the whole document is Windows-1252, | |
| 605 # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" | |
| 606 | |
| 607 # But if we run it through fix_embedded_windows_1252, it's fixed: | |
| 608 | |
| 609 fixed = UnicodeDammit.detwingle(doc) | |
| 610 self.assertEqual( | |
| 611 "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) | |
| 612 | |
| 613 def test_detwingle_ignores_multibyte_characters(self): | |
| 614 # Each of these characters has a UTF-8 representation ending | |
| 615 # in \x93. \x93 is a smart quote if interpreted as | |
| 616 # Windows-1252. But our code knows to skip over multibyte | |
| 617 # UTF-8 characters, so they'll survive the process unscathed. | |
| 618 for tricky_unicode_char in ( | |
| 619 "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' | |
| 620 "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' | |
| 621 "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. | |
| 622 ): | |
| 623 input = tricky_unicode_char.encode("utf8") | |
| 624 self.assertTrue(input.endswith(b'\x93')) | |
| 625 output = UnicodeDammit.detwingle(input) | |
| 626 self.assertEqual(output, input) | |
| 627 | |
| 628 def test_find_declared_encoding(self): | |
| 629 # Test our ability to find a declared encoding inside an | |
| 630 # XML or HTML document. | |
| 631 # | |
| 632 # Even if the document comes in as Unicode, it may be | |
| 633 # interesting to know what encoding was claimed | |
| 634 # originally. | |
| 635 | |
| 636 html_unicode = '<html><head><meta charset="utf-8"></head></html>' | |
| 637 html_bytes = html_unicode.encode("ascii") | |
| 638 | |
| 639 xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>' | |
| 640 xml_bytes = xml_unicode.encode("ascii") | |
| 641 | |
| 642 m = EncodingDetector.find_declared_encoding | |
| 643 self.assertEqual(None, m(html_unicode, is_html=False)) | |
| 644 self.assertEqual("utf-8", m(html_unicode, is_html=True)) | |
| 645 self.assertEqual("utf-8", m(html_bytes, is_html=True)) | |
| 646 | |
| 647 self.assertEqual("iso-8859-1", m(xml_unicode)) | |
| 648 self.assertEqual("iso-8859-1", m(xml_bytes)) | |
| 649 | |
| 650 # Normally, only the first few kilobytes of a document are checked for | |
| 651 # an encoding. | |
| 652 spacer = b' ' * 5000 | |
| 653 self.assertEqual(None, m(spacer + html_bytes)) | |
| 654 self.assertEqual(None, m(spacer + xml_bytes)) | |
| 655 | |
| 656 # But you can tell find_declared_encoding to search an entire | |
| 657 # HTML document. | |
| 658 self.assertEqual( | |
| 659 "utf-8", | |
| 660 m(spacer + html_bytes, is_html=True, search_entire_document=True) | |
| 661 ) | |
| 662 | |
| 663 # The XML encoding declaration has to be the very first thing | |
| 664 # in the document. We'll allow whitespace before the document | |
| 665 # starts, but nothing else. | |
| 666 self.assertEqual( | |
| 667 "iso-8859-1", | |
| 668 m(xml_bytes, search_entire_document=True) | |
| 669 ) | |
| 670 self.assertEqual( | |
| 671 None, m(b'a' + xml_bytes, search_entire_document=True) | |
| 672 ) | |
| 673 | |
| 674 class TestNamedspacedAttribute(SoupTest): | |
| 675 | |
| 676 def test_name_may_be_none_or_missing(self): | |
| 677 a = NamespacedAttribute("xmlns", None) | |
| 678 self.assertEqual(a, "xmlns") | |
| 679 | |
| 680 a = NamespacedAttribute("xmlns") | |
| 681 self.assertEqual(a, "xmlns") | |
| 682 | |
| 683 def test_attribute_is_equivalent_to_colon_separated_string(self): | |
| 684 a = NamespacedAttribute("a", "b") | |
| 685 self.assertEqual("a:b", a) | |
| 686 | |
| 687 def test_attributes_are_equivalent_if_prefix_and_name_identical(self): | |
| 688 a = NamespacedAttribute("a", "b", "c") | |
| 689 b = NamespacedAttribute("a", "b", "c") | |
| 690 self.assertEqual(a, b) | |
| 691 | |
| 692 # The actual namespace is not considered. | |
| 693 c = NamespacedAttribute("a", "b", None) | |
| 694 self.assertEqual(a, c) | |
| 695 | |
| 696 # But name and prefix are important. | |
| 697 d = NamespacedAttribute("a", "z", "c") | |
| 698 self.assertNotEqual(a, d) | |
| 699 | |
| 700 e = NamespacedAttribute("z", "b", "c") | |
| 701 self.assertNotEqual(a, e) | |
| 702 | |
| 703 | |
| 704 class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): | |
| 705 | |
| 706 def test_content_meta_attribute_value(self): | |
| 707 value = CharsetMetaAttributeValue("euc-jp") | |
| 708 self.assertEqual("euc-jp", value) | |
| 709 self.assertEqual("euc-jp", value.original_value) | |
| 710 self.assertEqual("utf8", value.encode("utf8")) | |
| 711 | |
| 712 | |
| 713 def test_content_meta_attribute_value(self): | |
| 714 value = ContentMetaAttributeValue("text/html; charset=euc-jp") | |
| 715 self.assertEqual("text/html; charset=euc-jp", value) | |
| 716 self.assertEqual("text/html; charset=euc-jp", value.original_value) | |
| 717 self.assertEqual("text/html; charset=utf8", value.encode("utf8")) |
