Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/bs4/tests/test_soup.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """Tests of Beautiful Soup as a whole.""" | |
3 | |
4 from pdb import set_trace | |
5 import logging | |
6 import unittest | |
7 import sys | |
8 import tempfile | |
9 | |
10 from bs4 import ( | |
11 BeautifulSoup, | |
12 BeautifulStoneSoup, | |
13 ) | |
14 from bs4.builder import ( | |
15 TreeBuilder, | |
16 ParserRejectedMarkup, | |
17 ) | |
18 from bs4.element import ( | |
19 CharsetMetaAttributeValue, | |
20 Comment, | |
21 ContentMetaAttributeValue, | |
22 SoupStrainer, | |
23 NamespacedAttribute, | |
24 Tag, | |
25 NavigableString, | |
26 ) | |
27 | |
28 import bs4.dammit | |
29 from bs4.dammit import ( | |
30 EntitySubstitution, | |
31 UnicodeDammit, | |
32 EncodingDetector, | |
33 ) | |
34 from bs4.testing import ( | |
35 default_builder, | |
36 SoupTest, | |
37 skipIf, | |
38 ) | |
39 import warnings | |
40 | |
41 try: | |
42 from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | |
43 LXML_PRESENT = True | |
44 except ImportError as e: | |
45 LXML_PRESENT = False | |
46 | |
47 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) | |
48 | |
class TestConstructor(SoupTest):
    # Tests of the BeautifulSoup constructor's handling of markup,
    # encodings, and builder configuration.

    def test_short_unicode_input(self):
        # A very short Unicode document survives the parse intact.
        markup = "<h1>éé</h1>"
        self.assertEqual("éé", self.soup(markup).h1.string)

    def test_embedded_null(self):
        # An embedded NUL character is preserved, not treated as a terminator.
        markup = "<h1>foo\0bar</h1>"
        self.assertEqual("foo\0bar", self.soup(markup).h1.string)

    def test_exclude_encodings(self):
        # Ruling out the true encoding (UTF-8) forces the next-best guess.
        encoded = "Räksmörgås".encode("utf-8")
        parsed = self.soup(encoded, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", parsed.original_encoding)
65 | |
66 def test_custom_builder_class(self): | |
67 # Verify that you can pass in a custom Builder class and | |
68 # it'll be instantiated with the appropriate keyword arguments. | |
69 class Mock(object): | |
70 def __init__(self, **kwargs): | |
71 self.called_with = kwargs | |
72 self.is_xml = True | |
73 self.store_line_numbers = False | |
74 self.cdata_list_attributes = [] | |
75 self.preserve_whitespace_tags = [] | |
76 self.string_containers = {} | |
77 def initialize_soup(self, soup): | |
78 pass | |
79 def feed(self, markup): | |
80 self.fed = markup | |
81 def reset(self): | |
82 pass | |
83 def ignore(self, ignore): | |
84 pass | |
85 set_up_substitutions = can_be_empty_element = ignore | |
86 def prepare_markup(self, *args, **kwargs): | |
87 yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters" | |
88 | |
89 kwargs = dict( | |
90 var="value", | |
91 # This is a deprecated BS3-era keyword argument, which | |
92 # will be stripped out. | |
93 convertEntities=True, | |
94 ) | |
95 with warnings.catch_warnings(record=True): | |
96 soup = BeautifulSoup('', builder=Mock, **kwargs) | |
97 assert isinstance(soup.builder, Mock) | |
98 self.assertEqual(dict(var="value"), soup.builder.called_with) | |
99 self.assertEqual("prepared markup", soup.builder.fed) | |
100 | |
101 # You can also instantiate the TreeBuilder yourself. In this | |
102 # case, that specific object is used and any keyword arguments | |
103 # to the BeautifulSoup constructor are ignored. | |
104 builder = Mock(**kwargs) | |
105 with warnings.catch_warnings(record=True) as w: | |
106 soup = BeautifulSoup( | |
107 '', builder=builder, ignored_value=True, | |
108 ) | |
109 msg = str(w[0].message) | |
110 assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.") | |
111 self.assertEqual(builder, soup.builder) | |
112 self.assertEqual(kwargs, builder.called_with) | |
113 | |
114 def test_parser_markup_rejection(self): | |
115 # If markup is completely rejected by the parser, an | |
116 # explanatory ParserRejectedMarkup exception is raised. | |
117 class Mock(TreeBuilder): | |
118 def feed(self, *args, **kwargs): | |
119 raise ParserRejectedMarkup("Nope.") | |
120 | |
121 def prepare_markup(self, *args, **kwargs): | |
122 # We're going to try two different ways of preparing this markup, | |
123 # but feed() will reject both of them. | |
124 yield markup, None, None, False | |
125 yield markup, None, None, False | |
126 | |
127 import re | |
128 self.assertRaisesRegex( | |
129 ParserRejectedMarkup, | |
130 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.", | |
131 BeautifulSoup, '', builder=Mock, | |
132 ) | |
133 | |
134 def test_cdata_list_attributes(self): | |
135 # Most attribute values are represented as scalars, but the | |
136 # HTML standard says that some attributes, like 'class' have | |
137 # space-separated lists as values. | |
138 markup = '<a id=" an id " class=" a class "></a>' | |
139 soup = self.soup(markup) | |
140 | |
141 # Note that the spaces are stripped for 'class' but not for 'id'. | |
142 a = soup.a | |
143 self.assertEqual(" an id ", a['id']) | |
144 self.assertEqual(["a", "class"], a['class']) | |
145 | |
146 # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets | |
147 # you customize or disable this. As always, you can customize the TreeBuilder | |
148 # by passing in a keyword argument to the BeautifulSoup constructor. | |
149 soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) | |
150 self.assertEqual(" a class ", soup.a['class']) | |
151 | |
152 # Here are two ways of saying that `id` is a multi-valued | |
153 # attribute in this context, but 'class' is not. | |
154 for switcheroo in ({'*': 'id'}, {'a': 'id'}): | |
155 with warnings.catch_warnings(record=True) as w: | |
156 # This will create a warning about not explicitly | |
157 # specifying a parser, but we'll ignore it. | |
158 soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo) | |
159 a = soup.a | |
160 self.assertEqual(["an", "id"], a['id']) | |
161 self.assertEqual(" a class ", a['class']) | |
162 | |
163 def test_replacement_classes(self): | |
164 # Test the ability to pass in replacements for element classes | |
165 # which will be used when building the tree. | |
166 class TagPlus(Tag): | |
167 pass | |
168 | |
169 class StringPlus(NavigableString): | |
170 pass | |
171 | |
172 class CommentPlus(Comment): | |
173 pass | |
174 | |
175 soup = self.soup( | |
176 "<a><b>foo</b>bar</a><!--whee-->", | |
177 element_classes = { | |
178 Tag: TagPlus, | |
179 NavigableString: StringPlus, | |
180 Comment: CommentPlus, | |
181 } | |
182 ) | |
183 | |
184 # The tree was built with TagPlus, StringPlus, and CommentPlus objects, | |
185 # rather than Tag, String, and Comment objects. | |
186 assert all( | |
187 isinstance(x, (TagPlus, StringPlus, CommentPlus)) | |
188 for x in soup.recursiveChildGenerator() | |
189 ) | |
190 | |
191 def test_alternate_string_containers(self): | |
192 # Test the ability to customize the string containers for | |
193 # different types of tags. | |
194 class PString(NavigableString): | |
195 pass | |
196 | |
197 class BString(NavigableString): | |
198 pass | |
199 | |
200 soup = self.soup( | |
201 "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text", | |
202 string_containers = { | |
203 'b': BString, | |
204 'p': PString, | |
205 } | |
206 ) | |
207 | |
208 # The string before the <p> tag is a regular NavigableString. | |
209 assert isinstance(soup.div.contents[0], NavigableString) | |
210 | |
211 # The string inside the <p> tag, but not inside the <i> tag, | |
212 # is a PString. | |
213 assert isinstance(soup.p.contents[0], PString) | |
214 | |
215 # Every string inside the <b> tag is a BString, even the one that | |
216 # was also inside an <i> tag. | |
217 for s in soup.b.strings: | |
218 assert isinstance(s, BString) | |
219 | |
220 # Now that parsing was complete, the string_container_stack | |
221 # (where this information was kept) has been cleared out. | |
222 self.assertEqual([], soup.string_container_stack) | |
223 | |
224 | |
class TestWarnings(SoupTest):
    # Tests of the warnings BeautifulSoup issues for missing parsers and
    # deprecated BS3-era arguments.

    def _assert_no_parser_specified(self, s, is_there=True):
        # BUG FIX: this helper was named _no_parser_specified, but every
        # caller invokes self._assert_no_parser_specified, which raised
        # AttributeError at test time. Renamed to match the call sites.
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
            msg = str(w[0].message)
            self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" is ambiguous between html.parser, lxml, and html5lib.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
            msg = str(w[0].message)
            self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
            self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The old BS3 name still works but triggers a rename warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
            msg = str(w[0].message)
            self.assertTrue("parseOnlyThese" in msg)
            self.assertTrue("parse_only" in msg)
            self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The old BS3 name still works but triggers a rename warning.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
            msg = str(w[0].message)
            self.assertTrue("fromEncoding" in msg)
            self.assertTrue("from_encoding" in msg)
            self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
class TestFilenameAndURLWarnings(SoupTest):
    # BUG FIX: this class was also named TestWarnings, which rebound the
    # module attribute and hid the earlier TestWarnings class from unittest
    # discovery — its tests never ran. Renamed so both classes' tests run.

    def test_disk_file_warning(self):
        # Markup that is exactly the name of an existing file gets a
        # "looks like a filename" warning.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
                msg = str(w[0].message)
                self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
            self.assertEqual(0, len(w))

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        # Be aware this isn't the only warning that can be raised during
        # execution..
        self.assertTrue(any("looks like a URL" in str(w.message)
                            for w in warning_list))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        self.assertTrue(any("looks like a URL" in str(w.message)
                            for w in warning_list))

    def test_url_warning_with_bytes_and_space(self):
        # Markup with a space in it can't be a URL, so no warning.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))
314 | |
315 | |
class TestSelectiveParsing(SoupTest):
    # Tests of parsing only part of a document via SoupStrainer.

    def test_parse_with_soupstrainer(self):
        # Only <b> tags (and their contents) survive the strained parse.
        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        parsed = self.soup(document, parse_only=only_b_tags)
        self.assertEqual(parsed.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
323 | |
324 | |
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    # BUG FIX (several methods below): the expected strings in this class
    # had been corrupted in transit — HTML entity references such as
    # &forall;, &lsquo;, &lt;, &amp; and &quot; were decoded into literal
    # characters, turning the assertions into either vacuous or failing
    # comparisons. The entity-reference forms that substitute_html() and
    # substitute_xml() actually produce have been restored.

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # (Also fixed the "converstion" typo in this test's name.)
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
392 | |
393 | |
class TestEncodingConversion(SoupTest):
    """Test Beautiful Soup's ability to decode and encode documents in
    various encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is decoded to Unicode, with original_encoding set to
        # 'utf-8' (a superset of ASCII).
        saved_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def never_detect(str):
                return None
            # Disable chardet, which would otherwise notice that the
            # ASCII really is ASCII.
            bs4.dammit.chardet_dammit = never_detect
            ascii_bytes = b"<foo>a</foo>"
            parsed = self.soup(ascii_bytes)
            decoded = parsed.decode()
            self.assertTrue(isinstance(decoded, str))
            self.assertEqual(decoded, self.document_for(ascii_bytes.decode()))
            self.assertEqual(parsed.original_encoding.lower(), "utf-8")
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone; original_encoding stays unset.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(parsed.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is decoded to Unicode and original_encoding is set.
        parsed = self.soup(self.utf8_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be re-encoded as UTF-8.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        document = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(
            self.soup(document).div.encode("utf8"), document.encode("utf8"))
453 | |
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # Unicode input passes through untouched.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become Unicode quote characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # BUG FIX: the expected string had been corrupted in transit — the
        # numeric XML entity references were decoded into literal curly
        # quotes, which is the *default* behavior, not what
        # smart_quotes_to="xml" produces. Restored the entity references.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        # BUG FIX: as above — restored the named HTML entity references
        # that smart_quotes_to="html" actually produces.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        # Smart quotes can be flattened to plain ASCII quote characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested codec that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Nonsense codec names are ignored rather than raising LookupError.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # All the HTML5 spellings of <meta charset> are recognized.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
673 | |
class TestNamedspacedAttribute(SoupTest):
    # Tests of NamespacedAttribute, the string subclass used for
    # namespaced attribute names like "xlink:href".

    def test_name_may_be_none_or_missing(self):
        # With no local name, the attribute is just its prefix.
        self.assertEqual(NamespacedAttribute("xmlns", None), "xmlns")
        self.assertEqual(NamespacedAttribute("xmlns"), "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        self.assertEqual("a:b", NamespacedAttribute("a", "b"))

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, NamespacedAttribute("a", "b", "c"))

        # The actual namespace is not considered.
        self.assertEqual(reference, NamespacedAttribute("a", "b", None))

        # But name and prefix both are.
        self.assertNotEqual(reference, NamespacedAttribute("a", "z", "c"))
        self.assertNotEqual(reference, NamespacedAttribute("z", "b", "c"))
702 | |
703 | |
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    # Tests of the attribute-value classes that rewrite themselves when a
    # document is re-encoded to a new charset.

    def test_charset_meta_attribute_value(self):
        # BUG FIX: this test was also named test_content_meta_attribute_value,
        # so the duplicate definition below shadowed it and it never ran.
        # Renamed to match the class it exercises.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # Encoding to a new charset replaces the value outright.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Only the charset= portion is rewritten on encode.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))