comparison env/lib/python3.9/site-packages/bs4/testing.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # encoding: utf-8
2 """Helper classes for tests."""
3
4 # Use of this source code is governed by the MIT license.
5 __license__ = "MIT"
6
7 import pickle
8 import copy
9 import functools
10 import unittest
11 from unittest import TestCase
12 from bs4 import BeautifulSoup
13 from bs4.element import (
14 CharsetMetaAttributeValue,
15 Comment,
16 ContentMetaAttributeValue,
17 Doctype,
18 PYTHON_SPECIFIC_ENCODINGS,
19 SoupStrainer,
20 Script,
21 Stylesheet,
22 Tag
23 )
24
25 from bs4.builder import HTMLParserTreeBuilder
# Tree builder class that SoupTest falls back on when a test does not pass
# its own ``builder``.
default_builder = HTMLParserTreeBuilder
27
# A grab bag of malformed or unusual markup. Most lines are self-describing:
# the text inside each snippet says what is wrong with it.
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
63
64
class SoupTest(unittest.TestCase):
    """TestCase base class with helpers for parsing markup and checking trees.

    Markup is parsed with the builder returned by ``default_builder`` unless
    a caller of ``soup()`` passes its own ``builder`` keyword.
    """

    @property
    def default_builder(self):
        """Return the module-level ``default_builder``."""
        return default_builder

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        # A caller-supplied builder wins; otherwise use this class's default.
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup, **kwargs):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Parse ``to_parse`` and check the result against an expected document.

        The decoded parse result must equal
        ``document_for(compare_parsed_to)``; when ``compare_parsed_to`` is
        None, ``to_parse`` itself is used as the expectation. The parser's
        bookkeeping (open tag counter and tag stack) is checked as well.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        # Verify that the documents come out the same.
        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

        # Also run some checks on the BeautifulSoup object itself:

        # Verify that every tag that was opened was eventually closed.

        # There are no tags in the open tag counter.
        assert all(v==0 for v in list(obj.open_tag_counter.values()))

        # The only tag in the tag stack is the one for the root
        # document.
        self.assertEqual(
            [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
        )

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            if earlier:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e

    def linkage_validator(self, el, _recursive_call=False):
        """Ensure proper linkage throughout the document.

        Walks ``el.contents`` recursively, asserting that the sibling and
        element pointers of every node agree with its position.

        The top-level call (``_recursive_call`` false) returns None.
        Recursive calls return the last node visited under ``el`` (or ``el``
        itself when it has no contents) so the caller can check what follows
        that node.
        """
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        idx = 0
        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for child in el.contents:
            # Reset per child; set below only if this child has contents.
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert el.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                            el, el.next_element, child
                        )
                    assert child.previous_element is el,\
                        "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                            child, child.previous_element, el
                        )
                    assert child.previous_sibling is None,\
                        "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                            child, child.previous_sibling, None
                        )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                assert child.previous_sibling is el.contents[idx - 1],\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, el.contents[idx - 1]
                    )
                assert el.contents[idx - 1].next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
                    )

                if last_child is not None:
                    assert child.previous_element is last_child,\
                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                            child, child.previous_element, last_child, child.parent.contents
                        )
                    assert last_child.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            last_child, last_child.next_element, child
                        )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # A bubbled up descendant should have no next siblings
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If this is the last child, it has no next sibling
            if idx == last_idx:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )
            idx += 1

        # After the loop, ``child`` is the last child (or None if there were
        # none); prefer the last node found beneath it when there is one.
        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if not _recursive_call and child is not None:
            # Climb from ``el`` until an ancestor has a next sibling; the last
            # node visited must point at it, or at None if the root is reached.
            target = el
            while True:
                if target is None:
                    assert child.next_element is None, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, None
                        )
                    break
                elif target.next_sibling is not None:
                    assert child.next_element is target.next_sibling, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, target.next_sibling
                        )
                    break
                target = target.parent

            # We are done, so nothing to return
            return None
        else:
            # Return the child to the recursive caller
            return child
229
230
231 class HTMLTreeBuilderSmokeTest(object):
232
233 """A basic test of a treebuilder's competence.
234
235 Any HTML treebuilder, present or future, should be able to pass
236 these tests. With invalid markup, there's room for interpretation,
237 and different parsers can handle it differently. But with the
238 markup in these tests, there's not much room for interpretation.
239 """
240
241 def test_empty_element_tags(self):
242 """Verify that all HTML4 and HTML5 empty element (aka void element) tags
243 are handled correctly.
244 """
245 for name in [
246 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
247 'spacer', 'frame'
248 ]:
249 soup = self.soup("")
250 new_tag = soup.new_tag(name)
251 self.assertEqual(True, new_tag.is_empty_element)
252
def test_special_string_containers(self):
    """<style> and <script> contents use their dedicated string classes."""
    parsed = self.soup(
        "<style>Some CSS</style><script>Some Javascript</script>"
    )
    css = parsed.style.string
    assert isinstance(css, Stylesheet)
    js = parsed.script.string
    assert isinstance(js, Script)

    parsed = self.soup(
        "<style><!--Some CSS--></style>"
    )
    css = parsed.style.string
    assert isinstance(css, Stylesheet)
    # Inside <style>, text that looks like an HTML comment is still
    # stylesheet text, not a comment.
    self.assertEqual("<!--Some CSS-->", css)
    assert isinstance(parsed.style.string, Stylesheet)
268
def test_pickle_and_unpickle_identity(self):
    """A tree that is pickled and unpickled is identical to the original."""
    original = self.soup("<a><b>foo</a>")
    # Protocol 2, as a positional argument.
    restored = pickle.loads(pickle.dumps(original, 2))
    self.assertEqual(restored.__class__, BeautifulSoup)
    self.assertEqual(restored.decode(), original.decode())
277
def assertDoctypeHandled(self, doctype_fragment):
    """Assert that a given doctype string is handled correctly."""
    expected_prefix, parsed = self._document_with_doctype(doctype_fragment)

    # The first node must be a Doctype carrying the fragment, and the
    # encoded document must start with the doctype declaration.
    first_node = parsed.contents[0]
    self.assertEqual(first_node.__class__, Doctype)
    self.assertEqual(first_node, doctype_fragment)
    encoded_prefix = parsed.encode("utf8")[:len(expected_prefix)]
    self.assertEqual(encoded_prefix, expected_prefix)

    # The rest of the document must have parsed too.
    self.assertEqual(parsed.p.contents[0], 'foo')
294
295 def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
296 """Generate and parse a document with the given doctype."""
297 doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
298 markup = doctype + '\n<p>foo</p>'
299 soup = self.soup(markup)
300 return doctype.encode("utf8"), soup
301
302 def test_normal_doctypes(self):
303 """Make sure normal, everyday HTML doctypes are handled correctly."""
304 self.assertDoctypeHandled("html")
305 self.assertDoctypeHandled(
306 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
307
def test_empty_doctype(self):
    """A bare <!DOCTYPE> produces a first node that strips to an empty string."""
    first_node = self.soup("<!DOCTYPE>").contents[0]
    self.assertEqual("", first_node.strip())
312
def test_mixed_case_doctype(self):
    """A lowercase or mixed-case doctype keyword still yields a Doctype."""
    for keyword in ("doctype", "DocType"):
        declaration, parsed = self._document_with_doctype("html", keyword)

        # A Doctype node was created, and on output the keyword is
        # written in upper case.
        first_node = parsed.contents[0]
        self.assertEqual(first_node.__class__, Doctype)
        self.assertEqual(first_node, "html")
        encoded_prefix = parsed.encode("utf8")[:len(declaration)]
        self.assertEqual(encoded_prefix, b"<!DOCTYPE html>")

        # The rest of the document must have parsed too.
        self.assertEqual(parsed.p.contents[0], 'foo')
333
334 def test_public_doctype_with_url(self):
335 doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
336 self.assertDoctypeHandled(doctype)
337
338 def test_system_doctype(self):
339 self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
340
341 def test_namespaced_system_doctype(self):
342 # We can handle a namespaced doctype with a system ID.
343 self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
344
345 def test_namespaced_public_doctype(self):
346 # Test a namespaced doctype with a public id.
347 self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
348
def test_real_xhtml_document(self):
    """A real XHTML document should come out more or less the same as it went in."""
    xhtml = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
    round_tripped = self.soup(xhtml).encode("utf-8")
    # Newlines are removed from both sides before comparing.
    self.assertEqual(
        round_tripped.replace(b"\n", b""),
        xhtml.replace(b"\n", b""))
361
def test_namespaced_html(self):
    """When a namespaced XML document is parsed as HTML it should
    be treated as HTML with weird tag names.
    """
    parsed = self.soup(b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""")
    matches = parsed.find_all("ns1:foo")
    self.assertEqual(2, len(matches))
369
def test_processing_instruction(self):
    """A processing instruction round-trips from both str and bytes markup."""
    # Both input types are tried so that process_markup is shown to set
    # processing_instruction_class even when the markup is already
    # Unicode and needs no conversion.
    text_markup = """<?PITarget PIContent?>"""
    self.assertEqual(text_markup, self.soup(text_markup).decode())

    byte_markup = b"""<?PITarget PIContent?>"""
    self.assertEqual(byte_markup, self.soup(byte_markup).encode("utf8"))
382
383 def test_deepcopy(self):
384 """Make sure you can copy the tree builder.
385
386 This is important because the builder is part of a
387 BeautifulSoup object, and we want to be able to copy that.
388 """
389 copy.deepcopy(self.default_builder)
390
def test_p_tag_is_never_empty_element(self):
    """A <p> tag is never designated as an empty-element tag.

    Even if the markup shows it as an empty-element tag, it
    shouldn't be presented that way.
    """
    parsed = self.soup("<p/>")
    paragraph = parsed.p
    self.assertFalse(paragraph.is_empty_element)
    self.assertEqual(str(parsed.p), "<p></p>")
400
401 def test_unclosed_tags_get_closed(self):
402 """A tag that's not closed by the end of the document should be closed.
403
404 This applies to all tags except empty-element tags.
405 """
406 self.assertSoupEquals("<p>", "<p></p>")
407 self.assertSoupEquals("<b>", "<b></b>")
408
409 self.assertSoupEquals("<br>", "<br/>")
410
def test_br_is_always_empty_element_tag(self):
    """A <br> tag is designated as an empty-element tag.

    Some parsers treat <br></br> as one <br/> tag, some parsers as
    two tags, but it should always be an empty-element tag.
    """
    parsed = self.soup("<br></br>")
    line_break = parsed.br
    self.assertTrue(line_break.is_empty_element)
    self.assertEqual(str(parsed.br), "<br/>")
420
421 def test_nested_formatting_elements(self):
422 self.assertSoupEquals("<em><em></em></em>")
423
def test_double_head(self):
    """A <script> sitting between </head> and <body> is still found with its attributes."""
    document = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
    script = self.soup(document).find('script')
    self.assertEqual("text/javascript", script['type'])
440
def test_comment(self):
    """Comments become Comment objects linked into the tree."""
    markup = "<p>foo<!--foobar-->baz</p>"
    self.assertSoupEquals(markup)

    parsed = self.soup(markup)
    comment = parsed.find(text="foobar")
    self.assertEqual(comment.__class__, Comment)

    # The comment sits between its neighbours in document order.
    before = parsed.find(text="foo")
    self.assertEqual(comment, before.next_element)
    after = parsed.find(text="baz")
    self.assertEqual(comment, after.previous_element)
455
def test_preserved_whitespace_in_pre_and_textarea(self):
    """Whitespace must be preserved in <pre> and <textarea> tags,
    even if that would mean not prettifying the markup.
    """
    pre = "<pre> </pre>"
    textarea = "<textarea> woo\nwoo </textarea>"
    self.assertSoupEquals(pre)
    self.assertSoupEquals(textarea)

    # prettify() must return these tags exactly as written.
    self.assertEqual(self.soup(pre).pre.prettify(), pre)
    self.assertEqual(self.soup(textarea).textarea.prettify(), textarea)

    empty = self.soup("<textarea></textarea>")
    self.assertEqual(empty.textarea.prettify(), "<textarea></textarea>")
473
474 def test_nested_inline_elements(self):
475 """Inline elements can be nested indefinitely."""
476 b_tag = "<b>Inside a B tag</b>"
477 self.assertSoupEquals(b_tag)
478
479 nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
480 self.assertSoupEquals(nested_b_tag)
481
482 double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
483 self.assertSoupEquals(nested_b_tag)
484
def test_nested_block_level_elements(self):
    """Block elements can be nested."""
    quote = self.soup('<blockquote><p><b>Foo</b></p></blockquote>').blockquote
    # The <b> is reachable both through <p> and directly from the blockquote.
    self.assertEqual(quote.p.b.string, 'Foo')
    self.assertEqual(quote.b.string, 'Foo')
491
492 def test_correctly_nested_tables(self):
493 """One table can go inside another one."""
494 markup = ('<table id="1">'
495 '<tr>'
496 "<td>Here's another table:"
497 '<table id="2">'
498 '<tr><td>foo</td></tr>'
499 '</table></td>')
500
501 self.assertSoupEquals(
502 markup,
503 '<table id="1"><tr><td>Here\'s another table:'
504 '<table id="2"><tr><td>foo</td></tr></table>'
505 '</td></tr></table>')
506
507 self.assertSoupEquals(
508 "<table><thead><tr><td>Foo</td></tr></thead>"
509 "<tbody><tr><td>Bar</td></tr></tbody>"
510 "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
511
def test_multivalued_attribute_with_whitespace(self):
    """Whitespace around the values of a multi-valued attribute is ignored."""
    parsed = self.soup('<div class=" foo bar "></a>')
    div = parsed.div
    self.assertEqual(['foo', 'bar'], div['class'])

    # Searching by the literal class string behaves as if the extra
    # whitespace had never been there.
    found = parsed.find('div', class_="foo bar")
    self.assertEqual(parsed.div, found)
523
def test_deeply_nested_multivalued_attribute(self):
    """A multi-valued attribute survives tree rearrangement by the parser."""
    # html5lib can set the attributes of the same tag many times
    # as it rearranges the tree, which has caused problems with
    # multivalued attributes.
    parsed = self.soup('<table><div><div class="css"></div></div></table>')
    inner = parsed.div.div
    self.assertEqual(["css"], inner['class'])
531
def test_multivalued_attribute_on_html(self):
    """A multi-valued attribute on <html> is split into a list."""
    # html5lib uses a different API to set the attributes of the
    # <html> tag, which has caused problems with multivalued
    # attributes.
    parsed = self.soup('<html class="a b"></html>')
    classes = parsed.html['class']
    self.assertEqual(["a", "b"], classes)
539
540 def test_angle_brackets_in_attribute_values_are_escaped(self):
541 self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
542
543 def test_strings_resembling_character_entity_references(self):
544 # "&T" and "&p" look like incomplete character entities, but they are
545 # not.
546 self.assertSoupEquals(
547 "<p>&bull; AT&T is in the s&p 500</p>",
548 "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
549 )
550
551 def test_apos_entity(self):
552 self.assertSoupEquals(
553 "<p>Bob&apos;s Bar</p>",
554 "<p>Bob's Bar</p>",
555 )
556
def test_entities_in_foreign_document_encoding(self):
    """Numeric entities are converted to Unicode, including Windows-1252 ones."""
    # &#147; and &#148; are invalid numeric entities referencing
    # Windows-1252 characters. &#45; is a character shared by
    # Windows-1252 and Unicode, and &#9731; exists only in Unicode.
    # Every one of them should come out as a Unicode character.
    parsed = self.soup("<p>&#147;Hello&#148; &#45;&#9731;</p>")
    text = parsed.p.string
    self.assertEqual("“Hello” -☃", text)
568
569 def test_entities_in_attributes_converted_to_unicode(self):
570 expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
571 self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
572 self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
573 self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
574 self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
575
576 def test_entities_in_text_converted_to_unicode(self):
577 expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
578 self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
579 self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
580 self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
581 self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
582
583 def test_quot_entity_converted_to_quotation_mark(self):
584 self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
585 '<p>I said "good day!"</p>')
586
587 def test_out_of_range_entity(self):
588 expect = "\N{REPLACEMENT CHARACTER}"
589 self.assertSoupEquals("&#10000000000000;", expect)
590 self.assertSoupEquals("&#x10000000000000;", expect)
591 self.assertSoupEquals("&#1000000000;", expect)
592
def test_multipart_strings(self):
    "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
    parsed = self.soup("<html><h2>\nfoo</h2><p></p></html>")
    # The element after the <h2> text must be the <p> tag.
    following = parsed.h2.string.next_element
    self.assertEqual("p", following.name)
    self.assertEqual("p", parsed.p.name)
    self.assertConnectedness(parsed)
599
600 def test_empty_element_tags(self):
601 """Verify consistent handling of empty-element tags,
602 no matter how they come in through the markup.
603 """
604 self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
605 self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
606
def test_head_tag_between_head_and_body(self):
    "Prevent recurrence of a bug in the html5lib treebuilder."
    document = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
    parsed = self.soup(document)
    body = parsed.html.body
    self.assertNotEqual(None, body)
    self.assertConnectedness(parsed)
617
def test_multiple_copies_of_a_tag(self):
    "Prevent recurrence of a bug in the html5lib treebuilder."
    document = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
    article = self.soup(document).article
    self.assertConnectedness(article)
634
def test_basic_namespaces(self):
    """Parsers don't need to *understand* namespaces, but at the
    very least they should not choke on namespaces or lose
    data."""

    markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
    parsed = self.soup(markup)
    self.assertEqual(markup, parsed.encode())

    # Each namespace declaration is readable as an attribute of <html>.
    html = parsed.html
    self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
    self.assertEqual(
        'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
    self.assertEqual(
        'http://www.w3.org/2000/svg', html['xmlns:svg'])
649
def test_multivalued_attribute_value_becomes_list(self):
    """A space-separated class attribute is parsed into a list of values."""
    anchor = self.soup(b'<a class="foo bar">').a
    self.assertEqual(['foo', 'bar'], anchor['class'])
654
655 #
656 # Generally speaking, tests below this point are more tests of
657 # Beautiful Soup than tests of the tree builders. But parsers are
658 # weird, so we run these tests separately for every tree builder
659 # to detect any differences between them.
660 #
661
def test_can_parse_unicode_document(self):
    """A str document parses even if its declared encoding can't hold its text."""
    # The markup is already Unicode, yet it declares euc-jp and
    # contains a character that encoding cannot represent.
    markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
    body_text = self.soup(markup).body.string
    self.assertEqual('Sacr\xe9 bleu!', body_text)
669
def test_soupstrainer(self):
    """Parsers should be able to work with SoupStrainers."""
    only_bold = SoupStrainer("b")
    parsed = self.soup(
        "A <b>bold</b> <meta/> <i>statement</i>",
        parse_only=only_bold,
    )
    self.assertEqual(parsed.decode(), "<b>bold</b>")
676
677 def test_single_quote_attribute_values_become_double_quotes(self):
678 self.assertSoupEquals("<foo attr='bar'></foo>",
679 '<foo attr="bar"></foo>')
680
681 def test_attribute_values_with_nested_quotes_are_left_alone(self):
682 text = """<foo attr='bar "brawls" happen'>a</foo>"""
683 self.assertSoupEquals(text)
684
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
    """A value holding both quote kinds has its double quotes written as &quot;."""
    parsed = self.soup("""<foo attr='bar "brawls" happen'>a</foo>""")
    # Replace the value with one containing single and double quotes.
    parsed.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
    rendered = parsed.foo.decode()
    self.assertSoupEquals(
        rendered,
        """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
692
693 def test_ampersand_in_attribute_value_gets_escaped(self):
694 self.assertSoupEquals('<this is="really messed up & stuff"></this>',
695 '<this is="really messed up &amp; stuff"></this>')
696
697 self.assertSoupEquals(
698 '<a href="http://example.org?a=1&b=2;3">foo</a>',
699 '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
700
701 def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
702 self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
703
704 def test_entities_in_strings_converted_during_parsing(self):
705 # Both XML and HTML entities are converted to Unicode characters
706 # during parsing.
707 text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
708 expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
709 self.assertSoupEquals(text, expected)
710
def test_smart_quotes_converted_on_the_way_in(self):
    """Microsoft smart quotes are converted to Unicode characters during parsing."""
    parsed = self.soup(b"<p>\x91Foo\x92</p>")
    expected = "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}"
    self.assertEqual(parsed.p.string, expected)
719
def test_non_breaking_spaces_converted_on_the_way_in(self):
    """&nbsp; entities become NO-BREAK SPACE characters."""
    anchor_text = self.soup("<a>&nbsp;&nbsp;</a>").a.string
    self.assertEqual(anchor_text, "\N{NO-BREAK SPACE}" * 2)
723
def test_entities_converted_on_the_way_out(self):
    """Encoding a tag writes parsed entities as UTF-8 characters, keeping &lt;/&gt;."""
    markup = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
    expected_text = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
    expected_bytes = expected_text.encode("utf-8")
    paragraph = self.soup(markup).p
    self.assertEqual(paragraph.encode("utf-8"), expected_bytes)
729
def test_real_iso_latin_document(self):
    """Smoke test: an ISO-Latin-1 document is re-encoded as UTF-8, meta tag included."""
    # The document as Unicode text. Its <meta> tag claims ISO-Latin-1
    # because the bytes fed to the parser are encoded that way.
    as_text = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
    as_latin1 = as_text.encode("iso-8859-1")

    # Parse the ISO-Latin-1 bytes and write them back out as UTF-8.
    actual = self.soup(as_latin1).encode("utf-8")

    # The output should match the original text, except that the
    # <meta> tag names utf-8 and the whole thing is UTF-8 bytes.
    wanted = as_text.replace("ISO-Latin-1", "utf-8").encode("utf-8")

    self.assertEqual(actual, wanted)
756
def test_real_shift_jis_document(self):
    """Smoke test: a document decoded from Shift-JIS parses and re-encodes cleanly."""
    sjis_bytes = (
        b'<html><head></head><body><pre>'
        b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
        b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
        b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
        b'</pre></body></html>')
    as_text = sjis_bytes.decode("shift-jis")
    parsed = self.soup(as_text)

    # The tree must encode to each target encoding exactly as the
    # source text does.
    for codec in ("utf-8", "euc_jp"):
        self.assertEqual(parsed.encode(codec), as_text.encode(codec))
773
def test_real_hebrew_document(self):
    """A real-world ISO-8859-8 (Hebrew) document converts to UTF-8."""
    raw = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
    parsed = self.soup(raw, from_encoding="iso8859-8")
    # Some tree builders call it iso8859-8, others call it iso-8859-8.
    # That's not a difference we really care about.
    assert parsed.original_encoding in ('iso8859-8', 'iso-8859-8')
    actual = parsed.encode('utf-8')
    self.assertEqual(
        actual,
        raw.decode("iso8859-8").encode("utf-8"))
786
def test_meta_tag_reflects_current_encoding(self):
    """A Content-type <meta> charset follows the encoding the document is written in."""
    # A <meta> tag claiming the document is encoded in Shift-JIS...
    declared = ('<meta content="text/html; charset=x-sjis" '
                'http-equiv="Content-type"/>')

    # ...placed in a small document.
    document = (
        '<html><head>\n%s\n'
        '<meta http-equiv="Content-language" content="ja"/>'
        '</head><body>Shift-JIS markup goes here.') % declared
    parsed = self.soup(document)

    # After parsing, the charset still reads as it was written.
    meta = parsed.find('meta', {'http-equiv': 'Content-type'})
    content = meta['content']
    self.assertEqual('text/html; charset=x-sjis', content)

    # The value is a ContentMetaAttributeValue, though...
    self.assertTrue(isinstance(content, ContentMetaAttributeValue))

    # ...so when encoded it names the target encoding instead.
    self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

    # For the rest of the story, see TestSubstitutions in
    # test_tree.py.
814
def test_html5_style_meta_tag_reflects_current_encoding(self):
    """An HTML5 <meta charset> follows the encoding the document is written in."""
    # A <meta> tag claiming the document is encoded in Shift-JIS...
    declared = ('<meta id="encoding" charset="x-sjis" />')

    # ...placed in a small document.
    document = (
        '<html><head>\n%s\n'
        '<meta http-equiv="Content-language" content="ja"/>'
        '</head><body>Shift-JIS markup goes here.') % declared
    parsed = self.soup(document)

    # After parsing, the charset still reads as it was written.
    meta = parsed.find('meta', id="encoding")
    charset = meta['charset']
    self.assertEqual('x-sjis', charset)

    # The value is a CharsetMetaAttributeValue, though...
    self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

    # ...so when encoded it names the target encoding instead.
    self.assertEqual('utf8', charset.encode("utf8"))
838
839 def test_python_specific_encodings_not_used_in_charset(self):
840 # You can encode an HTML document using a Python-specific
841 # encoding, but that encoding won't be mentioned _inside_ the
842 # resulting document. Instead, the document will appear to
843 # have no encoding.
844 for markup in [
845 b'<meta charset="utf8"></head>'
846 b'<meta id="encoding" charset="utf-8" />'
847 ]:
848 soup = self.soup(markup)
849 for encoding in PYTHON_SPECIFIC_ENCODINGS:
850 if encoding in (
851 'idna', 'mbcs', 'oem', 'undefined',
852 'string_escape', 'string-escape'
853 ):
854 # For one reason or another, these will raise an
855 # exception if we actually try to use them, so don't
856 # bother.
857 continue
858 encoded = soup.encode(encoding)
859 assert b'meta charset=""' in encoded
860 assert encoding.encode("ascii") not in encoded
861
862 def test_tag_with_no_attributes_can_have_attributes_added(self):
863 data = self.soup("<a>text</a>")
864 data.a['foo'] = 'bar'
865 self.assertEqual('<a foo="bar">text</a>', data.a.decode())
866
867 def test_closing_tag_with_no_opening_tag(self):
868 # Without BeautifulSoup.open_tag_counter, the </span> tag will
869 # cause _popToTag to be called over and over again as we look
870 # for a <span> tag that wasn't there. The result is that 'text2'
871 # will show up outside the body of the document.
872 soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
873 self.assertEqual(
874 "<body><div><p>text1</p>text2</div></body>", soup.body.decode()
875 )
876
877 def test_worst_case(self):
878 """Test the worst case (currently) for linking issues."""
879
880 soup = self.soup(BAD_DOCUMENT)
881 self.linkage_validator(soup)
882
883
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for a tree builder that parses XML.

    The class derives from ``object`` and calls ``self.soup``,
    ``self.assertSoupEquals``, ``self.linkage_validator`` and the
    unittest ``assert*`` methods without defining them, so it is
    presumably meant to be mixed into a TestCase that provides those
    helpers -- confirm against the concrete test classes.
    """

    def test_pickle_and_unpickle_identity(self):
        """A pickled and unpickled tree renders the same as the original."""
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        """Encoding a document with no XML declaration adds one."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        """An existing XML declaration survives a round trip unchanged."""
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_python_specific_encodings_not_used_in_xml_declaration(self):
        """Python-only codecs are never named in the XML declaration.

        A document can be encoded with one of PYTHON_SPECIFIC_ENCODINGS,
        but the codec name must not be mentioned inside the output.
        """
        markup = b"""<?xml version="1.0"?>\n<foo/>"""
        soup = self.soup(markup)
        for encoding in PYTHON_SPECIFIC_ENCODINGS:
            if encoding in (
                'idna', 'mbcs', 'oem', 'undefined',
                'string_escape', 'string-escape'
            ):
                # For one reason or another, these will raise an
                # exception if we actually try to use them, so don't
                # bother.
                continue
            encoded = soup.encode(encoding)
            self.assertIn(b'<?xml version="1.0"?>', encoded)
            self.assertNotIn(encoding.encode("ascii"), encoded)

    def test_processing_instruction(self):
        """A processing instruction survives a round trip unchanged."""
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_nested_namespaces(self):
        """Nested and overriding namespace declarations round-trip intact."""
        doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
        soup = self.soup(doc)
        self.assertEqual(doc, soup.encode())

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """In XML output, the text of a <script> tag is entity-escaped."""
        doc = """
<script type="text/javascript">
</script>
"""
        # This test names the "lxml-xml" parser explicitly instead of
        # going through self.soup.
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertIn(b"&lt; &lt; hey &gt; &gt;", encoded)

    def test_can_parse_unicode_document(self):
        """Markup supplied as a Unicode string parses to the right text."""
        # NOTE(review): the declaration below lacks its closing '?';
        # left as found because it may be deliberate -- confirm.
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        """Sibling namespaced tags are closed in the right place."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The generated XML declaration names the requested encoding."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """Only a tag with no contents is written as an empty element."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """xmlns declarations remain readable as attributes of their tag."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced child tag is closed with its prefixed name."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        """A prefixed attribute keeps its prefix when rendered."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute using the built-in xml: prefix round-trips."""
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_find_by_prefixed_name(self):
        """Tags can be found by bare name or by prefix-qualified name."""
        # The <Document> start tag is closed with '>' so the fixture is
        # well-formed XML and the test does not depend on the parser
        # recovering from a broken start tag.
        doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
xmlns:ns1="http://example.com/ns1"
xmlns:ns2="http://example.com/ns2">
<ns1:tag>foo</ns1:tag>
<ns1:tag>bar</ns1:tag>
<ns2:tag key="value">baz</ns2:tag>
</Document>
"""
        soup = self.soup(doc)

        # There are three <tag> tags.
        self.assertEqual(3, len(soup.find_all('tag')))

        # But two of them are ns1:tag and one of them is ns2:tag.
        self.assertEqual(2, len(soup.find_all('ns1:tag')))
        self.assertEqual(1, len(soup.find_all('ns2:tag')))

        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))

    def test_copy_tag_preserves_namespace(self):
        """copy.copy() of a tag keeps the tag's namespace prefix."""
        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""

        soup = self.soup(xml)
        tag = soup.document
        duplicate = copy.copy(tag)

        # The two tags have the same namespace prefix.
        self.assertEqual(tag.prefix, duplicate.prefix)

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

        soup = self.soup(BAD_DOCUMENT)
        self.linkage_validator(soup)
1054
1055
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Additional smoke tests for tree builders that implement HTML5."""

    def test_real_xhtml_document(self):
        # Deliberate no-op override: XHTML is not HTML5, so an HTML5
        # parser is not held to any particular treatment of XHTML input.
        pass

    def test_html_tags_have_namespace(self):
        # Ordinary HTML tags are placed in the XHTML namespace.
        parsed = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", parsed.a.namespace)

    def test_svg_tags_have_namespace(self):
        # Both <svg> and the tags nested in it carry the SVG namespace.
        svg_ns = "http://www.w3.org/2000/svg"
        parsed = self.soup('<svg><circle/></svg>')
        for tag in (parsed.svg, parsed.circle):
            self.assertEqual(svg_ns, tag.namespace)

    def test_mathml_tags_have_namespace(self):
        # Both <math> and the tags nested in it carry the MathML namespace.
        mathml_ns = 'http://www.w3.org/1998/Math/MathML'
        parsed = self.soup('<math><msqrt>5</msqrt></math>')
        for tag in (parsed.math, parsed.msqrt):
            self.assertEqual(mathml_ns, tag.namespace)

    def test_xml_declaration_becomes_comment(self):
        # An XML declaration in front of an HTML document is kept as a
        # Comment node that is followed by the <html> tag.
        parsed = self.soup(
            '<?xml version="1.0" encoding="utf-8"?><html></html>')
        first_node = parsed.contents[0]
        self.assertTrue(isinstance(first_node, Comment))
        self.assertEqual(first_node, '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", first_node.next_element.name)
1090
def skipIf(condition, reason):
    """Build a decorator that disables its target when *condition* is true.

    :param condition: Evaluated for truthiness each time the returned
        decorator is applied.
    :param reason: Accepted but not used anywhere in this function.
    :return: A decorator. Applied to an object, it hands back that same
        object when *condition* is false; otherwise it hands back a
        stand-in callable that takes at least one positional argument,
        ignores everything it is given and returns None.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in for a disabled item: do nothing at all.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator