comparison env/lib/python3.7/site-packages/bs4/testing.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # encoding: utf-8
2 """Helper classes for tests."""
3
4 # Use of this source code is governed by the MIT license.
5 __license__ = "MIT"
6
7 import pickle
8 import copy
9 import functools
10 import unittest
11 from unittest import TestCase
12 from bs4 import BeautifulSoup
13 from bs4.element import (
14 CharsetMetaAttributeValue,
15 Comment,
16 ContentMetaAttributeValue,
17 Doctype,
18 SoupStrainer,
19 Script,
20 Stylesheet,
21 Tag
22 )
23
from bs4.builder import HTMLParserTreeBuilder
# Builder used by SoupTest.default_builder unless a subclass overrides
# that property with a different tree builder.
default_builder = HTMLParserTreeBuilder
26
# A deliberately pathological HTML document: every line exercises a
# different kind of malformed markup (bad doctypes, unclosed tags and
# attributes, misplaced CDATA, duplicate attributes, bogus declarations,
# Unicode tag/attribute names, ...). test_worst_case feeds it to
# linkage_validator to stress-test sibling/element linkage.
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
62
63
class SoupTest(unittest.TestCase):
    """Base class for Beautiful Soup tests.

    Provides helpers for building soup with a known tree builder and
    for asserting structural invariants of the resulting parse tree.
    """

    @property
    def default_builder(self):
        # The module-level default; subclasses override this property
        # to test a different tree builder.
        return default_builder

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        # A 'builder' keyword argument overrides the class default.
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup, **kwargs):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Parse `to_parse` and assert it renders as `compare_parsed_to`.

        When `compare_parsed_to` is None, the markup is expected to
        round-trip unchanged.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            if earlier:
                # Each descendant must point back at its predecessor,
                # and vice versa.
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e

    def linkage_validator(self, el, _recursive_call=False):
        """Ensure proper linkage throughout the document.

        Recursively walks `el`, asserting the consistency of
        next/previous element and sibling links.  Recursive calls
        return the last descendant visited ("bubbled up") so the
        caller can continue validating element order across subtree
        boundaries; the outermost call returns None.
        """
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        idx = 0
        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for child in el.contents:
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert el.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                            el, el.next_element, child
                        )
                    assert child.previous_element is el,\
                        "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                            child, child.previous_element, el
                        )
                    assert child.previous_sibling is None,\
                        "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                            child, child.previous_sibling, None
                        )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                assert child.previous_sibling is el.contents[idx - 1],\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, el.contents[idx - 1]
                    )
                assert el.contents[idx - 1].next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
                    )

            if last_child is not None:
                assert child.previous_element is last_child,\
                    "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                        child, child.previous_element, last_child, child.parent.contents
                    )
                assert last_child.next_element is child,\
                    "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        last_child, last_child.next_element, child
                    )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # A bubbled up descendant should have no next siblings
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If last child, there are non next siblings
            if idx == last_idx:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )
            idx += 1

        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if not _recursive_call and child is not None:
            target = el
            while True:
                if target is None:
                    assert child.next_element is None, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, None
                        )
                    break
                elif target.next_sibling is not None:
                    assert child.next_element is target.next_sibling, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, target.next_sibling
                        )
                    break
                target = target.parent

            # We are done, so nothing to return
            return None
        else:
            # Return the child to the recursive caller
            return child
214
215
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    This is a mixin: it relies on SoupTest for self.soup(),
    self.assertSoupEquals(), etc.
    """

    def test_empty_element_tags(self):
        """Verify that all HTML4 and HTML5 empty element (aka void element) tags
        are handled correctly.
        """
        for name in [
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
            'spacer', 'frame'
        ]:
            soup = self.soup("")
            new_tag = soup.new_tag(name)
            self.assertEqual(True, new_tag.is_empty_element)

    def test_special_string_containers(self):
        # The contents of <style> and <script> tags get special
        # NavigableString subclasses.
        soup = self.soup(
            "<style>Some CSS</style><script>Some Javascript</script>"
        )
        assert isinstance(soup.style.string, Stylesheet)
        assert isinstance(soup.script.string, Script)

        soup = self.soup(
            "<style><!--Some CSS--></style>"
        )
        assert isinstance(soup.style.string, Stylesheet)
        # The contents of the style tag resemble an HTML comment, but
        # it's not treated as a comment.
        self.assertEqual("<!--Some CSS-->", soup.style.string)
        assert isinstance(soup.style.string, Stylesheet)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(
            soup.encode("utf8")[:len(doctype_str)],
            doctype_str
        )

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
        """Generate and parse a document with the given doctype."""
        doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype.encode("utf8"), soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_mixed_case_doctype(self):
        # A lowercase or mixed-case doctype becomes a Doctype.
        for doctype_fragment in ("doctype", "DocType"):
            doctype_str, soup = self._document_with_doctype(
                "html", doctype_fragment
            )

            # Make sure a Doctype object was created and that the DOCTYPE
            # is uppercase.
            doctype = soup.contents[0]
            self.assertEqual(doctype.__class__, Doctype)
            self.assertEqual(doctype, "html")
            self.assertEqual(
                soup.encode("utf8")[:len(doctype_str)],
                b"<!DOCTYPE html>"
            )

            # Make sure that the doctype was correctly associated with the
            # parse tree and that the rest of the document parsed.
            self.assertEqual(soup.p.contents[0], 'foo')

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Whitespace handling differs between builders, so compare
        # with newlines removed.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_namespaced_html(self):
        """When a namespaced XML document is parsed as HTML it should
        be treated as HTML with weird tag names.
        """
        markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
        soup = self.soup(markup)
        self.assertEqual(2, len(soup.find_all("ns1:foo")))

    def test_processing_instruction(self):
        # We test both Unicode and bytestring to verify that
        # process_markup correctly sets processing_instruction_class
        # even when the markup is already Unicode and there is no
        # need to process anything.
        markup = """<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.decode())

        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        # A <script> tag between </head> and <body> must not confuse
        # the builder into creating a second head.
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags,
        even if that would mean not prettifying the markup.
        """
        pre_markup = "<pre> </pre>"
        textarea_markup = "<textarea> woo\nwoo </textarea>"
        self.assertSoupEquals(pre_markup)
        self.assertSoupEquals(textarea_markup)

        soup = self.soup(pre_markup)
        self.assertEqual(soup.pre.prettify(), pre_markup)

        soup = self.soup(textarea_markup)
        self.assertEqual(soup.textarea.prettify(), textarea_markup)

        soup = self.soup("<textarea></textarea>")
        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        # BUG FIX: this previously re-asserted nested_b_tag, so the
        # doubly-nested markup was built but never actually tested.
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_multivalued_attribute_with_whitespace(self):
        # Whitespace separating the values of a multi-valued attribute
        # should be ignored.

        markup = '<div class=" foo bar "></a>'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.div['class'])

        # If you search by the literal name of the class it's like the whitespace
        # wasn't there.
        self.assertEqual(soup.div, soup.find('div', class_="foo bar"))

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes ot the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_strings_resembling_character_entity_references(self):
        # "&T" and "&p" look like incomplete character entities, but they are
        # not.
        self.assertSoupEquals(
            "<p>&bull; AT&T is in the s&p 500</p>",
            "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
        )

    def test_apos_entity(self):
        self.assertSoupEquals(
            "<p>Bob&apos;s Bar</p>",
            "<p>Bob's Bar</p>",
        )

    def test_entities_in_foreign_document_encoding(self):
        # &#147; and &#148; are invalid numeric entities referencing
        # Windows-1252 characters. &#45; references a character common
        # to Windows-1252 and Unicode, and &#9731; references a
        # character only found in Unicode.
        #
        # All of these entities should be converted to Unicode
        # characters.
        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
        soup = self.soup(markup)
        self.assertEqual("“Hello” -☃", soup.p.string)

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_empty_element_tags_rendered_consistently(self):
        """Verify consistent handling of empty-element tags,
        no matter how they come in through the markup.

        BUG FIX: this method was previously also named
        test_empty_element_tags, silently shadowing the void-element
        test defined earlier in this class so only one of the two
        ever ran. Renamed so both tests execute.
        """
        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
  <link></link>
  <body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertNotEqual(None, soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
 <body>
   <article id="a" >
   <div><a href="1"></div>
   <footer>
     <a href="2"></a>
   </footer>
   </article>
  </body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8. (The comments here previously
        # said ISO-8859-9, which is a Turkish encoding; the code has
        # always used iso8859-8.)
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        # Some tree builders call it iso8859-8, others call it iso-8859-8.
        # That's not a difference we really care about.
        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

        soup = self.soup(BAD_DOCUMENT)
        self.linkage_validator(soup)
834
835
class XMLTreeBuilderSmokeTest(object):
    """Generic smoke tests that any XML tree builder should pass.

    Mix this into a test case that provides a ``soup`` helper; every
    test exercises behavior expected of any XML builder.
    """

    def test_pickle_and_unpickle_identity(self):
        # A pickle round trip produces a tree identical to the original.
        tree = self.soup("<a><b>foo</a>")
        restored = pickle.loads(pickle.dumps(tree, 2))
        self.assertEqual(BeautifulSoup, restored.__class__)
        self.assertEqual(tree.decode(), restored.decode())

    def test_docstring_generated(self):
        # An XML declaration is prepended on output.
        self.assertEqual(
            self.soup("<root/>").encode(),
            b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        self.assertEqual(self.soup(markup).encode("utf8"), markup)

    def test_processing_instruction(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
        self.assertEqual(self.soup(markup).encode("utf8"), markup)

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        self.assertEqual(self.soup(markup).encode("utf-8"), markup)

    def test_nested_namespaces(self):
        doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
        self.assertEqual(self.soup(doc).encode(), doc)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in soup.encode())

    def test_can_parse_unicode_document(self):
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        self.assertEqual('Sacr\xe9 bleu!', self.soup(markup).root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        self.assertEqual(str(self.soup(markup).rss), markup)

    def test_docstring_includes_correct_encoding(self):
        # Encoding to a non-default charset updates the declaration.
        self.assertEqual(
            self.soup("<root/>").encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        payload = b'0' * (2**12)
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + payload
                  + b'</root>')
        self.assertEqual(self.soup(markup).encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root = self.soup(markup).root
        self.assertEqual(root['xmlns:a'], "http://example.com/")
        self.assertEqual(root['xmlns:b'], "http://example.net/")

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        self.assertEqual(markup, str(self.soup(markup).p))

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        self.assertEqual(markup, str(self.soup(markup).foo))

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        self.assertEqual(markup, str(self.soup(markup).foo))

    def test_find_by_prefixed_name(self):
        doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
          xmlns:ns1="http://example.com/ns1"
          xmlns:ns2="http://example.com/ns2"
          <ns1:tag>foo</ns1:tag>
          <ns1:tag>bar</ns1:tag>
          <ns2:tag key="value">baz</ns2:tag>
</Document>
"""
        soup = self.soup(doc)

        # Three <tag> tags in all...
        self.assertEqual(3, len(soup.find_all('tag')))

        # ...two of them ns1:tag and one of them ns2:tag.
        self.assertEqual(2, len(soup.find_all('ns1:tag')))
        self.assertEqual(1, len(soup.find_all('ns2:tag')))

        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))

    def test_copy_tag_preserves_namespace(self):
        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""
        original = self.soup(xml).document
        duplicate = copy.copy(original)

        # Both tags carry the same namespace prefix.
        self.assertEqual(original.prefix, duplicate.prefix)

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""
        self.linkage_validator(self.soup(BAD_DOCUMENT))
988
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        soup = self.soup("<a>")
        self.assertEqual(soup.a.namespace, "http://www.w3.org/1999/xhtml")

    def test_svg_tags_have_namespace(self):
        soup = self.soup('<svg><circle/></svg>')
        svg_namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(soup.svg.namespace, svg_namespace)
        self.assertEqual(soup.circle.namespace, svg_namespace)

    def test_mathml_tags_have_namespace(self):
        soup = self.soup('<math><msqrt>5</msqrt></math>')
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(soup.math.namespace, mathml_namespace)
        self.assertEqual(soup.msqrt.namespace, mathml_namespace)

    def test_xml_declaration_becomes_comment(self):
        # HTML5 has no XML declaration, so one found in the markup is
        # treated as a comment.
        soup = self.soup('<?xml version="1.0" encoding="utf-8"?><html></html>')
        first = soup.contents[0]
        self.assertTrue(isinstance(first, Comment))
        self.assertEqual(first, '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual(first.next_element.name, "html")
1023
def skipIf(condition, reason):
    """Poor man's ``unittest.skipIf``: when ``condition`` is true, the
    decorated test is replaced by a stub that does nothing and returns
    None; otherwise the test is returned unchanged.

    ``reason`` is accepted for API compatibility but is not reported.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in invoked instead of the skipped test.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator