comparison planemo/lib/python3.7/site-packages/bs4/element.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 try:
5 from collections.abc import Callable # Python 3.6
6 except ImportError as e:
7 from collections import Callable
8 import re
9 import sys
10 import warnings
11 try:
12 import soupsieve
13 except ImportError as e:
14 soupsieve = None
15 warnings.warn(
16 'The soupsieve package is not installed. CSS selectors cannot be used.'
17 )
18
19 from bs4.formatter import (
20 Formatter,
21 HTMLFormatter,
22 XMLFormatter,
23 )
24
# Encoding name used as the module-wide default; NavigableString.__new__
# passes it to str.__new__ when it is given a non-str value.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the major version of the running interpreter is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches one run of consecutive non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+")
33
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: Name of the attribute the alias should read and write.
    :return: A property that forwards both reads and writes to `attr`
        on the instance it is accessed through.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # Python invokes a property setter as setter(instance, value),
        # so the value has to be accepted here and handed to setattr.
        setattr(self, attr, value)
    return alias
44
45
# Codec names that Python understands (so PageElement.encode could in
# theory handle them) but that have no meaning to XML or HTML, and so
# must never be advertised as a document's encoding.
#
# When an XML document is encoded with one of these, the XML
# declaration omits the encoding. When an HTML document is encoded
# with one of these and has a <meta> tag naming an encoding, that
# encoding is written out as the empty string.
#
# Source:
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw-unicode-escape",
    "raw_unicode_escape",
    "string-escape",
    "string_escape",
    "undefined",
    "unicode-escape",
    "unicode_escape",
}
73
74
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces it was
    built from: the prefix ('xml'), the local name ('lang') and the
    namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the attribute name from its parts.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. An empty or missing name denotes
            the default namespace, whose name "has no value" per
            https://www.w3.org/TR/xml-names/#defaulting
        :param namespace: The namespace, stored as given.
        """
        # Normalize every falsy name (None, '') to None.
        name = name or None

        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
97
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to `str` itself; the subclasses in this
    module override `encode` to return text derived from the requested
    encoding name.
    """
100
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        """Create the value and remember the text it was created from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset to advertise when the document is encoded.

        :param encoding: Name of the encoding the document is being
            converted to.
        :return: `encoding` itself, or the empty string when `encoding`
            is one of PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120
121
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content' attribute.
    """

    # Group 1 captures everything up to and including 'charset=';
    # group 3 captures the charset name that follows.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain str if no charset is mentioned.

        :param original_value: The text of the 'content' attribute.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original text with its charset replaced.

        :param encoding: Name of the encoding the document is being
            converted to.
        :return: The empty string when `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value with
            the charset name swapped for `encoding`.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
149
150
151 class PageElement(object):
152 """Contains the navigational information for some part of the page:
153 that is, its current location in the parse tree.
154
155 NavigableString, Tag, etc. are all subclasses of PageElement.
156 """
157
158 def setup(self, parent=None, previous_element=None, next_element=None,
159 previous_sibling=None, next_sibling=None):
160 """Sets up the initial relations between this element and
161 other elements.
162
163 :param parent: The parent of this element.
164
165 :param previous_element: The element parsed immediately before
166 this one.
167
168 :param next_element: The element parsed immediately before
169 this one.
170
171 :param previous_sibling: The most recently encountered element
172 on the same level of the parse tree as this one.
173
174 :param previous_sibling: The next element to be encountered
175 on the same level of the parse tree as this one.
176 """
177 self.parent = parent
178
179 self.previous_element = previous_element
180 if previous_element is not None:
181 self.previous_element.next_element = self
182
183 self.next_element = next_element
184 if self.next_element is not None:
185 self.next_element.previous_element = self
186
187 self.next_sibling = next_sibling
188 if self.next_sibling is not None:
189 self.next_sibling.previous_sibling = self
190
191 if (previous_sibling is None
192 and self.parent is not None and self.parent.contents):
193 previous_sibling = self.parent.contents[-1]
194
195 self.previous_sibling = previous_sibling
196 if previous_sibling is not None:
197 self.previous_sibling.next_sibling = self
198
def format_string(self, s, formatter):
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, a string naming one of the
        standard formatters, or None to return `s` unchanged.
    :return: The result of the formatter's `substitute` call, or `s`
        itself when no formatter is given.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        # Anything else is resolved through formatter_for_name.
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
211
def formatter_for_name(self, formatter):
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for an
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    :return: A Formatter object.
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees get an XMLFormatter, everything else an HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
231
@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    formatter_for_name consults this to choose between XMLFormatter
    and HTMLFormatter. It may walk up the chain of parents, so it can
    be inefficient, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document
        # was parsed.
        return known

    # Otherwise the element was probably created by calling the
    # constructor directly from user code; ask the parent.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # This is the top-level object. It should have .known_xml set
    # from tree creation. If not, take a guess--BS is usually
    # used on HTML markup.
    return getattr(self, 'is_xml', False)
244
245 # Otherwise, it's likely that this element was created by
246 # direct invocation of the constructor from within the user's
247 # Python code.
248 if self.parent is None:
249 # This is the top-level object. It should have .known_xml set
250 # from tree creation. If not, take a guess--BS is usually
251 # used on HTML markup.
252 return getattr(self, 'is_xml', False)
253 return self.parent._is_xml
254
# camelCase names from Beautiful Soup 3, kept as properties that
# forward to the snake_case attributes.
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
257
def replace_with(self, replace_with):
    """Replace this PageElement with another one, keeping the rest of the
    tree the same.

    :param replace_with: A PageElement.
    :return: `self`, no longer part of the tree. Nothing is returned
        (and nothing changes) when `replace_with` is `self`.
    :raise ValueError: If this element has no parent, or if
        `replace_with` is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is self:
        return
    if replace_with is parent:
        raise ValueError("Cannot replace a Tag with its parent.")
    # Remember the slot, vacate it, then fill it with the replacement.
    slot = parent.index(self)
    self.extract(_self_index=slot)
    parent.insert(slot, replace_with)
    return self
replaceWith = replace_with  # BS3
279
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals are concatenated, so the first one needs
        # its trailing space to keep the words apart.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Iterate over a copy, because inserting a child elsewhere removes
    # it from self.contents. Inserting in reverse at a fixed index
    # leaves the children in their original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap # BS3
297
def wrap(self, wrap_inside):
    """Wrap this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that used
        to be occupied by `self`, and with `self` inside it.
    """
    # replace_with hands back the element it removed from the tree,
    # which is then re-attached as a child of the wrapper.
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
308
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    The element is removed from its parent's .contents, the
    document-order chain (.previous_element / .next_element) is closed
    around this element and its descendants, and the sibling chain is
    closed around this element.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: `self`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            # The caller did not supply the index, so look it up.
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    # Each identity check skips the relink when the two neighbours
    # are the same object.
    if (self.previous_element is not None and
        self.previous_element is not next_element):
        self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Cut the outward-pointing ends of the removed subtree.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other, then clear our own links.
    if (self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling):
        self.previous_sibling.next_sibling = self.next_sibling
    if (self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
346
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Finds the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet? When false, `next_sibling` is not consulted.
    :param accept_self: Is `self` an acceptable answer to the question?
    :return: The last descendant, or None when the answer would be
        `self` and `accept_self` is false.
    """
    # Only touch next_sibling once setup has run.
    following = self.next_sibling if is_initialized else None
    if following is not None:
        # Whatever precedes our next sibling in document order is the
        # last thing inside this element.
        deepest = following.previous_element
    else:
        # No shortcut available: descend through last children.
        deepest = self
        while isinstance(deepest, Tag) and deepest.contents:
            deepest = deepest.contents[-1]
    if deepest is self and not accept_self:
        return None
    return deepest
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
365
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.contents` by the new PageElement. Values past the end
       are clamped to the end.
    :param new_child: A PageElement. A plain string is converted to a
       NavigableString; a BeautifulSoup object has its children
       inserted one at a time instead of being inserted itself.
    :raise ValueError: If `new_child` is None or is `self`.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    # Imported inside the method rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the children one at a time.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        # Detach the element from wherever it currently lives.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element that
        # precedes it in document order is this element itself.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        # Inserting at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up through the ancestors until one of them has a next
        # sibling.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Point the following element back at the end of the new subtree.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
447
def append(self, tag):
    """Appends the given PageElement to the contents of this one.

    :param tag: A PageElement.
    """
    # Appending is inserting at the position just past the last child.
    end = len(self.contents)
    self.insert(end, tag)
454
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: A list of PageElements.
    """
    if isinstance(tags, list):
        # Appending an element that already lives in a tree removes it
        # from its old parent's .contents. If `tags` is that very list,
        # iterating it while it shrinks would skip elements, so work
        # from a snapshot instead.
        tags = list(tags)
    for tag in tags:
        self.append(tag)
462
def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)
474 "Element has no parent, so 'before' has no meaning.")
475 if any(x is self for x in args):
476 raise ValueError("Can't insert an element before itself.")
477 for predecessor in args:
478 # Extract first so that the index won't be screwed up if they
479 # are siblings.
480 if isinstance(predecessor, PageElement):
481 predecessor.extract()
482 index = parent.index(self)
483 parent.insert(index, predecessor)
484
485 def insert_after(self, *args):
486 """Makes the given element(s) the immediate successor of this one.
487
488 The elements will have the same parent, and the given elements
489 will be immediately after this one.
490
491 :param args: One or more PageElements.
492 """
493 # Do all error checking before modifying the tree.
494 parent = self.parent
495 if parent is None:
496 raise ValueError(
497 "Element has no parent, so 'after' has no meaning.")
498 if any(x is self for x in args):
499 raise ValueError("Can't insert an element after itself.")
500
501 offset = 0
502 for successor in args:
503 # Extract first so that the index won't be screwed up if they
504 # are siblings.
505 if isinstance(successor, PageElement):
506 successor.extract()
507 index = parent.index(self)
508 parent.insert(index+1+offset, successor)
509 offset += 1
510
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first match that appears later in the document than
    this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The plural search does the work; _find_one caps it at one result.
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
527
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every match that appears later in the document than this
    PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
546
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest matching sibling that appears later in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
564
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every matching sibling that appears later in the document
    than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
585
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Look backwards in the document from this PageElement and return
    the first match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
603
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Look backwards in the document from this PageElement and return
    every match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
624
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest matching sibling that appears earlier in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
642
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every matching sibling that appears earlier in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
663
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: We can't use _find_one because findParents takes a different
    # set of arguments.
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
findParent = find_parent  # BS3
686
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return every parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: Whatever `_find_all` returns for the `parents` generator.
    """
    ancestors = self.parents
    # Parents are never matched on text, so the text filter is None.
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
findParents = find_parents  # BS3
fetchParents = find_parents  # BS2
705
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for `next_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element
714
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for `previous_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
723
724 #These methods do the real heavy lifting.
725
def _find_one(self, method, name, attrs, text, **kwargs):
    """Run a find-all style `method` with a limit of 1 and unwrap the
    result.

    :param method: A callable taking (name, attrs, text, limit,
        **kwargs) and returning a sequence of matches.
    :return: The first match, or None if there were no matches.
    """
    matches = method(name, attrs, text, 1, **kwargs)
    return matches[0] if matches else None
732
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or a ready-made SoupStrainer.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
        If None, a `string` keyword argument is used in its place.
    :param limit: Stop looking after finding this many results.
    :param generator: The iterable of candidate PageElements.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of the matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a namespace-aware document,
                # we need to match the local name against tag.name. If not,
                # we need to match the fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The isinstance check must guard BOTH name comparisons.
            # `and` binds tighter than `or`, so the alternatives are
            # grouped explicitly; otherwise a non-Tag element could
            # reach the second comparison.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    # General case: run every truthy candidate through the strainer.
    results = ResultSet(strainer)
    for candidate in generator:
        if candidate:
            found = strainer.search(candidate)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
784
# The generators below work from any starting point, whether it is a
# NavigableString or a Tag.
@property
def next_elements(self):
    """All PageElements that were parsed after this one.

    The chain is followed lazily, one `next_element` link per item.

    :yield: A sequence of PageElements.
    """
    node = self.next_element
    while node is not None:
        yield node
        node = node.next_element
797
@property
def next_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    later.

    The chain is followed lazily, one `next_sibling` link per item.

    :yield: A sequence of PageElements.
    """
    node = self.next_sibling
    while node is not None:
        yield node
        node = node.next_sibling
809
@property
def previous_elements(self):
    """All PageElements that were parsed before this one.

    The chain is followed lazily, one `previous_element` link per item,
    so the nearest element comes first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_element
    while node is not None:
        yield node
        node = node.previous_element
820
@property
def previous_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    earlier.

    The chain is followed lazily, one `previous_sibling` link per item,
    so the nearest sibling comes first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_sibling
    while node is not None:
        yield node
        node = node.previous_sibling
832
@property
def parents(self):
    """All PageElements that are parents of this PageElement.

    The chain is followed lazily, one `parent` link per item, so the
    immediate parent comes first.

    :yield: A sequence of PageElements.
    """
    ancestor = self.parent
    while ancestor is not None:
        yield ancestor
        ancestor = ancestor.parent
843
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    if flag:
        return flag
    return False
851
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
    """Method form of the `next_elements` property."""
    return self.next_elements
856
def nextSiblingGenerator(self):
    """Method form of the `next_siblings` property."""
    return self.next_siblings
859
def previousGenerator(self):
    """Method form of the `previous_elements` property."""
    return self.previous_elements
862
def previousSiblingGenerator(self):
    """Method form of the `previous_siblings` property."""
    return self.previous_siblings
865
def parentGenerator(self):
    """Method form of the `parents` property."""
    return self.parents
868
869
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    Parsing the markup <b>penguin</b> produces a NavigableString for
    the string "penguin".
    """

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            args = (value,)
        else:
            args = (value, DEFAULT_OUTPUT_ENCODING)
        instance = str.__new__(cls, *args)
        # Start out with no links to any other element.
        instance.setup()
        return instance

    def __copy__(self):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Return the constructor arguments: the text as a plain str."""
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of the standard formatters.
        :return: The formatted string between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
942
943
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    # Subclasses override these with their opening and closing markup.
    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the text itself is
            # emitted unformatted below.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
970
class CData(PreformattedString):
    """A CDATA block, delimited by '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
975
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, delimited by '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
981
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, delimited by '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
986
class Comment(PreformattedString):
    """An HTML or XML comment, delimited by '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
991
992
class Declaration(PreformattedString):
    """An XML declaration, delimited by '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
997
998
class Doctype(PreformattedString):
    """A document type declaration, delimited by '<!DOCTYPE ' and '>\\n'."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN', or None.
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
            or None.

        :return: An instance of the class the method was called on
            (a Doctype, or a Doctype subclass).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # With a public ID the system ID follows it directly.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            # Without a public ID the system ID needs the SYSTEM keyword.
            value += ' SYSTEM "%s"' % system_id

        # Construct through `cls` rather than naming Doctype, so that
        # calling this on a subclass yields an instance of that subclass.
        return cls(value)
1026
1027
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Having a distinct class makes it possible to tell embedded
    stylesheets apart from textual content.
    """
1035
1036
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Having a distinct class makes it possible to tell executable code
    apart from textual content.
    """
1044
1045
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Having a distinct class makes it possible to tell such strings apart
    from the main body of the document.
    """
1053
1054
1055 class Tag(PageElement):
1056 """Represents an HTML or XML tag that is part of a parse tree, along
1057 with its attributes and contents.
1058
1059 When Beautiful Soup parses the markup <b>penguin</b>, it will
1060 create a Tag object representing the <b> tag.
1061 """
1062
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
             ):
    """Basic constructor.

    :param parser: A BeautifulSoup object.
    :param builder: A TreeBuilder.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when no builder is given.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag. Only used
        when no builder is given.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when no builder
        is given.
    :raise ValueError: If `name` is None.
    """
    # Only the parser's class is remembered, not the parser object:
    # that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Source position is recorded only if at least one coordinate was
    # supplied and the builder (when there is one) stores line numbers.
    position_known = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and position_known:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Work on our own dictionary rather than the caller's object.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # The builder's whole data structure is stored instead of asking
        # the question for every tag: asking would mean building a new
        # structure each time, and (unlike can_be_empty_element) the
        # answer is almost never needed.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # Without a TreeBuilder, take the values passed in. They're
        # probably None, unless this is a copy of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
1158
# BS3 compatibility: `parserClass` is kept as an alias of `parser_class`.
parserClass = _alias("parser_class")
1160
def __copy__(self):
    """Create a duplicate of this Tag that is not part of the parse tree.

    The duplicate is a new object of the same class, built without a
    parent, carrying this tag's name, namespace, prefix, attributes and
    output settings. Every child in .contents is copied and appended to
    the duplicate.

    :return: The new Tag.
    """
    # A Tag never stores its builder, so reading `self.builder` would
    # fall through to __getattr__ and run find("builder"). That is
    # normally None, but a child element named <builder> would be handed
    # to the constructor as if it were a TreeBuilder. Pass None
    # explicitly so the settings recorded on this tag are the ones used.
    clone = type(self)(
        None, None, self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=self.sourceline, sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # These two flags are copied verbatim from the original.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
1178
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    .can_be_empty_element holds, and that depends on the builder used
    to create the tag. If the builder has a designated list of
    empty-element tags, only a tag whose name shows up in that list is
    considered an empty-element tag. If the builder has no such list,
    any tag with no contents is an empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
1196
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has exactly one child and that child is a
        NavigableString, the child itself. If the only child is
        something else, that child's own .string (recursively). If
        there are no children or more than one, None.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    All existing children are removed with clear(); then a new object
    made by calling `string`'s own class on it is appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
1223
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, every string is stripped before being
        yielded, and strings that become empty are skipped.

    :types: A tuple of NavigableString subclasses. A descendant is
        yielded only if its exact class appears in this tuple, so by
        default only NavigableString and CData objects are considered:
        no comments, processing instructions, etc. If None, every
        NavigableString instance is considered.

    :yield: A sequence of strings.
    """
    for element in self.descendants:
        if types is None:
            wanted = isinstance(element, NavigableString)
        else:
            # Exact class match: subclasses of a listed type do not count.
            wanted = type(element) in types
        if not wanted:
            continue
        if strip:
            element = element.strip()
            if len(element) == 0:
                continue
        yield element

strings = property(_all_strings)
1251
@property
def stripped_strings(self):
    """Yield every string from _all_strings() with stripping enabled.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
1260
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)
getText = get_text
text = property(get_text)
1282
def decompose(self):
    """Recursively destroys this PageElement and its children.

    The element is first removed from the tree with extract(). Then
    each element reached by following .next_element from this one has
    its instance dictionary emptied, is given an empty .contents list,
    and is flagged with _decomposed = True.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Read the successor first: clearing __dict__ wipes the link.
        successor = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = successor
1302
def clear(self, decompose=False):
    """Wipe out all children of this PageElement.

    :param decompose: If this is True, children that are Tags are
        removed with decompose() (a more destructive method); all other
        children, and every child when this is False, are removed with
        extract().
    """
    # Walk a snapshot, because removing a child changes self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
1319
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    Child Tags are smoothed recursively. Two adjacent children are
    merged into one new NavigableString when both are NavigableStrings
    and neither is a PreformattedString.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def mergeable(node):
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record where each pair to be merged starts, instead of copying
    # self.contents: in most cases very few strings are affected.
    merge_starts = []
    last_position = None
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        last_position = len(self.contents) - 1
        if position == last_position:
            # Nothing follows the final child, so it starts no pair.
            continue
        following = self.contents[position + 1]
        if mergeable(current) and mergeable(following):
            merge_starts.append(position)

    # Work from the back so that removing items from .contents does
    # not shift the positions still waiting to be processed.
    for position in reversed(merge_starts):
        left = self.contents[position]
        right = self.contents[position + 1]
        right.extract()
        merged = NavigableString(left + right)
        left.replace_with(merged)
1357
def index(self, element):
    """Find the index of a child by identity, not value.

    Unlike tag.contents.index(element), this cannot return the position
    of a different child that merely compares equal.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None)
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
1370
def get(self, key, default=None):
    """Look up one attribute of the tag.

    :param key: The attribute name.
    :param default: Returned when the tag has no such attribute.
    :return: The value stored in .attrs for `key`, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
1376
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value from get() if it is already a list; otherwise a
        one-element list holding that value.
    """
    value = self.get(key, default)
    return value if isinstance(value, list) else [value]
1390
def has_attr(self, key):
    """Report whether `key` is one of this PageElement's attribute names."""
    present = key in self.attrs
    return present
1394
def __hash__(self):
    """Hash a Tag by the hash of its string rendering."""
    rendered = str(self)
    return rendered.__hash__()
1397
def __getitem__(self, key):
    """Return the value of the 'key' attribute for the Tag.

    tag[key] is a lookup in .attrs, so it throws an exception if the
    attribute is not there.
    """
    attributes = self.attrs
    return attributes[key]
1402
def __iter__(self):
    """Iterate over a Tag by iterating over its .contents list."""
    children = self.contents
    return iter(children)
1406
def __len__(self):
    """Return the number of items in this Tag's .contents list."""
    children = self.contents
    return len(children)
1410
def __contains__(self, x):
    """Report whether `x` is found in this Tag's .contents list."""
    children = self.contents
    return x in children
1413
def __bool__(self):
    """Always True: a tag is non-None even if it has no contents.

    Defining this keeps truth-testing from using __len__, which would
    otherwise make a tag with no contents evaluate as false.
    """
    return True
1417
def __setitem__(self, key, value):
    """Store `value` as the 'key' attribute of the tag (tag[key] = value)."""
    attributes = self.attrs
    attributes[key] = value
1422
def __delitem__(self, key):
    """Remove the 'key' attribute from the tag (del tag[key]).

    Deleting an attribute the tag does not have is silently ignored.
    """
    attributes = self.attrs
    attributes.pop(key, None)
1426
def __call__(self, *args, **kwargs):
    """Forward a call on the Tag to its find_all() method.

    Eg. tag('a') returns whatever tag.find_all('a') returns: all the A
    tags found within this tag.
    """
    search = self.find_all
    return search(*args, **kwargs)
1432
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    :param tag: The attribute name that normal lookup did not find.
    :return: The result of find() for that name, or for the name with
        its trailing 'Tag' removed in the deprecated BS3 spelling.
    :raise AttributeError: If the name starts with '__' or is
        'contents'.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % {
                'name': tag_name
            }
        )
        return self.find(tag_name)
    # "contents" is special-cased to avoid recursion.
    if not tag.startswith("__") and tag != "contents":
        return self.find(tag)
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (self.__class__, tag))
1450
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`.

    An object lacking a .name, .attrs or .contents attribute is never
    equal to a Tag.
    """
    if self is other:
        return True
    looks_like_tag = all(
        hasattr(other, attribute)
        for attribute in ('name', 'attrs', 'contents'))
    if not looks_like_tag:
        return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Children are compared pairwise, by position.
    mismatch = any(
        my_child != other.contents[position]
        for position, my_child in enumerate(self.contents))
    return not mismatch
1467
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = (self == other)
    return not same
1472
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)
    # "The return value must be a string object", i.e. Unicode
    return self.decode()
1488
def __unicode__(self):
    """Renders this PageElement as a Unicode string, using decode()."""
    rendered = self.decode()
    return rendered
1492
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, the result of encode() with its default
        arguments (a bytestring); under Python 3, the result of
        decode() (a Unicode string).
    """
    return self.decode() if PY3K else self.encode()

# On Python 3 all three renderings are the same Unicode rendering.
if PY3K:
    __str__ = __repr__ = __unicode__
1506
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Render to Unicode first (telling decode() the destination
    # encoding), then encode that text.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
1530
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string: the opening tag with its attributes, the
        rendered contents, and the closing tag (if any). For a hidden
        element only the rendered contents are returned.
    """

    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Build one 'key="value"' fragment per attribute, in the order the
    # formatter supplies them.
    attributes = formatter.attributes(self)
    attrs = []
    for key, val in attributes:
        if val is None:
            # An attribute with no value is written as the bare name.
            decoded = key
        else:
            if isinstance(val, list) or isinstance(val, tuple):
                # Multi-valued attributes are joined with spaces.
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None
            ):
                # Let the value substitute in the eventual encoding.
                val = val.encode(eventual_encoding)

            text = formatter.attribute_value(val)
            decoded = (
                str(key) + '='
                + formatter.quoted_attribute_value(text))
        attrs.append(decoded)

    # `close` goes inside the opening tag of an empty-element tag;
    # `closeTag` is the separate closing tag of any other tag.
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        # Children are rendered one level deeper.
        indent_contents = indent_level + 1
    else:
        # Passing None turns pretty-printing off for the contents.
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        # Collect the pieces in a list and join them once at the end.
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        # Make sure the contents end with exactly one line break
        # before the closing tag is written.
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s
1631
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The current indentation level, or None when
        no pretty-printing was requested.
    :return: False if `indent_level` is None or this tag's name is
        listed in .preserve_whitespace_tags; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
1645
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The result of decode() (if encoding==None) or of encode()
        (otherwise), each called with True as the indent level.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1660
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The renderings of all children, concatenated.
    """
    # Resolve a formatter name into a Formatter object once, so the
    # lookup does not happen again for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    pieces = []
    for child in self:
        text = None
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # A child tag renders itself, including its indentation.
            pieces.append(
                child.decode(indent_level, eventual_encoding, formatter))
        keep_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not keep_whitespace:
            text = text.strip()
        if not text:
            continue
        # Strings get their own indented line only when pretty-printing
        # a tag whose whitespace is not being preserved.
        own_line = pretty_print and not keep_whitespace
        if own_line:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if own_line:
            pieces.append("\n")
    return ''.join(pieces)
1706
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
1725
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Calls encode_contents(); `indentLevel` is passed along only when
    `prettyPrint` is true, and None is passed otherwise.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1734
1735 #Soup methods
1736
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter passed through to find_all() unchanged.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first result of find_all() called with a limit of 1,
        or None if there were no results.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find #BS2
1761
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will search
        everything in .descendants. Otherwise, only the direct
        children (.children) will be considered.
    :param text: A filter passed through to _find_all() unchanged.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.descendants
    if not recursive:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1786
1787 #Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: An iterator over the .contents list.
    """
    # An iterator (rather than the list itself) makes the purpose of
    # the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
1796
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Iteration starts at the first child and follows .next_element
    links, stopping once the element after the last descendant is
    reached. Nothing is yielded when there are no children.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just past the last descendant marks the end.
    boundary = self._last_descendant().next_element
    node = self.contents[0]
    while node is not boundary:
        yield node
        node = node.next_element
1811
1812 # CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first match.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: The first result of select() called with a limit of 1,
        or None if there were no results.
    :rtype: bs4.element.Tag
    """
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
1833
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    if namespaces is None:
        namespaces = self._namespaces

    if limit is None:
        limit = 0

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    # Wrapping the matches in a ResultSet is more consistent, and
    # ResultSet.__getattr__ has a helpful error message.
    return ResultSet(
        None,
        soupsieve.select(selector, self, namespaces, limit, **kwargs))
1869
1870 # Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the value of .children."""
    direct_children = self.children
    return direct_children
1874
def recursiveChildGenerator(self):
    """Deprecated generator; returns the value of .descendants."""
    everything_below = self.descendants
    return everything_below
1878
def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __contains__ (contents).

    has_key() is gone in Python 3, anyway.

    A deprecation warning is issued on every call.

    :param key: The attribute name to look for.
    :return: The result of has_attr(key).
    """
    # The one-element tuple matters: with a bare `% key`, a key that is
    # itself a tuple would be unpacked as several format arguments and
    # raise TypeError before has_attr() was ever consulted.
    warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
        key,))
    return self.has_attr(key)
1888
1889 # Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    # NOTE: the `{}` defaults on `attrs` and `markup_attrs` below are kept
    # for interface compatibility. They are safe because neither default
    # dictionary is ever mutated by this class.
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            special key `class_` is treated as a filter on 'class'.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dictionary is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form _matches() can work with.

        :param value: A name, attribute or text filter.
        :return: `value` itself if it is a string, a callable, a
            regular expression (anything with a `match` attribute), a
            boolean or None; a decoded string for bytes; a list of
            normalized items for other iterables; `str(value)`
            otherwise.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter converted to a string if one is set,
            otherwise "<name filter>|<attribute filters>".
        """
        if self.text:
            # The text filter may be a regular expression, a list, True
            # or a callable; __str__ must always return a real string.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (key, value) pairs.

        :return: The Tag (or the tag name, when no Tag was passed in) if
            the prospective tag would match this SoupStrainer; None
            otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        # A callable name filter is given the raw (name, attrs) pair when
        # we have not been handed an actual Tag object.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                # Built lazily, and only once, so that nothing is done
                # when there are no attribute filters to check.
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {k: v for k, v in markup_attrs}
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched. For
            a list, the first matching NavigableString in it.
        :raise Exception: If `markup` is of a type this method does not
            know how to match against.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether a piece of markup satisfies a single filter.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (for a multi-valued attribute such as 'class').
        :param match_against: The filter: True, a callable, a string, a
            regular expression, or an iterable of such filters.
        :param already_tried: Set of filter items already examined,
            used to avoid infinite recursion on nested iterables.
        :return: A truthy value if the markup matches, a falsy value
            otherwise. Callable and regular expression filters hand
            back whatever the callable or `search()` returned.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match. NOTE: this returns the match object (or None)
            # directly, so a regular expression is never retried against
            # the prefixed tag name below.
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
2144
2145
class ResultSet(list):
    """A plain list of results that additionally remembers, in
    `source`, the SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Always raise AttributeError, with a hint about the usual cause.

        Python only calls this for attributes that normal lookup could
        not find, so reaching it means the caller asked for something a
        list does not have.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)