comparison env/lib/python3.7/site-packages/bs4/element.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
comparison
equal deleted inserted replaced
4:79f47841a781 5:9b1c78e6ba9c
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 try:
5 from collections.abc import Callable # Python 3.6
6 except ImportError as e:
7 from collections import Callable
8 import re
9 import sys
10 import warnings
11 try:
12 import soupsieve
13 except ImportError as e:
14 soupsieve = None
15 warnings.warn(
16 'The soupsieve package is not installed. CSS selectors cannot be used.'
17 )
18
19 from bs4.formatter import (
20 Formatter,
21 HTMLFormatter,
22 XMLFormatter,
23 )
24
25 DEFAULT_OUTPUT_ENCODING = "utf-8"
26 PY3K = (sys.version_info[0] > 2)
27
28 nonwhitespace_re = re.compile(r"\S+")
29
30 # NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
31 # the off chance someone imported it for their own use.
32 whitespace_re = re.compile(r"\s+")
33
34 def _alias(attr):
35 """Alias one attribute name to another for backward compatibility"""
36 @property
37 def alias(self):
38 return getattr(self, attr)
39
40 @alias.setter
41 def alias(self):
42 return setattr(self, attr)
43 return alias
44
45
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces it was
    built from: the prefix ('xml'), the local name ('lang') and,
    optionally, the namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach the components to it.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: Stored on the object as `.namespace`.
        """
        # A missing name means the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name if name else None

        if local_name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
68
class AttributeValueWithCharsetSubstitution(str):
    """Base class for string values that stand in for a character
    encoding specified in HTML.
    """
71
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' gives the 'charset'
    attribute a value of this type.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value found in the markup."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return `encoding` itself.

        When a document is encoded to a given encoding, the value of a
        meta tag's 'charset' is simply the name of that encoding.
        """
        return encoding
89
90
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    gives the 'content' attribute a value of this type.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Return an instance of this class only if the value names a
        charset; otherwise return a plain `str`.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with the charset portion replaced
        by `encoding`.
        """
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
116
117
118 class PageElement(object):
119 """Contains the navigational information for some part of the page:
120 that is, its current location in the parse tree.
121
122 NavigableString, Tag, etc. are all subclasses of PageElement.
123 """
124
125 def setup(self, parent=None, previous_element=None, next_element=None,
126 previous_sibling=None, next_sibling=None):
127 """Sets up the initial relations between this element and
128 other elements.
129
130 :param parent: The parent of this element.
131
132 :param previous_element: The element parsed immediately before
133 this one.
134
135 :param next_element: The element parsed immediately before
136 this one.
137
138 :param previous_sibling: The most recently encountered element
139 on the same level of the parse tree as this one.
140
141 :param previous_sibling: The next element to be encountered
142 on the same level of the parse tree as this one.
143 """
144 self.parent = parent
145
146 self.previous_element = previous_element
147 if previous_element is not None:
148 self.previous_element.next_element = self
149
150 self.next_element = next_element
151 if self.next_element is not None:
152 self.next_element.previous_element = self
153
154 self.next_sibling = next_sibling
155 if self.next_sibling is not None:
156 self.next_sibling.previous_sibling = self
157
158 if (previous_sibling is None
159 and self.parent is not None and self.parent.contents):
160 previous_sibling = self.parent.contents[-1]
161
162 self.previous_sibling = previous_sibling
163 if previous_sibling is not None:
164 self.previous_sibling.next_sibling = self
165
def format_string(self, s, formatter):
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a value that
        `formatter_for_name` can resolve to one. None means the string
        is returned untouched.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        resolved = formatter
    else:
        resolved = self.formatter_for_name(formatter)
    return resolved.substitute(s)
178
def formatter_for_name(self, formatter):
    """Resolve a formatter identifier to a Formatter object.

    :param formatter: Can be a Formatter object (returned as-is), a
        callable (used as the entity substitution hook of a new
        XMLFormatter or HTMLFormatter), or any other value (used as a
        key into the REGISTRY of XMLFormatter or HTMLFormatter).
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees use XMLFormatter; everything else uses HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
198
199 @property
200 def _is_xml(self):
201 """Is this element part of an XML tree or an HTML tree?
202
203 This is used in formatter_for_name, when deciding whether an
204 XMLFormatter or HTMLFormatter is more appropriate. It can be
205 inefficient, but it should be called very rarely.
206 """
207 if self.known_xml is not None:
208 # Most of the time we will have determined this when the
209 # document is parsed.
210 return self.known_xml
211
212 # Otherwise, it's likely that this element was created by
213 # direct invocation of the constructor from within the user's
214 # Python code.
215 if self.parent is None:
216 # This is the top-level object. It should have .known_xml set
217 # from tree creation. If not, take a guess--BS is usually
218 # used on HTML markup.
219 return getattr(self, 'is_xml', False)
220 return self.parent._is_xml
221
222 nextSibling = _alias("next_sibling") # BS3
223 previousSibling = _alias("previous_sibling") # BS3
224
def replace_with(self, replace_with):
    """Swap this PageElement for another one, leaving the rest of the
    tree as it was.

    :param replace_with: A PageElement.
    :return: `self`, no longer part of the tree. Nothing is returned
        when `replace_with` is this very element.
    :raise ValueError: If this element has no parent, or if
        `replace_with` is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is self:
        return
    if replace_with is container:
        raise ValueError("Cannot replace a Tag with its parent.")
    # Remember the slot before extracting, then fill it again.
    slot = container.index(self)
    self.extract(_self_index=slot)
    container.insert(slot, replace_with)
    return self
replaceWith = replace_with  # BS3
246
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals are joined by the compiler, so the space
        # between "that" and "element" has to be written explicitly.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Insert the children last-to-first at the same index so they end
    # up in their original order; iterate over a copy of the list.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
264
def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, which takes over the position in the tree
        that `self` used to occupy and now contains `self`.
    """
    # replace_with() hands back the element that was taken out.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
275
def extract(self, _self_index=None):
    """Destructively rip this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: `self`, no longer part of the tree.
    """
    owner = self.parent
    if owner is not None:
        if _self_index is None:
            _self_index = owner.index(self)
        del owner.contents[_self_index]

    # Find the two elements that would be next to each other if this
    # element (and any children) hadn't been parsed, and connect them.
    tail = self._last_descendant()
    after = tail.next_element
    before = self.previous_element

    neighbours_differ = before is not after
    if before is not None and neighbours_differ:
        before.next_element = after
    if after is not None and neighbours_differ:
        after.previous_element = before
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Same stitching for the sibling chain.
    left = self.previous_sibling
    right = self.next_sibling
    siblings_differ = left is not right
    if left is not None and siblings_differ:
        left.next_sibling = right
    if right is not None and siblings_differ:
        right.previous_sibling = left
    self.previous_sibling = self.next_sibling = None
    return self
313
314 def _last_descendant(self, is_initialized=True, accept_self=True):
315 """Finds the last element beneath this object to be parsed.
316
317 :param is_initialized: Has `setup` been called on this PageElement
318 yet?
319 :param accept_self: Is `self` an acceptable answer to the question?
320 """
321 if is_initialized and self.next_sibling is not None:
322 last_child = self.next_sibling.previous_element
323 else:
324 last_child = self
325 while isinstance(last_child, Tag) and last_child.contents:
326 last_child = last_child.contents[-1]
327 if not accept_self and last_child is self:
328 last_child = None
329 return last_child
330 # BS3: Not part of the API!
331 _lastRecursiveChild = _last_descendant
332
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
        in `self.children` by the new PageElement.
    :param new_child: A PageElement. A plain string is converted to a
        NavigableString; a BeautifulSoup object has its children
        inserted one at a time instead.
    :raise ValueError: If `new_child` is None or is this element.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    is_plain_text = (isinstance(new_child, str)
                     and not isinstance(new_child, NavigableString))
    if is_plain_text:
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one
        # BeautifulSoup object contains another. Insert the children
        # one at a time, each one slot further along.
        for offset, subchild in enumerate(list(new_child.contents)):
            self.insert(position + offset, subchild)
        return

    position = min(position, len(self.contents))
    if getattr(new_child, 'parent', None) is not None:
        # If the element is already one of our children and sits
        # before the target slot, extracting it shifts the target
        # index down by one.
        if new_child.parent is self and self.index(new_child) < position:
            position -= 1
        new_child.extract()

    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        left_neighbor = self.contents[position - 1]
        new_child.previous_sibling = left_neighbor
        left_neighbor.next_sibling = new_child
        new_child.previous_element = left_neighbor._last_descendant(False)
    preceding = new_child.previous_element
    if preceding is not None:
        preceding.next_element = new_child

    tail = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one of them has a next sibling;
        # that is the element that comes next in the document. If none
        # does, the new child's last element is the last element in
        # the document and its next_element is None.
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        right_neighbor = self.contents[position]
        new_child.next_sibling = right_neighbor
        if right_neighbor is not None:
            right_neighbor.previous_sibling = new_child
        tail.next_element = right_neighbor

    successor = tail.next_element
    if successor is not None:
        successor.previous_element = tail
    self.contents.insert(position, new_child)
414
def append(self, tag):
    """Add the given PageElement at the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
421
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: An iterable of PageElements.
    """
    # Appending an element that already has a parent extracts it from
    # that parent's .contents. If `tags` is such a list, iterating it
    # directly would skip elements as it shrinks, so take a snapshot.
    for tag in list(tags):
        self.append(tag)
429
def insert_before(self, *args):
    """Place the given element(s) immediately before this one.

    All the elements end up with the same parent as this element.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)
451
def insert_after(self, *args):
    """Place the given element(s) immediately after this one.

    All the elements end up with the same parent as this element, in
    the order in which they were given.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself one of `args`.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    # `offset` counts the successors already placed, so that each new
    # one lands behind the previous ones.
    for offset, successor in enumerate(args):
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        parent.insert(parent.index(self) + 1 + offset, successor)
477
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first match that appears later in the document than
    this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
494
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every match that appears later in the document than this
    PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
513
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest matching sibling that appears later in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
531
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every matching sibling that appears later in the document
    than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
552
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Search backwards in the document from this PageElement and
    return the first match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
570
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Search backwards in the document from this PageElement and
    return every match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
591
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest matching sibling that appears earlier in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
609
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every matching sibling that appears earlier in the
    document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
630
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this PageElement that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: We can't use _find_one because findParents takes a different
    # set of arguments.
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
findParent = find_parent  # BS3
653
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return every parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: Whatever `_find_all` produces for the `.parents`
        generator: a ResultSet of the matches.
    """
    ancestors = self.parents
    # There is no text filter for parents, hence the explicit None.
    return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
findParents = find_parents  # BS3
fetchParents = find_parents  # BS2
672
@property
def next(self):
    """Alias for `.next_element`: the PageElement, if any, that was
    parsed just after this one.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    following = self.next_element
    return following
681
@property
def previous(self):
    """Alias for `.previous_element`: the PageElement, if any, that was
    parsed just before this one.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    preceding = self.previous_element
    return preceding
690
691 #These methods do the real heavy lifting.
692
693 def _find_one(self, method, name, attrs, text, **kwargs):
694 r = None
695 l = method(name, attrs, text, 1, **kwargs)
696 if l:
697 r = l[0]
698 return r
699
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter on text. When None, a `string` keyword
        argument is used in its place.
    :param limit: Stop looking after finding this many results.
    :param generator: Yields the candidate PageElements.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of the matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # `and` binds tighter than `or`, so both name tests are
            # grouped explicitly to keep them behind the Tag check.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    # General case: run every truthy candidate through the strainer.
    results = ResultSet(strainer)
    for candidate in generator:
        if candidate:
            found = strainer.search(candidate)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
751
752 #These generators can be used to navigate starting from both
753 #NavigableStrings and Tags.
@property
def next_elements(self):
    """Walk the `.next_element` chain: every PageElement that was
    parsed after this one.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_element
    while True:
        if cursor is None:
            return
        yield cursor
        # Read the link only after the consumer has resumed us.
        cursor = cursor.next_element
764
@property
def next_siblings(self):
    """Walk the `.next_sibling` chain: every PageElement that is a
    sibling of this one but was parsed later.

    :yield: A sequence of PageElements.
    """
    cursor = self.next_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        # Read the link only after the consumer has resumed us.
        cursor = cursor.next_sibling
776
@property
def previous_elements(self):
    """Walk the `.previous_element` chain: every PageElement that was
    parsed before this one.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_element
    while True:
        if cursor is None:
            return
        yield cursor
        # Read the link only after the consumer has resumed us.
        cursor = cursor.previous_element
787
@property
def previous_siblings(self):
    """Walk the `.previous_sibling` chain: every PageElement that is a
    sibling of this one but was parsed earlier.

    :yield: A sequence of PageElements.
    """
    cursor = self.previous_sibling
    while True:
        if cursor is None:
            return
        yield cursor
        # Read the link only after the consumer has resumed us.
        cursor = cursor.previous_sibling
799
@property
def parents(self):
    """Walk the `.parent` chain: every PageElement that is a parent of
    this one, closest first.

    :yield: A sequence of PageElements.
    """
    cursor = self.parent
    while True:
        if cursor is None:
            return
        yield cursor
        # Read the link only after the consumer has resumed us.
        cursor = cursor.parent
810
@property
def decomposed(self):
    """Report whether this PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    if flag:
        return flag
    return False
818
819 # Old non-property versions of the generators, for backwards
820 # compatibility with BS3.
def nextGenerator(self):
    """BS3-era method form of the `.next_elements` property."""
    return self.next_elements
823
def nextSiblingGenerator(self):
    """BS3-era method form of the `.next_siblings` property."""
    return self.next_siblings
826
def previousGenerator(self):
    """BS3-era method form of the `.previous_elements` property."""
    return self.previous_elements
829
def previousSiblingGenerator(self):
    """BS3-era method form of the `.previous_siblings` property."""
    return self.previous_siblings
832
def parentGenerator(self):
    """BS3-era method form of the `.parents` property."""
    return self.parents
835
836
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            instance = str.__new__(cls, value)
        else:
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        # Start out with no parent, siblings or neighbouring elements.
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string with the same contents and class as this
        one, but not connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Return the plain-`str` value as the sole __new__ argument."""
        return (str(self),)

    def __getattr__(self, attr):
        """Make text.string give you text.

        This is for backwards compatibility for Navigable*String, but
        for CData* it lets you get the string without the CData wrapper.
        Any other missing attribute raises AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter and wrap the
        result in PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't
        crash when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
909
910
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any
        subclass-specific prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into
            the Formatter, but only to trigger any side effects: the
            return value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
937
class CData(PreformattedString):
    """A CDATA block, output between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
942
class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction, output between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
948
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
953
class Comment(PreformattedString):
    """An HTML or XML comment, rendered between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
958
959
class Declaration(PreformattedString):
    """An XML declaration, rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
964
965
class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class this method was called on
            (a Doctype, or the subclass it was invoked through).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # With a public ID, the system ID follows as a bare quoted
            # string, with no SYSTEM keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through cls rather than naming Doctype, so that a
        # subclass calling this constructor gets its own type back.
        return cls(value)
993
994
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
1002
1003
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    The separate class lets executable code be told apart from
    textual content.
    """
1011
1012
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
1020
1021
1022 class Tag(PageElement):
1023 """Represents an HTML or XML tag that is part of a parse tree, along
1024 with its attributes and contents.
1025
1026 When Beautiful Soup parses the markup <b>penguin</b>, it will
1027 create a Tag object representing the <b> tag.
1028 """
1029
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
):
    """Basic constructor.

    :param parser: A BeautifulSoup object.
    :param builder: A TreeBuilder.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when no builder is given.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag. Only used
        when no builder is given.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when no
        builder is given.
    :raise ValueError: If `name` is None.
    """
    # Keep only the parser's class, never the parser object itself:
    # that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position information is recorded only when some was supplied and
    # the builder (if there is one) keeps line numbers. Otherwise the
    # two attributes are simply not set on this object.
    position_given = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and position_given:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Let the builder rewrite the values of attributes it treats
        # as lists.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Take a private copy so the caller's mapping is not shared.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Store the builder's whole collection of list-valued
        # attributes instead of asking about each tag: asking would
        # build a new data structure every time, and (unlike
        # can_be_empty_element) this is almost never checked.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # The tag names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # Without a TreeBuilder, use whatever values were passed in.
        # They're probably None, unless this is a copy of some other
        # tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags

parserClass = _alias("parser_class")  # BS3
1127
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class as this one.
    """
    def stored(attr):
        # Ordinary attribute lookup only. The constructor may never
        # have set `builder`, `sourceline` or `sourcepos`, and a plain
        # self.<attr> would then fall through to __getattr__, which
        # turns the lookup into find(attr). A child tag that happens
        # to be called <builder> or <sourceline> would be handed to
        # the constructor in place of the real value.
        try:
            return object.__getattribute__(self, attr)
        except AttributeError:
            return None

    clone = type(self)(
        None, stored('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=stored('sourceline'), sourcepos=stored('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # The constructor may have recomputed or reset these; carry the
    # original's values over explicitly.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
1145
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with any contents is never an empty-element tag.

    For a tag without contents the answer is whatever was recorded in
    can_be_empty_element, which depends on the builder used to create
    the tag. If the builder has a designated list of empty-element
    tags, then only a tag whose name shows up in that list is
    considered an empty-element tag. If the builder has no such list,
    then any tag with no contents is an empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
1163
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has exactly one child and that child is
       a string, the string. If the single child is a tag, that
       tag's own 'string' value, recursively. If this element has
       no children or more than one child, None.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if not isinstance(only_child, NavigableString):
        # A lone child tag: defer to its .string.
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The new child is a fresh object built by calling the class of
    `string` on it.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
1190
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that are empty after stripping are
        skipped.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        etc. If None, every NavigableString is accepted.

    :yield: A sequence of strings.
    """
    for candidate in self.descendants:
        if types is None:
            wanted = isinstance(candidate, NavigableString)
        else:
            # Exact class membership, not isinstance(): an object of
            # a subclass that is not itself listed is ignored.
            wanted = type(candidate) in types
        if not wanted:
            continue
        if strip:
            candidate = candidate.strip()
            if len(candidate) == 0:
                continue
        yield candidate

strings = property(_all_strings)
1218
@property
def stripped_strings(self):
    """Yield the strings found by _all_strings(), stripped.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
1227
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
1249
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Read the link to the following element first: wiping the
        # instance dictionary destroys it.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
1269
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag. Other children are still
        extracted.
    """
    # Walk a snapshot: removing a child changes self.contents.
    for child in list(self.contents):
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
1286
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def mergeable(node):
        # Plain strings can be merged; PreformattedString objects
        # (and tags) are left alone.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that has to be
    # merged, rather than copying self.contents: in most cases very
    # few strings are affected.
    marked = []
    last_position = None
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        last_position = len(self.contents) - 1
        if position == last_position:
            # Nothing follows the final child, so there is no pair.
            continue
        following = self.contents[position + 1]
        if mergeable(current) and mergeable(following):
            marked.append(position)

    # Work backwards so that removing items from .contents does not
    # shift the positions still to be processed.
    for position in reversed(marked):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
1324
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None)
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
1337
def get(self, key, default=None):
    """Look up an attribute of this tag.

    :param key: The attribute name.
    :param default: Returned when the tag has no such attribute.
    :return: The value stored under `key` in `self.attrs`, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
1343
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-element list wrapping the value.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
1357
def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?

    :param key: The attribute name to test for in `self.attrs`.
    """
    present = key in self.attrs
    return present
1361
def __hash__(self):
    """Hash a Tag by the hash of its string rendering."""
    return hash(str(self))
1364
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag.

    The lookup is a plain index into `self.attrs`, so a missing
    attribute raises whatever that mapping raises for a missing key.
    """
    attributes = self.attrs
    return attributes[key]
1369
def __iter__(self):
    """Iterating over a Tag iterates over its contents."""
    children = self.contents
    return iter(children)
1373
def __len__(self):
    """The length of a Tag is the length of its list of contents."""
    children = self.contents
    return len(children)
1377
def __contains__(self, x):
    """`x in tag` tests whether x is among the tag's direct contents."""
    children = self.contents
    return x in children
1380
def __bool__(self):
    """A Tag is always truthy, even when it has no contents.

    Without this, truth testing would use __len__, and a tag with no
    children would be falsy.
    """
    return True
1384
def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag, by storing it in `self.attrs`."""
    attributes = self.attrs
    attributes[key] = value
1389
def __delitem__(self, key):
    """Deleting tag[key] removes the 'key' attribute from the tag.

    Deleting an attribute the tag does not have is silently ignored.
    """
    attributes = self.attrs
    attributes.pop(key, None)
1393
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All arguments are forwarded to find_all() unchanged.
    """
    matches = self.find_all(*args, **kwargs)
    return matches
1399
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    Python calls this only after ordinary attribute lookup has failed.

    :param tag: The attribute name that could not be found.
    :return: The result of find() for that name (with a trailing
        'Tag' removed, for the deprecated BS3 spelling).
    :raise AttributeError: For names starting with a double
        underscore, and for 'contents'.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            )
        )
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    elif not tag.startswith("__") and not tag == "contents":
        return self.find(tag)
    # Report the bare class name so the message reads like Python's
    # own ("'Tag' object has no attribute ..."), not the repr of the
    # class object.
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (
            self.__class__.__name__, tag))
1417
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`.

    An object lacking a `name`, `attrs` or `contents` attribute is
    never equal to a Tag.
    """
    if self is other:
        return True
    tag_like = (hasattr(other, 'name')
                and hasattr(other, 'attrs')
                and hasattr(other, 'contents'))
    if not tag_like:
        return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Same number of children: compare them pairwise, in order.
    return not any(
        child != other.contents[position]
        for position, child in enumerate(self.contents))
1434
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = (self == other)
    return not same
1439
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)
    # "The return value must be a string object", i.e. Unicode
    return self.decode()
1455
def __unicode__(self):
    """Renders this PageElement as a Unicode string, via decode()."""
    rendered = self.decode()
    return rendered
1459
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        return self.encode()
    return self.decode()

# Under Python 3 all three renderings are the Unicode one.
if PY3K:
    __str__ = __repr__ = __unicode__
1473
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Two steps: render the data structure as Unicode, telling
    # decode() which encoding the text is headed for, then encode
    # that Unicode.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
1497
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup.
    """
    # Resolve a formatter name to a Formatter object once, up front,
    # so the lookup is not repeated for every nested call.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as either a bare key or key="value".
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (isinstance(val, AttributeValueWithCharsetSubstitution)
              and eventual_encoding is not None):
            val = val.encode(eventual_encoding)
        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '=' + formatter.quoted_attribute_value(text))

    prefix = (self.prefix + ":") if self.prefix else ''

    # An empty-element tag gets a closing marker inside the start tag
    # and no end tag; every other tag gets an end tag.
    close = ''
    closing_tag = ''
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closing_tag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    indent_space = ''
    if indent_level is not None:
        indent_space = ' ' * (indent_level - 1)
    if pretty_print:
        closing_indent = indent_space
        indent_contents = indent_level + 1
    else:
        closing_indent = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object: only its contents show.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and closing_tag:
        pieces.append(closing_indent)
    pieces.append(closing_tag)
    if indent_level is not None and closing_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
1598
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The indentation requested by the caller;
        None means pretty-printing is off.
    :return: False when `indent_level` is None or when this tag's
        name is listed in `self.preserve_whitespace_tags`; True
        otherwise.
    """
    if indent_level is None:
        return False
    return (
        not self.preserve_whitespace_tags
        or self.name not in self.preserve_whitespace_tags
    )
1612
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level to switch pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1627
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The rendered children, concatenated.
    """
    # Resolve a formatter name to a Formatter object once, so the
    # lookup is not repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = indent_level is not None
    pieces = []
    for child in self:
        text = None
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # A child tag renders itself, indentation included.
            pieces.append(
                child.decode(indent_level, eventual_encoding, formatter))
        preserve_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        # Note the truthiness test: an indent level of 0 does not
        # trigger stripping.
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        # When pretty-printing outside a whitespace-preserving tag,
        # each string gets its own indented line.
        own_line = pretty_print and not preserve_whitespace
        if own_line:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if own_line:
            pieces.append("\n")
    return ''.join(pieces)
1673
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
1692
1693 # Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Forwards to encode_contents(). `indentLevel` is only honoured
    when `prettyPrint` is true; otherwise no indentation is requested.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1701
1702 #Soup methods
1703
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Same search as find_all(), limited to one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find
1728
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1753
1754 #Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # Hand back an iterator instead of the list itself, to make the
    # purpose of the property clear.
    direct = self.contents
    return iter(direct)  # XXX This seems to be untested.
1763
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starting at the first child, the iteration follows `next_element`
    links until it has passed this element's last descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just after the last descendant marks the end.
    stop_node = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_node:
        yield node
        node = node.next_element
1778
1779 # CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first match.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: A Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    # Same as select(), with the limit fixed at one result.
    matches = self.select(selector, namespaces, 1, **kwargs)
    if not matches:
        return None
    return matches[0]
1800
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
       installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    # A missing limit is handed to soupsieve as 0.
    limit = 0 if limit is None else limit
    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matches = soupsieve.select(selector, self, namespaces, limit, **kwargs)

    # Wrap the matches in a ResultSet because it's more consistent
    # and because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, matches)
1836
1837 # Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the same thing as .children."""
    direct = self.children
    return direct
1841
def recursiveChildGenerator(self):
    """Deprecated generator; returns the same thing as .descendants."""
    everything = self.descendants
    return everything
1845
def has_key(self, key):
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from __in__ (contents).

    has_key() is gone in Python 3, anyway.

    Issues a warning, then answers exactly as has_attr() does.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
1855
1856 # Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
            A non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        """
        # NOTE: the mutable default for `attrs` is kept for interface
        # compatibility. It is never mutated: the dict is copied before
        # being updated.
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy so the caller's dictionary is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in list(attrs.items())
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form _matches() can work with.

        :param value: A filter value (or a piece of markup) to normalize.
        :return: `value` itself if it is a string, a callable, an object
            with a `match` attribute (e.g. a regular expression), a
            boolean or None; a decoded string for a bytestring; a list
            for any other iterable; otherwise `str(value)`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter as a string if one is set, otherwise
            "<name>|<attrs>".
        """
        if self.text:
            if isinstance(self.text, str):
                return self.text
            # The text filter may be a regular expression, a callable,
            # a list or True. __str__ must return a string, so convert
            # explicitly instead of raising TypeError.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a Tag.
        :param markup_attrs: A dictionary of attributes as found in some
            markup (an iterable of (key, value) pairs is also accepted).
            Ignored when `markup_name` is a Tag.

        :return: The Tag (or tag name) that was passed in if the
            prospective tag would match this SoupStrainer; None otherwise.
        """
        # NOTE: the mutable default for `markup_attrs` is kept for
        # interface compatibility; it is only ever read.
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        # Build the attribute lookup lazily, and only
                        # once, the first time a filter needs it.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # NOTE(review): `found.string` assumes `found` is a Tag; a plain
        # tag name combined with a text filter would fail here. Callers
        # appear to avoid that combination -- confirm before relying on it.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is not iterable, a Tag or a string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                isinstance(markup, str):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether one piece of markup satisfies one filter.

        :param markup: A Tag, a string, None, or a list/tuple of strings
            (the value of a multi-valued attribute).
        :param match_against: The filter: True, a callable, a string, an
            object with a `search` method (e.g. a regular expression),
            or an iterable of such filters.
        :param already_tried: Keys of the filters already tested while
            walking an iterable filter; used to avoid infinite recursion.
        :return: A truthy value if the markup matches, a falsy one
            otherwise. A regular-expression filter yields whatever its
            `search` method returns.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name,
                match_against
            )

        return match
2111
2112
class ResultSet(list):
    """A plain list of results that also remembers, in `source`, the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Populate the list and record where the results came from.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Fail with a message that explains a common mistake.

        Only reached when normal attribute lookup fails.

        :param key: The name of the missing attribute.
        :raise AttributeError: Always.
        """
        template = "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"
        raise AttributeError(template % key)