Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/bs4/element.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 # Use of this source code is governed by the MIT license. | |
2 __license__ = "MIT" | |
3 | |
4 try: | |
5 from collections.abc import Callable # Python 3.6 | |
6 except ImportError as e: | |
7 from collections import Callable | |
8 import re | |
9 import sys | |
10 import warnings | |
11 try: | |
12 import soupsieve | |
13 except ImportError as e: | |
14 soupsieve = None | |
15 warnings.warn( | |
16 'The soupsieve package is not installed. CSS selectors cannot be used.' | |
17 ) | |
18 | |
19 from bs4.formatter import ( | |
20 Formatter, | |
21 HTMLFormatter, | |
22 XMLFormatter, | |
23 ) | |
24 | |
# Name of the encoding used by default for encoded output.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches one run of consecutive non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches one run of consecutive whitespace characters.
whitespace_re = re.compile(r"\s+")
33 | |
34 def _alias(attr): | |
35 """Alias one attribute name to another for backward compatibility""" | |
36 @property | |
37 def alias(self): | |
38 return getattr(self, attr) | |
39 | |
40 @alias.setter | |
41 def alias(self): | |
42 return setattr(self, attr) | |
43 return alias | |
44 | |
45 | |
46 # These encodings are recognized by Python (so PageElement.encode | |
47 # could theoretically support them) but XML and HTML don't recognize | |
48 # them (so they should not show up in an XML or HTML document as that | |
49 # document's encoding). | |
50 # | |
51 # If an XML document is encoded in one of these encodings, no encoding | |
52 # will be mentioned in the XML declaration. If an HTML document is | |
53 # encoded in one of these encodings, and the HTML document has a | |
54 # <meta> tag that mentions an encoding, the encoding will be given as | |
55 # the empty string. | |
56 # | |
57 # Source: | |
58 # https://docs.python.org/3/library/codecs.html#python-specific-encodings | |
# Codec names that Python understands but that must never be advertised
# as the encoding of an XML or HTML document.
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
73 | |
74 | |
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records how it was built.

    The instance keeps the ``prefix`` ('xml'), the ``name`` ('lang') and
    the ``namespace`` it was created with as attributes.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value from ``prefix`` and ``name``.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: The namespace, stored unchanged.
        """
        # An empty name means the default namespace, whose name "has no
        # value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
97 | |
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to ``str`` itself; it serves as a
    common type for attribute values that carry an encoding name.
    """
100 | |
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' produces one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        """Create the value and remember the string it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset to advertise for an output encoding.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: ``encoding`` itself, or the empty string when it is one
            of the Python-specific encodings.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120 | |
121 | |
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    produces one of these objects as the value of the 'content'
    attribute.
    """

    # Group 1 captures everything up to and including "charset=";
    # group 3 captures the charset name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain ``str`` if it names no charset."""
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset name replaced.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: The empty string for Python-specific encodings;
            otherwise the original value with ``encoding`` substituted
            for the charset name.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
149 | |
150 | |
151 class PageElement(object): | |
152 """Contains the navigational information for some part of the page: | |
153 that is, its current location in the parse tree. | |
154 | |
155 NavigableString, Tag, etc. are all subclasses of PageElement. | |
156 """ | |
157 | |
158 def setup(self, parent=None, previous_element=None, next_element=None, | |
159 previous_sibling=None, next_sibling=None): | |
160 """Sets up the initial relations between this element and | |
161 other elements. | |
162 | |
163 :param parent: The parent of this element. | |
164 | |
165 :param previous_element: The element parsed immediately before | |
166 this one. | |
167 | |
168 :param next_element: The element parsed immediately before | |
169 this one. | |
170 | |
171 :param previous_sibling: The most recently encountered element | |
172 on the same level of the parse tree as this one. | |
173 | |
174 :param previous_sibling: The next element to be encountered | |
175 on the same level of the parse tree as this one. | |
176 """ | |
177 self.parent = parent | |
178 | |
179 self.previous_element = previous_element | |
180 if previous_element is not None: | |
181 self.previous_element.next_element = self | |
182 | |
183 self.next_element = next_element | |
184 if self.next_element is not None: | |
185 self.next_element.previous_element = self | |
186 | |
187 self.next_sibling = next_sibling | |
188 if self.next_sibling is not None: | |
189 self.next_sibling.previous_sibling = self | |
190 | |
191 if (previous_sibling is None | |
192 and self.parent is not None and self.parent.contents): | |
193 previous_sibling = self.parent.contents[-1] | |
194 | |
195 self.previous_sibling = previous_sibling | |
196 if previous_sibling is not None: | |
197 self.previous_sibling.next_sibling = self | |
198 | |
def format_string(self, s, formatter):
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, an identifier accepted by
        `formatter_for_name`, or None to return ``s`` unchanged.
    :return: The formatter's substitution result for ``s``.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        resolved = formatter
    else:
        resolved = self.formatter_for_name(formatter)
    return resolved.substitute(s)
211 | |
def formatter_for_name(self, formatter):
    """Resolve an identifier to a Formatter, creating one if needed.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for an
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    :return: A Formatter.
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees use XMLFormatter; everything else uses HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if not isinstance(formatter, Callable):
        return formatter_class.REGISTRY[formatter]
    return formatter_class(entity_substitution=formatter)
231 | |
232 @property | |
233 def _is_xml(self): | |
234 """Is this element part of an XML tree or an HTML tree? | |
235 | |
236 This is used in formatter_for_name, when deciding whether an | |
237 XMLFormatter or HTMLFormatter is more appropriate. It can be | |
238 inefficient, but it should be called very rarely. | |
239 """ | |
240 if self.known_xml is not None: | |
241 # Most of the time we will have determined this when the | |
242 # document is parsed. | |
243 return self.known_xml | |
244 | |
245 # Otherwise, it's likely that this element was created by | |
246 # direct invocation of the constructor from within the user's | |
247 # Python code. | |
248 if self.parent is None: | |
249 # This is the top-level object. It should have .known_xml set | |
250 # from tree creation. If not, take a guess--BS is usually | |
251 # used on HTML markup. | |
252 return getattr(self, 'is_xml', False) | |
253 return self.parent._is_xml | |
254 | |
255 nextSibling = _alias("next_sibling") # BS3 | |
256 previousSibling = _alias("previous_sibling") # BS3 | |
257 | |
258 def replace_with(self, replace_with): | |
259 """Replace this PageElement with another one, keeping the rest of the | |
260 tree the same. | |
261 | |
262 :param replace_with: A PageElement. | |
263 :return: `self`, no longer part of the tree. | |
264 """ | |
265 if self.parent is None: | |
266 raise ValueError( | |
267 "Cannot replace one element with another when the " | |
268 "element to be replaced is not part of a tree.") | |
269 if replace_with is self: | |
270 return | |
271 if replace_with is self.parent: | |
272 raise ValueError("Cannot replace a Tag with its parent.") | |
273 old_parent = self.parent | |
274 my_index = self.parent.index(self) | |
275 self.extract(_self_index=my_index) | |
276 old_parent.insert(my_index, replace_with) | |
277 return self | |
278 replaceWith = replace_with # BS3 | |
279 | |
280 def unwrap(self): | |
281 """Replace this PageElement with its contents. | |
282 | |
283 :return: `self`, no longer part of the tree. | |
284 """ | |
285 my_parent = self.parent | |
286 if self.parent is None: | |
287 raise ValueError( | |
288 "Cannot replace an element with its contents when that" | |
289 "element is not part of a tree.") | |
290 my_index = self.parent.index(self) | |
291 self.extract(_self_index=my_index) | |
292 for child in reversed(self.contents[:]): | |
293 my_parent.insert(my_index, child) | |
294 return self | |
295 replace_with_children = unwrap | |
296 replaceWithChildren = unwrap # BS3 | |
297 | |
298 def wrap(self, wrap_inside): | |
299 """Wrap this PageElement inside another one. | |
300 | |
301 :param wrap_inside: A PageElement. | |
302 :return: `wrap_inside`, occupying the position in the tree that used | |
303 to be occupied by `self`, and with `self` inside it. | |
304 """ | |
305 me = self.replace_with(wrap_inside) | |
306 wrap_inside.append(me) | |
307 return wrap_inside | |
308 | |
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    The element keeps its own descendants; only its links to the
    parent, to its siblings and to the elements before and after its
    subtree are cut.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: `self`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            # Caller did not supply the position; look it up.
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    # The identity checks skip relinking when the element before this
    # one and the element after its subtree are the same object.
    if (self.previous_element is not None and
        self.previous_element is not next_element):
        self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Cut the parse-order links at both ends of the removed subtree.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other, then forget them.
    if (self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling):
        self.previous_sibling.next_sibling = self.next_sibling
    if (self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
346 | |
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Locate the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet?
    :param accept_self: Is `self` an acceptable answer to the question?
    :return: The last descendant; `self` when nothing lies beneath it;
        or None when the answer would be `self` and ``accept_self`` is
        false.
    """
    if is_initialized and self.next_sibling is not None:
        # Whatever was parsed just before the next sibling is the end
        # of this element's subtree.
        candidate = self.next_sibling.previous_element
    else:
        # Walk down the last child of each nested Tag.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]
    if candidate is self and not accept_self:
        return None
    return candidate
363 # BS3: Not part of the API! | |
364 _lastRecursiveChild = _last_descendant | |
365 | |
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    A plain ``str`` is converted to a NavigableString first. A
    BeautifulSoup object is not inserted itself; its children are
    inserted one after another instead. An element that already has a
    parent is extracted from its current position before being linked
    in here.

    :param position: The numeric position that should be occupied
        in `self.children` by the new PageElement. Values past the end
        are clamped to ``len(self.contents)``.
    :param new_child: A PageElement.
    :raise ValueError: If ``new_child`` is None or is this element.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    # Imported inside the method rather than at module level.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the children one at a time.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: in parse order it comes right after this element.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        # Otherwise it follows the end of the preceding child's subtree.
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    # The end of the inserted subtree is what gets linked to whatever
    # follows in parse order.
    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        # Inserting at the end: there is no next sibling, so climb the
        # ancestors until one of them has a next sibling.
        new_child.next_sibling = None

        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
447 | |
448 def append(self, tag): | |
449 """Appends the given PageElement to the contents of this one. | |
450 | |
451 :param tag: A PageElement. | |
452 """ | |
453 self.insert(len(self.contents), tag) | |
454 | |
455 def extend(self, tags): | |
456 """Appends the given PageElements to this one's contents. | |
457 | |
458 :param tags: A list of PageElements. | |
459 """ | |
460 for tag in tags: | |
461 self.append(tag) | |
462 | |
463 def insert_before(self, *args): | |
464 """Makes the given element(s) the immediate predecessor of this one. | |
465 | |
466 All the elements will have the same parent, and the given elements | |
467 will be immediately before this one. | |
468 | |
469 :param args: One or more PageElements. | |
470 """ | |
471 parent = self.parent | |
472 if parent is None: | |
473 raise ValueError( | |
474 "Element has no parent, so 'before' has no meaning.") | |
475 if any(x is self for x in args): | |
476 raise ValueError("Can't insert an element before itself.") | |
477 for predecessor in args: | |
478 # Extract first so that the index won't be screwed up if they | |
479 # are siblings. | |
480 if isinstance(predecessor, PageElement): | |
481 predecessor.extract() | |
482 index = parent.index(self) | |
483 parent.insert(index, predecessor) | |
484 | |
485 def insert_after(self, *args): | |
486 """Makes the given element(s) the immediate successor of this one. | |
487 | |
488 The elements will have the same parent, and the given elements | |
489 will be immediately after this one. | |
490 | |
491 :param args: One or more PageElements. | |
492 """ | |
493 # Do all error checking before modifying the tree. | |
494 parent = self.parent | |
495 if parent is None: | |
496 raise ValueError( | |
497 "Element has no parent, so 'after' has no meaning.") | |
498 if any(x is self for x in args): | |
499 raise ValueError("Can't insert an element after itself.") | |
500 | |
501 offset = 0 | |
502 for successor in args: | |
503 # Extract first so that the index won't be screwed up if they | |
504 # are siblings. | |
505 if isinstance(successor, PageElement): | |
506 successor.extract() | |
507 index = parent.index(self) | |
508 parent.insert(index+1+offset, successor) | |
509 offset += 1 | |
510 | |
511 def find_next(self, name=None, attrs={}, text=None, **kwargs): | |
512 """Find the first PageElement that matches the given criteria and | |
513 appears later in the document than this PageElement. | |
514 | |
515 All find_* methods take a common set of arguments. See the online | |
516 documentation for detailed explanations. | |
517 | |
518 :param name: A filter on tag name. | |
519 :param attrs: A dictionary of filters on attribute values. | |
520 :param text: A filter for a NavigableString with specific text. | |
521 :kwargs: A dictionary of filters on attribute values. | |
522 :return: A PageElement. | |
523 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
524 """ | |
525 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) | |
526 findNext = find_next # BS3 | |
527 | |
528 def find_all_next(self, name=None, attrs={}, text=None, limit=None, | |
529 **kwargs): | |
530 """Find all PageElements that match the given criteria and appear | |
531 later in the document than this PageElement. | |
532 | |
533 All find_* methods take a common set of arguments. See the online | |
534 documentation for detailed explanations. | |
535 | |
536 :param name: A filter on tag name. | |
537 :param attrs: A dictionary of filters on attribute values. | |
538 :param text: A filter for a NavigableString with specific text. | |
539 :param limit: Stop looking after finding this many results. | |
540 :kwargs: A dictionary of filters on attribute values. | |
541 :return: A ResultSet containing PageElements. | |
542 """ | |
543 return self._find_all(name, attrs, text, limit, self.next_elements, | |
544 **kwargs) | |
545 findAllNext = find_all_next # BS3 | |
546 | |
547 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): | |
548 """Find the closest sibling to this PageElement that matches the | |
549 given criteria and appears later in the document. | |
550 | |
551 All find_* methods take a common set of arguments. See the | |
552 online documentation for detailed explanations. | |
553 | |
554 :param name: A filter on tag name. | |
555 :param attrs: A dictionary of filters on attribute values. | |
556 :param text: A filter for a NavigableString with specific text. | |
557 :kwargs: A dictionary of filters on attribute values. | |
558 :return: A PageElement. | |
559 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
560 """ | |
561 return self._find_one(self.find_next_siblings, name, attrs, text, | |
562 **kwargs) | |
563 findNextSibling = find_next_sibling # BS3 | |
564 | |
565 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, | |
566 **kwargs): | |
567 """Find all siblings of this PageElement that match the given criteria | |
568 and appear later in the document. | |
569 | |
570 All find_* methods take a common set of arguments. See the online | |
571 documentation for detailed explanations. | |
572 | |
573 :param name: A filter on tag name. | |
574 :param attrs: A dictionary of filters on attribute values. | |
575 :param text: A filter for a NavigableString with specific text. | |
576 :param limit: Stop looking after finding this many results. | |
577 :kwargs: A dictionary of filters on attribute values. | |
578 :return: A ResultSet of PageElements. | |
579 :rtype: bs4.element.ResultSet | |
580 """ | |
581 return self._find_all(name, attrs, text, limit, | |
582 self.next_siblings, **kwargs) | |
583 findNextSiblings = find_next_siblings # BS3 | |
584 fetchNextSiblings = find_next_siblings # BS2 | |
585 | |
586 def find_previous(self, name=None, attrs={}, text=None, **kwargs): | |
587 """Look backwards in the document from this PageElement and find the | |
588 first PageElement that matches the given criteria. | |
589 | |
590 All find_* methods take a common set of arguments. See the online | |
591 documentation for detailed explanations. | |
592 | |
593 :param name: A filter on tag name. | |
594 :param attrs: A dictionary of filters on attribute values. | |
595 :param text: A filter for a NavigableString with specific text. | |
596 :kwargs: A dictionary of filters on attribute values. | |
597 :return: A PageElement. | |
598 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
599 """ | |
600 return self._find_one( | |
601 self.find_all_previous, name, attrs, text, **kwargs) | |
602 findPrevious = find_previous # BS3 | |
603 | |
604 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, | |
605 **kwargs): | |
606 """Look backwards in the document from this PageElement and find all | |
607 PageElements that match the given criteria. | |
608 | |
609 All find_* methods take a common set of arguments. See the online | |
610 documentation for detailed explanations. | |
611 | |
612 :param name: A filter on tag name. | |
613 :param attrs: A dictionary of filters on attribute values. | |
614 :param text: A filter for a NavigableString with specific text. | |
615 :param limit: Stop looking after finding this many results. | |
616 :kwargs: A dictionary of filters on attribute values. | |
617 :return: A ResultSet of PageElements. | |
618 :rtype: bs4.element.ResultSet | |
619 """ | |
620 return self._find_all(name, attrs, text, limit, self.previous_elements, | |
621 **kwargs) | |
622 findAllPrevious = find_all_previous # BS3 | |
623 fetchPrevious = find_all_previous # BS2 | |
624 | |
625 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): | |
626 """Returns the closest sibling to this PageElement that matches the | |
627 given criteria and appears earlier in the document. | |
628 | |
629 All find_* methods take a common set of arguments. See the online | |
630 documentation for detailed explanations. | |
631 | |
632 :param name: A filter on tag name. | |
633 :param attrs: A dictionary of filters on attribute values. | |
634 :param text: A filter for a NavigableString with specific text. | |
635 :kwargs: A dictionary of filters on attribute values. | |
636 :return: A PageElement. | |
637 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
638 """ | |
639 return self._find_one(self.find_previous_siblings, name, attrs, text, | |
640 **kwargs) | |
641 findPreviousSibling = find_previous_sibling # BS3 | |
642 | |
643 def find_previous_siblings(self, name=None, attrs={}, text=None, | |
644 limit=None, **kwargs): | |
645 """Returns all siblings to this PageElement that match the | |
646 given criteria and appear earlier in the document. | |
647 | |
648 All find_* methods take a common set of arguments. See the online | |
649 documentation for detailed explanations. | |
650 | |
651 :param name: A filter on tag name. | |
652 :param attrs: A dictionary of filters on attribute values. | |
653 :param text: A filter for a NavigableString with specific text. | |
654 :param limit: Stop looking after finding this many results. | |
655 :kwargs: A dictionary of filters on attribute values. | |
656 :return: A ResultSet of PageElements. | |
657 :rtype: bs4.element.ResultSet | |
658 """ | |
659 return self._find_all(name, attrs, text, limit, | |
660 self.previous_siblings, **kwargs) | |
661 findPreviousSiblings = find_previous_siblings # BS3 | |
662 fetchPreviousSiblings = find_previous_siblings # BS2 | |
663 | |
664 def find_parent(self, name=None, attrs={}, **kwargs): | |
665 """Find the closest parent of this PageElement that matches the given | |
666 criteria. | |
667 | |
668 All find_* methods take a common set of arguments. See the online | |
669 documentation for detailed explanations. | |
670 | |
671 :param name: A filter on tag name. | |
672 :param attrs: A dictionary of filters on attribute values. | |
673 :kwargs: A dictionary of filters on attribute values. | |
674 | |
675 :return: A PageElement. | |
676 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
677 """ | |
678 # NOTE: We can't use _find_one because findParents takes a different | |
679 # set of arguments. | |
680 r = None | |
681 l = self.find_parents(name, attrs, 1, **kwargs) | |
682 if l: | |
683 r = l[0] | |
684 return r | |
685 findParent = find_parent # BS3 | |
686 | |
687 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): | |
688 """Find all parents of this PageElement that match the given criteria. | |
689 | |
690 All find_* methods take a common set of arguments. See the online | |
691 documentation for detailed explanations. | |
692 | |
693 :param name: A filter on tag name. | |
694 :param attrs: A dictionary of filters on attribute values. | |
695 :param limit: Stop looking after finding this many results. | |
696 :kwargs: A dictionary of filters on attribute values. | |
697 | |
698 :return: A PageElement. | |
699 :rtype: bs4.element.Tag | bs4.element.NavigableString | |
700 """ | |
701 return self._find_all(name, attrs, None, limit, self.parents, | |
702 **kwargs) | |
703 findParents = find_parents # BS3 | |
704 fetchParents = find_parents # BS2 | |
705 | |
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for the ``next_element`` attribute.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element
714 | |
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for the ``previous_element`` attribute.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
723 | |
724 #These methods do the real heavy lifting. | |
725 | |
726 def _find_one(self, method, name, attrs, text, **kwargs): | |
727 r = None | |
728 l = method(name, attrs, text, 1, **kwargs) | |
729 if l: | |
730 r = l[0] | |
731 return r | |
732 | |
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text. When
        None, a ``string`` keyword argument is used in its place.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to search through.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a namespace-aware document,
                # we need to match the local name against tag.name. If not,
                # we need to match the fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The isinstance() check must guard BOTH name comparisons;
            # without the outer parentheses, `and` binds tighter than
            # `or` and the second comparison would read .name and
            # .prefix on elements that are not Tags.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    results = ResultSet(strainer)
    for candidate in generator:
        # Falsy elements are skipped without consulting the strainer.
        if not candidate:
            continue
        found = strainer.search(candidate)
        if found:
            results.append(found)
            if limit and len(results) >= limit:
                break
    return results
784 | |
785 #These generators can be used to navigate starting from both | |
786 #NavigableStrings and Tags. | |
787 @property | |
788 def next_elements(self): | |
789 """All PageElements that were parsed after this one. | |
790 | |
791 :yield: A sequence of PageElements. | |
792 """ | |
793 i = self.next_element | |
794 while i is not None: | |
795 yield i | |
796 i = i.next_element | |
797 | |
798 @property | |
799 def next_siblings(self): | |
800 """All PageElements that are siblings of this one but were parsed | |
801 later. | |
802 | |
803 :yield: A sequence of PageElements. | |
804 """ | |
805 i = self.next_sibling | |
806 while i is not None: | |
807 yield i | |
808 i = i.next_sibling | |
809 | |
810 @property | |
811 def previous_elements(self): | |
812 """All PageElements that were parsed before this one. | |
813 | |
814 :yield: A sequence of PageElements. | |
815 """ | |
816 i = self.previous_element | |
817 while i is not None: | |
818 yield i | |
819 i = i.previous_element | |
820 | |
@property
def previous_siblings(self):
    """Walk backward through the siblings of this element.

    Starts at the element stored in `previous_sibling` and keeps
    following that link until it is None.

    :yield: A sequence of PageElements.
    """
    node = self
    while True:
        node = node.previous_sibling
        if node is None:
            return
        yield node
832 | |
@property
def parents(self):
    """Walk up the tree through every ancestor of this element.

    Starts at the element stored in `parent` and keeps following that
    link until it is None.

    :yield: A sequence of PageElements.
    """
    node = self
    while True:
        node = node.parent
        if node is None:
            return
        yield node
843 | |
@property
def decomposed(self):
    """Report whether this PageElement has been decomposed.

    An element with no `_decomposed` attribute, or with a falsy one,
    counts as not decomposed.

    :rtype: bool
    """
    marker = getattr(self, '_decomposed', False)
    if marker:
        return marker
    # Normalize None, 0 and other falsy markers to a real False.
    return False
851 | |
852 # Old non-property versions of the generators, for backwards | |
853 # compatibility with BS3. | |
def nextGenerator(self):
    """BS3-compatible method form of the `next_elements` property."""
    return self.next_elements
856 | |
def nextSiblingGenerator(self):
    """BS3-compatible method form of the `next_siblings` property."""
    return self.next_siblings
859 | |
def previousGenerator(self):
    """BS3-compatible method form of the `previous_elements` property."""
    return self.previous_elements
862 | |
def previousSiblingGenerator(self):
    """BS3-compatible method form of the `previous_siblings` property."""
    return self.previous_siblings
865 | |
def parentGenerator(self):
    """BS3-compatible method form of the `parents` property."""
    return self.parents
868 | |
869 | |
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    Parsing the markup <b>penguin</b> produces a NavigableString that
    holds the text "penguin".
    """

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    # A string on its own does not reveal whether it lives in an XML
    # document or an HTML document, so this starts out unknown.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        During unpickling this is called with the string encoded in
        DEFAULT_OUTPUT_ENCODING. In that case the encoding has to be
        handed to str.__new__, otherwise non-ASCII characters cannot be
        handled.

        :param value: The text, as a str or as an encoded bytestring.
        """
        decode_args = ()
        if not isinstance(value, str):
            decode_args = (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string of the same class and content that is not
        connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Give pickle the plain text to pass back into __new__."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: For any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted text between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no name.

        The property exists so that code such as
        [x.name for x in tag.children]
        works on a mixture of Tag and NavigableString objects.
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to give a NavigableString a name."""
        raise AttributeError("A NavigableString cannot be given a name.")
942 | |
943 | |
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    Abstract base for special kinds of strings such as comments (the
    Comment class) and CDATA blocks (the CData class).
    """

    # Subclasses override these with their own opening/closing markers.
    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Wrap the raw string in the subclass-specific prefix and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Invoked for its side effects only; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
970 | |
class CData(PreformattedString):
    """A CDATA block.

    PREFIX and SUFFIX are the CDATA section markers that
    PreformattedString.output_ready() places around the raw string.
    """
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
975 | |
class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction.

    Output is the raw string wrapped as '<?' ... '>'.
    """

    PREFIX = '<?'
    SUFFIX = '>'
981 | |
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction.

    Differs from ProcessingInstruction only in its closing marker,
    which is '?>' instead of '>'.
    """
    PREFIX = '<?'
    SUFFIX = '?>'
986 | |
class Comment(PreformattedString):
    """An HTML or XML comment.

    Output is the raw string wrapped as '<!--' ... '-->'.
    """
    PREFIX = '<!--'
    SUFFIX = '-->'
991 | |
992 | |
class Declaration(PreformattedString):
    """An XML declaration.

    Output is the raw string wrapped as '<?' ... '?>'.
    """
    PREFIX = '<?'
    SUFFIX = '?>'
997 | |
998 | |
class Doctype(PreformattedString):
    """A document type declaration.

    Output is the raw string wrapped as '<!DOCTYPE ' ... '>' followed by
    a newline.
    """

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of `cls` (a Doctype, or the subclass this
            method was called on).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # A system ID that follows a public ID is written bare,
            # without the SYSTEM keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through `cls` so that subclasses calling this alternate
        # constructor get an instance of their own type back.
        return cls(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
1026 | |
1027 | |
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Adds no behavior of its own; the distinct class lets embedded
    stylesheets be told apart from textual content.
    """
    pass
1035 | |
1036 | |
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Adds no behavior of its own; the distinct class lets executable
    code be told apart from textual content.
    """
    pass
1044 | |
1045 | |
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Adds no behavior of its own; the distinct class lets such strings
    be told apart from the main body of the document.
    """
    pass
1053 | |
1054 | |
1055 class Tag(PageElement): | |
1056 """Represents an HTML or XML tag that is part of a parse tree, along | |
1057 with its attributes and contents. | |
1058 | |
1059 When Beautiful Soup parses the markup <b>penguin</b>, it will | |
1060 create a Tag object representing the <b> tag. | |
1061 """ | |
1062 | |
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
):
    """Basic constructor.

    :param parser: A BeautifulSoup object.
    :param builder: A TreeBuilder.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no truthy builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when `builder` is None.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag. Only used
        when `builder` is None.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when
        `builder` is None.
    :raise ValueError: If no name is given.
    """
    # Only the parser's class is kept, never the parser object itself,
    # so that extracted chunks can be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position information is stored only if some was supplied and the
    # builder (when there is one) asks for line numbers to be kept.
    position_given = sourceline is not None or sourcepos is not None
    if (not builder or builder.store_line_numbers) and position_given:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Let the builder split multi-valued attributes into lists.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # The whole data structure is stored, rather than asking the
        # builder about every tag: asking would mean building a new
        # structure each time, and this is almost never consulted.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # Without a TreeBuilder, use whatever was passed in. These are
        # probably None, unless this is a copy of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
1158 | |
# BS3 name for the `parser_class` attribute, provided through the
# module's _alias() backward-compatibility helper.
parserClass = _alias("parser_class")  # BS3
1160 | |
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class as this one.
    """
    def stored(attr):
        # Tag.__getattr__ turns a missing attribute into
        # self.find(attr), so `self.builder` or `self.sourceline` on a
        # tag that never stored them would return a *child tag* of that
        # name. object.__getattribute__ skips that fallback.
        try:
            return object.__getattribute__(self, attr)
        except AttributeError:
            return None

    clone = type(self)(
        None, stored('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=stored('sourceline'), sourcepos=stored('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # Carry over per-instance state that the constructor may have reset.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
1178 | |
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    `can_be_empty_element` holds. That value depends on the builder
    used to create the tag: if the builder has a designated list of
    empty-element tags, only a tag whose name is in that list
    qualifies; if it has no such list, any tag with no contents
    qualifies.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
1196 | |
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has a single string child, return
        value is that string. If this element has one child tag,
        return value is the 'string' attribute of the child tag,
        recursively. If this element is itself a string, has no
        children, or has more than one child, return value is None.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        # A lone child tag: defer to its own .string.
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The value is re-created through its own class before being
    appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
1223 | |
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that become empty are skipped.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        etc. If None, every NavigableString instance is accepted.

    :yield: A sequence of strings.
    """
    for node in self.descendants:
        if types is None:
            if not isinstance(node, NavigableString):
                continue
        elif type(node) not in types:
            # Exact class match: subclasses of a listed type are skipped.
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node

strings = property(_all_strings)
1251 | |
@property
def stripped_strings(self):
    """Yield every string from `_all_strings`, stripped.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
1260 | |
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
1282 | |
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Remember the successor first: clearing __dict__ erases the link.
        successor = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = successor
1302 | |
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag.
    """
    # Iterate over a snapshot, since removing children alters .contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
1319 | |
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    # Record the first index of every adjacent pair that must be merged,
    # rather than copying self.contents: usually few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbor to merge with.
            continue
        following = self.contents[position + 1]
        mergeable = all(
            isinstance(node, NavigableString)
            and not isinstance(node, PreformattedString)
            for node in (current, following)
        )
        if mergeable:
            merge_points.append(position)

    # Merge from the end backward so that removing items from .contents
    # does not shift the positions still waiting to be processed.
    for position in reversed(merge_points):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
1357 | |
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
1370 | |
def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute.

    :param key: The attribute name to look up in `self.attrs`.
    :param default: Returned when `key` is absent.
    """
    return self.attrs.get(key, default)
1376 | |
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value. A value that is not already a list is wrapped in one.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
1390 | |
def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?

    :param key: The attribute name to test against `self.attrs`.
    """
    return key in self.attrs
1394 | |
def __hash__(self):
    """Hash a Tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
1397 | |
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there.

    The lookup is a plain subscript on `self.attrs`.
    """
    return self.attrs[key]
1402 | |
def __iter__(self):
    """Iterating over a Tag iterates over its contents."""
    return iter(self.contents)
1406 | |
def __len__(self):
    """The length of a Tag is the length of its list of contents."""
    return len(self.contents)
1410 | |
def __contains__(self, x):
    """`x in tag` tests membership in the tag's direct contents."""
    return x in self.contents
1413 | |
def __bool__(self):
    """A tag is non-None even if it has no contents.

    Without this, truth testing would fall back to __len__ and a tag
    with no contents would be falsy.
    """
    return True
1417 | |
def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag.

    The value is stored directly in `self.attrs`.
    """
    self.attrs[key] = value
1422 | |
def __delitem__(self, key):
    """Deleting tag[key] deletes all 'key' attributes for the tag.

    Deleting an attribute that is not present is silently ignored.
    """
    self.attrs.pop(key, None)
1426 | |
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All positional and keyword arguments are forwarded unchanged.
    """
    return self.find_all(*args, **kwargs)
1432 | |
1433 def __getattr__(self, tag): | |
1434 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" | |
1435 #print("Getattr %s.%s" % (self.__class__, tag)) | |
1436 if len(tag) > 3 and tag.endswith('Tag'): | |
1437 # BS3: soup.aTag -> "soup.find("a") | |
1438 tag_name = tag[:-3] | |
1439 warnings.warn( | |
1440 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( | |
1441 name=tag_name | |
1442 ) | |
1443 ) | |
1444 return self.find(tag_name) | |
1445 # We special case contents to avoid recursion. | |
1446 elif not tag.startswith("__") and not tag == "contents": | |
1447 return self.find(tag) | |
1448 raise AttributeError( | |
1449 "'%s' object has no attribute '%s'" % (self.__class__, tag)) | |
1450 | |
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True
    # Anything that lacks the three Tag-like attributes is never equal.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Compare children pairwise; the lengths are known to match here.
    return not any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents)
    )
1467 | |
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__.

    Implemented as the negation of `self == other`.
    """
    return not self == other
1472 | |
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    # __repr__ must return the interpreter's native string type:
    # Unicode under Python 3, a bytestring (ASCII by convention) under
    # Python 2.
    return self.decode() if PY3K else self.encode(encoding)
1488 | |
def __unicode__(self):
    """Renders this PageElement as a Unicode string.

    Delegates to decode() with its default arguments.
    """
    return self.decode()
1492 | |
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# Under Python 3 all three renderings are the Unicode one.
if PY3K:
    __str__ = __repr__ = __unicode__
1506 | |
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Render to Unicode first, telling decode() which encoding the
    # result is destined for, then encode that Unicode.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
1530 | |
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: The rendered markup as a string.
    """

    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)
    attributes = formatter.attributes(self)
    # Build one rendered 'key="value"' (or bare 'key') entry per attribute.
    attrs = []
    for key, val in attributes:
        if val is None:
            # Valueless attribute: emit the name alone.
            decoded = key
        else:
            if isinstance(val, list) or isinstance(val, tuple):
                # Multi-valued attribute: join the values with spaces.
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None
            ):
                val = val.encode(eventual_encoding)

            text = formatter.attribute_value(val)
            decoded = (
                str(key) + '='
                + formatter.quoted_attribute_value(text))
        attrs.append(decoded)
    # `close` goes inside the opening tag of an empty-element tag;
    # `closeTag` is the separate closing tag of every other tag.
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        # Children are rendered one level deeper.
        indent_contents = indent_level + 1
    else:
        # Not pretty-printing this tag switches it off for the subtree.
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s
1631 | |
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: None means pretty-printing is off altogether.
    :return: True unless pretty-printing is off or this tag's name is
        listed in `preserve_whitespace_tags`.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
1645 | |
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # Passing True as the indent level switches pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1660 | |
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The concatenated rendering of every child.
    """
    # First off, turn a string formatter into a Formatter object. This
    # will stop the lookup from happening over and over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    s = []
    for c in self:
        # `text` stays None for child tags: their rendering is appended
        # directly and skips the string handling below.
        text = None
        if isinstance(c, NavigableString):
            text = c.output_ready(formatter)
        elif isinstance(c, Tag):
            s.append(c.decode(indent_level, eventual_encoding,
                              formatter))
        # Whether this tag (the parent of `c`) is one of the tags whose
        # whitespace must be kept intact.
        preserve_whitespace = (
            self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if text:
            # When pretty-printing, each string gets its own indented line.
            if pretty_print and not preserve_whitespace:
                s.append(" " * (indent_level - 1))
            s.append(text)
            if pretty_print and not preserve_whitespace:
                s.append("\n")
    return ''.join(s)
1706 | |
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this element as a bytestring.

    :param indent_level: Indentation level used while pretty-printing.
        Used internally in recursive calls.

    :param encoding: The encoding of the returned bytestring. It is
        also handed to ``decode_contents()`` as the eventual encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)
1725 | |
1726 # Old method for BS3 compatibility | |
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated BS3-era wrapper around ``encode_contents()``.

    :param encoding: Passed to ``encode_contents()`` as ``encoding``.
    :param prettyPrint: When false, ``indentLevel`` is ignored and
        None is used as the indent level instead.
    :param indentLevel: Indent level used when ``prettyPrint`` is true.
    :return: Whatever ``encode_contents()`` returns.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1734 | |
1735 #Soup methods | |
1736 | |
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first result of ``find_all()`` for these criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: Passed through to ``find_all()`` as its ``text``
        filter.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None when ``find_all()`` comes back
        empty.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The search is delegated with a hard-coded limit of 1.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find  # BS2
1761 | |
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Search below this PageElement for everything matching the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, the search runs over
        ``self.descendants``; otherwise it runs over
        ``self.children`` only.
    :param text: Passed through to ``_find_all()`` as its text filter.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: Whatever ``_find_all()`` returns (documented upstream as
        a ResultSet of PageElements).
    :rtype: bs4.element.ResultSet
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all  # BS3
findChildren = find_all  # BS2
1786 | |
1787 #Generator methods | |
@property
def children(self):
    """Iterate over the direct children of this PageElement.

    :return: A fresh iterator over ``self.contents``.
    """
    # An iterator (rather than the list itself) is returned so callers
    # treat the result purely as something to loop over.
    direct_children = self.contents
    return iter(direct_children)
1796 | |
@property
def descendants(self):
    """Iterate over every element nested inside this PageElement.

    The walk starts at the first child and follows ``next_element``
    links, stopping just before the element that comes after
    ``self._last_descendant()``.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element following the last descendant marks the end of the
    # walk; it is not itself yielded.
    boundary = self._last_descendant().next_element
    node = self.contents[0]
    while node is not boundary:
        yield node
        node = node.next_element
1811 | |
1812 # CSS selector code | |
def select_one(self, selector, namespaces=None, **kwargs):
    """Run a CSS selector and return only the first match.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. Passed through to
        ``select()`` unchanged.

    :param kwargs: Keyword arguments passed through to ``select()``.

    :return: The first element of the ``select()`` result, or None
        when that result is empty.
    :rtype: bs4.element.Tag
    """
    # Delegate with a limit of 1 so the search can stop early.
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
1833 | |
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Run a CSS selector against this element using SoupSieve.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. When None,
        ``self._namespaces`` is used instead.

    :param limit: After finding this number of results, stop looking.
        None is handed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet wrapping the SoupSieve results.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package could not be
        imported.
    """
    if namespaces is None:
        namespaces = self._namespaces

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    effective_limit = 0 if limit is None else limit
    matches = soupsieve.select(
        selector, self, namespaces, effective_limit, **kwargs)

    # Wrapping the results keeps the return type consistent and lets
    # ResultSet.__getattr__ supply its helpful error message.
    return ResultSet(None, matches)
1869 | |
1870 # Old names for backwards compatibility | |
def childGenerator(self):
    """Deprecated spelling; returns ``self.children``."""
    direct_children = self.children
    return direct_children
1874 | |
def recursiveChildGenerator(self):
    """Deprecated spelling; returns ``self.descendants``."""
    all_descendants = self.descendants
    return all_descendants
1878 | |
def has_key(self, key):
    """Deprecated method; emits a warning and defers to ``has_attr()``.

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name to look for.
    :return: Whatever ``self.has_attr(key)`` returns.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
1888 | |
1889 # Next, a couple classes to represent queries and their results. | |
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            keyword ``class_`` is an alias for the 'class' attribute.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so the caller's dictionary (or the
                # shared default) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form `_matches` can use.

        :param value: A filter value supplied by the caller.
        :return: The value unchanged if it is a string, a callable, an
            object with a ``match`` attribute, a boolean, or None; a
            decoded string for a bytestring; a list for any other
            iterable; otherwise ``str(value)``.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter rendered as a string when one is set;
            otherwise ``"<name>|<attrs>"``.
        """
        if self.text:
            # The text filter may be a regular expression, a list, a
            # callable or True. __str__ must return a real string, so
            # convert explicitly instead of returning the raw filter.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a ``get`` method or an iterable of
            (key, value) pairs.

        :return: The Tag (or the `markup_name` that was passed in) if
            it matches this SoupStrainer; None otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name filter is called with the raw name and
        # attributes only when we were not handed an actual Tag.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        # Built lazily, so that nothing is done when
                        # there are no attribute filters to check.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is of a type this method does
            not know how to match against.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Test one piece of markup against one search criterion.

        :param markup: A Tag, a string, None, or a list/tuple of
            values (for a multi-valued attribute).
        :param match_against: The criterion: True, a callable, a
            string, an object with a ``search`` method, or an iterable
            of such criteria.
        :param already_tried: Criteria already tested while walking an
            iterable criterion; used internally to avoid infinite
            recursion.
        :return: A truthy value if the markup matches, a falsy value
            otherwise. This is not always a bool: callables and regular
            expression searches return their own result objects.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            else:
                return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
2144 | |
2145 | |
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Turn any unknown attribute lookup into an explanatory error.

        :param key: The name of the attribute that was not found.
        :raise AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)