Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/bs4/element.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/bs4/element.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2129 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -try: - from collections.abc import Callable # Python 3.6 -except ImportError as e: - from collections import Callable -import re -import sys -import warnings -try: - import soupsieve -except ImportError as e: - soupsieve = None - warnings.warn( - 'The soupsieve package is not installed. CSS selectors cannot be used.' - ) - -from bs4.formatter import ( - Formatter, - HTMLFormatter, - XMLFormatter, -) - -DEFAULT_OUTPUT_ENCODING = "utf-8" -PY3K = (sys.version_info[0] > 2) - -nonwhitespace_re = re.compile(r"\S+") - -# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on -# the off chance someone imported it for their own use. -whitespace_re = re.compile(r"\s+") - -def _alias(attr): - """Alias one attribute name to another for backward compatibility""" - @property - def alias(self): - return getattr(self, attr) - - @alias.setter - def alias(self): - return setattr(self, attr) - return alias - - -class NamespacedAttribute(str): - """A namespaced string (e.g. 'xml:lang') that remembers the namespace - ('xml') and the name ('lang') that were used to create it. - """ - - def __new__(cls, prefix, name=None, namespace=None): - if not name: - # This is the default namespace. Its name "has no value" - # per https://www.w3.org/TR/xml-names/#defaulting - name = None - - if name is None: - obj = str.__new__(cls, prefix) - elif prefix is None: - # Not really namespaced. - obj = str.__new__(cls, name) - else: - obj = str.__new__(cls, prefix + ":" + name) - obj.prefix = prefix - obj.name = name - obj.namespace = namespace - return obj - -class AttributeValueWithCharsetSubstitution(str): - """A stand-in object for a character encoding specified in HTML.""" - -class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'charset' attribute. - - When Beautiful Soup parses the markup '<meta charset="utf8">', the - value of the 'charset' attribute will be one of these objects. - """ - - def __new__(cls, original_value): - obj = str.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - """When an HTML document is being encoded to a given encoding, the - value of a meta tag's 'charset' is the name of the encoding. - """ - return encoding - - -class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'content' attribute. - - When Beautiful Soup parses the markup: - <meta http-equiv="content-type" content="text/html; charset=utf8"> - - The value of the 'content' attribute will be one of these objects. - """ - - CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) - - def __new__(cls, original_value): - match = cls.CHARSET_RE.search(original_value) - if match is None: - # No substitution necessary. - return str.__new__(str, original_value) - - obj = str.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - def rewrite(match): - return match.group(1) + encoding - return self.CHARSET_RE.sub(rewrite, self.original_value) - - -class PageElement(object): - """Contains the navigational information for some part of the page: - that is, its current location in the parse tree. - - NavigableString, Tag, etc. are all subclasses of PageElement. - """ - - def setup(self, parent=None, previous_element=None, next_element=None, - previous_sibling=None, next_sibling=None): - """Sets up the initial relations between this element and - other elements. - - :param parent: The parent of this element. - - :param previous_element: The element parsed immediately before - this one. - - :param next_element: The element parsed immediately before - this one. - - :param previous_sibling: The most recently encountered element - on the same level of the parse tree as this one. - - :param previous_sibling: The next element to be encountered - on the same level of the parse tree as this one. - """ - self.parent = parent - - self.previous_element = previous_element - if previous_element is not None: - self.previous_element.next_element = self - - self.next_element = next_element - if self.next_element is not None: - self.next_element.previous_element = self - - self.next_sibling = next_sibling - if self.next_sibling is not None: - self.next_sibling.previous_sibling = self - - if (previous_sibling is None - and self.parent is not None and self.parent.contents): - previous_sibling = self.parent.contents[-1] - - self.previous_sibling = previous_sibling - if previous_sibling is not None: - self.previous_sibling.next_sibling = self - - def format_string(self, s, formatter): - """Format the given string using the given formatter. - - :param s: A string. - :param formatter: A Formatter object, or a string naming one of the standard formatters. - """ - if formatter is None: - return s - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - output = formatter.substitute(s) - return output - - def formatter_for_name(self, formatter): - """Look up or create a Formatter for the given identifier, - if necessary. - - :param formatter: Can be a Formatter object (used as-is), a - function (used as the entity substitution hook for an - XMLFormatter or HTMLFormatter), or a string (used to look - up an XMLFormatter or HTMLFormatter in the appropriate - registry. - """ - if isinstance(formatter, Formatter): - return formatter - if self._is_xml: - c = XMLFormatter - else: - c = HTMLFormatter - if isinstance(formatter, Callable): - return c(entity_substitution=formatter) - return c.REGISTRY[formatter] - - @property - def _is_xml(self): - """Is this element part of an XML tree or an HTML tree? - - This is used in formatter_for_name, when deciding whether an - XMLFormatter or HTMLFormatter is more appropriate. It can be - inefficient, but it should be called very rarely. - """ - if self.known_xml is not None: - # Most of the time we will have determined this when the - # document is parsed. - return self.known_xml - - # Otherwise, it's likely that this element was created by - # direct invocation of the constructor from within the user's - # Python code. - if self.parent is None: - # This is the top-level object. It should have .known_xml set - # from tree creation. If not, take a guess--BS is usually - # used on HTML markup. - return getattr(self, 'is_xml', False) - return self.parent._is_xml - - nextSibling = _alias("next_sibling") # BS3 - previousSibling = _alias("previous_sibling") # BS3 - - def replace_with(self, replace_with): - """Replace this PageElement with another one, keeping the rest of the - tree the same. - - :param replace_with: A PageElement. - :return: `self`, no longer part of the tree. - """ - if self.parent is None: - raise ValueError( - "Cannot replace one element with another when the " - "element to be replaced is not part of a tree.") - if replace_with is self: - return - if replace_with is self.parent: - raise ValueError("Cannot replace a Tag with its parent.") - old_parent = self.parent - my_index = self.parent.index(self) - self.extract(_self_index=my_index) - old_parent.insert(my_index, replace_with) - return self - replaceWith = replace_with # BS3 - - def unwrap(self): - """Replace this PageElement with its contents. - - :return: `self`, no longer part of the tree. - """ - my_parent = self.parent - if self.parent is None: - raise ValueError( - "Cannot replace an element with its contents when that" - "element is not part of a tree.") - my_index = self.parent.index(self) - self.extract(_self_index=my_index) - for child in reversed(self.contents[:]): - my_parent.insert(my_index, child) - return self - replace_with_children = unwrap - replaceWithChildren = unwrap # BS3 - - def wrap(self, wrap_inside): - """Wrap this PageElement inside another one. - - :param wrap_inside: A PageElement. - :return: `wrap_inside`, occupying the position in the tree that used - to be occupied by `self`, and with `self` inside it. - """ - me = self.replace_with(wrap_inside) - wrap_inside.append(me) - return wrap_inside - - def extract(self, _self_index=None): - """Destructively rips this element out of the tree. - - :param _self_index: The location of this element in its parent's - .contents, if known. Passing this in allows for a performance - optimization. - - :return: `self`, no longer part of the tree. - """ - if self.parent is not None: - if _self_index is None: - _self_index = self.parent.index(self) - del self.parent.contents[_self_index] - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - last_child = self._last_descendant() - next_element = last_child.next_element - - if (self.previous_element is not None and - self.previous_element is not next_element): - self.previous_element.next_element = next_element - if next_element is not None and next_element is not self.previous_element: - next_element.previous_element = self.previous_element - self.previous_element = None - last_child.next_element = None - - self.parent = None - if (self.previous_sibling is not None - and self.previous_sibling is not self.next_sibling): - self.previous_sibling.next_sibling = self.next_sibling - if (self.next_sibling is not None - and self.next_sibling is not self.previous_sibling): - self.next_sibling.previous_sibling = self.previous_sibling - self.previous_sibling = self.next_sibling = None - return self - - def _last_descendant(self, is_initialized=True, accept_self=True): - """Finds the last element beneath this object to be parsed. - - :param is_initialized: Has `setup` been called on this PageElement - yet? - :param accept_self: Is `self` an acceptable answer to the question? - """ - if is_initialized and self.next_sibling is not None: - last_child = self.next_sibling.previous_element - else: - last_child = self - while isinstance(last_child, Tag) and last_child.contents: - last_child = last_child.contents[-1] - if not accept_self and last_child is self: - last_child = None - return last_child - # BS3: Not part of the API! - _lastRecursiveChild = _last_descendant - - def insert(self, position, new_child): - """Insert a new PageElement in the list of this PageElement's children. - - This works the same way as `list.insert`. - - :param position: The numeric position that should be occupied - in `self.children` by the new PageElement. - :param new_child: A PageElement. - """ - if new_child is None: - raise ValueError("Cannot insert None into a tag.") - if new_child is self: - raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, str) - and not isinstance(new_child, NavigableString)): - new_child = NavigableString(new_child) - - from bs4 import BeautifulSoup - if isinstance(new_child, BeautifulSoup): - # We don't want to end up with a situation where one BeautifulSoup - # object contains another. Insert the children one at a time. - for subchild in list(new_child.contents): - self.insert(position, subchild) - position += 1 - return - position = min(position, len(self.contents)) - if hasattr(new_child, 'parent') and new_child.parent is not None: - # We're 'inserting' an element that's already one - # of this object's children. - if new_child.parent is self: - current_index = self.index(new_child) - if current_index < position: - # We're moving this element further down the list - # of this object's children. That means that when - # we extract this element, our target index will - # jump down one. - position -= 1 - new_child.extract() - - new_child.parent = self - previous_child = None - if position == 0: - new_child.previous_sibling = None - new_child.previous_element = self - else: - previous_child = self.contents[position - 1] - new_child.previous_sibling = previous_child - new_child.previous_sibling.next_sibling = new_child - new_child.previous_element = previous_child._last_descendant(False) - if new_child.previous_element is not None: - new_child.previous_element.next_element = new_child - - new_childs_last_element = new_child._last_descendant(False) - - if position >= len(self.contents): - new_child.next_sibling = None - - parent = self - parents_next_sibling = None - while parents_next_sibling is None and parent is not None: - parents_next_sibling = parent.next_sibling - parent = parent.parent - if parents_next_sibling is not None: - # We found the element that comes next in the document. - break - if parents_next_sibling is not None: - new_childs_last_element.next_element = parents_next_sibling - else: - # The last element of this tag is the last element in - # the document. - new_childs_last_element.next_element = None - else: - next_child = self.contents[position] - new_child.next_sibling = next_child - if new_child.next_sibling is not None: - new_child.next_sibling.previous_sibling = new_child - new_childs_last_element.next_element = next_child - - if new_childs_last_element.next_element is not None: - new_childs_last_element.next_element.previous_element = new_childs_last_element - self.contents.insert(position, new_child) - - def append(self, tag): - """Appends the given PageElement to the contents of this one. - - :param tag: A PageElement. - """ - self.insert(len(self.contents), tag) - - def extend(self, tags): - """Appends the given PageElements to this one's contents. - - :param tags: A list of PageElements. - """ - for tag in tags: - self.append(tag) - - def insert_before(self, *args): - """Makes the given element(s) the immediate predecessor of this one. - - All the elements will have the same parent, and the given elements - will be immediately before this one. - - :param args: One or more PageElements. - """ - parent = self.parent - if parent is None: - raise ValueError( - "Element has no parent, so 'before' has no meaning.") - if any(x is self for x in args): - raise ValueError("Can't insert an element before itself.") - for predecessor in args: - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, *args): - """Makes the given element(s) the immediate successor of this one. - - The elements will have the same parent, and the given elements - will be immediately after this one. - - :param args: One or more PageElements. - """ - # Do all error checking before modifying the tree. - parent = self.parent - if parent is None: - raise ValueError( - "Element has no parent, so 'after' has no meaning.") - if any(x is self for x in args): - raise ValueError("Can't insert an element after itself.") - - offset = 0 - for successor in args: - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1+offset, successor) - offset += 1 - - def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Find the first PageElement that matches the given criteria and - appears later in the document than this PageElement. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_all_next, name, attrs, text, **kwargs) - findNext = find_next # BS3 - - def find_all_next(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Find all PageElements that match the given criteria and appear - later in the document than this PageElement. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet containing PageElements. - """ - return self._find_all(name, attrs, text, limit, self.next_elements, - **kwargs) - findAllNext = find_all_next # BS3 - - def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Find the closest sibling to this PageElement that matches the - given criteria and appears later in the document. - - All find_* methods take a common set of arguments. See the - online documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_next_siblings, name, attrs, text, - **kwargs) - findNextSibling = find_next_sibling # BS3 - - def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Find all siblings of this PageElement that match the given criteria - and appear later in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, - self.next_siblings, **kwargs) - findNextSiblings = find_next_siblings # BS3 - fetchNextSiblings = find_next_siblings # BS2 - - def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Look backwards in the document from this PageElement and find the - first PageElement that matches the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one( - self.find_all_previous, name, attrs, text, **kwargs) - findPrevious = find_previous # BS3 - - def find_all_previous(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Look backwards in the document from this PageElement and find all - PageElements that match the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, self.previous_elements, - **kwargs) - findAllPrevious = find_all_previous # BS3 - fetchPrevious = find_all_previous # BS2 - - def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this PageElement that matches the - given criteria and appears earlier in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_previous_siblings, name, attrs, text, - **kwargs) - findPreviousSibling = find_previous_sibling # BS3 - - def find_previous_siblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns all siblings to this PageElement that match the - given criteria and appear earlier in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, - self.previous_siblings, **kwargs) - findPreviousSiblings = find_previous_siblings # BS3 - fetchPreviousSiblings = find_previous_siblings # BS2 - - def find_parent(self, name=None, attrs={}, **kwargs): - """Find the closest parent of this PageElement that matches the given - criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :kwargs: A dictionary of filters on attribute values. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - # NOTE: We can't use _find_one because findParents takes a different - # set of arguments. - r = None - l = self.find_parents(name, attrs, 1, **kwargs) - if l: - r = l[0] - return r - findParent = find_parent # BS3 - - def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Find all parents of this PageElement that match the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) - findParents = find_parents # BS3 - fetchParents = find_parents # BS2 - - @property - def next(self): - """The PageElement, if any, that was parsed just after this one. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self.next_element - - @property - def previous(self): - """The PageElement, if any, that was parsed just before this one. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self.previous_element - - #These methods do the real heavy lifting. - - def _find_one(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _find_all(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if text is None and 'string' in kwargs: - text = kwargs['string'] - del kwargs['string'] - - if isinstance(name, SoupStrainer): - strainer = name - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - - if text is None and not limit and not attrs and not kwargs: - if name is True or name is None: - # Optimization to find all tags. - result = (element for element in generator - if isinstance(element, Tag)) - return ResultSet(strainer, result) - elif isinstance(name, str): - # Optimization to find all tags with a given name. - if name.count(':') == 1: - # This is a name with a prefix. If this is a namespace-aware document, - # we need to match the local name against tag.name. If not, - # we need to match the fully-qualified name against tag.name. - prefix, local_name = name.split(':', 1) - else: - prefix = None - local_name = name - result = (element for element in generator - if isinstance(element, Tag) - and ( - element.name == name - ) or ( - element.name == local_name - and (prefix is None or element.prefix == prefix) - ) - ) - return ResultSet(strainer, result) - results = ResultSet(strainer) - while True: - try: - i = next(generator) - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These generators can be used to navigate starting from both - #NavigableStrings and Tags. - @property - def next_elements(self): - """All PageElements that were parsed after this one. - - :yield: A sequence of PageElements. - """ - i = self.next_element - while i is not None: - yield i - i = i.next_element - - @property - def next_siblings(self): - """All PageElements that are siblings of this one but were parsed - later. - - :yield: A sequence of PageElements. - """ - i = self.next_sibling - while i is not None: - yield i - i = i.next_sibling - - @property - def previous_elements(self): - """All PageElements that were parsed before this one. - - :yield: A sequence of PageElements. - """ - i = self.previous_element - while i is not None: - yield i - i = i.previous_element - - @property - def previous_siblings(self): - """All PageElements that are siblings of this one but were parsed - earlier. - - :yield: A sequence of PageElements. - """ - i = self.previous_sibling - while i is not None: - yield i - i = i.previous_sibling - - @property - def parents(self): - """All PageElements that are parents of this PageElement. - - :yield: A sequence of PageElements. - """ - i = self.parent - while i is not None: - yield i - i = i.parent - - @property - def decomposed(self): - """Check whether a PageElement has been decomposed. - - :rtype: bool - """ - return getattr(self, '_decomposed', False) or False - - # Old non-property versions of the generators, for backwards - # compatibility with BS3. - def nextGenerator(self): - return self.next_elements - - def nextSiblingGenerator(self): - return self.next_siblings - - def previousGenerator(self): - return self.previous_elements - - def previousSiblingGenerator(self): - return self.previous_siblings - - def parentGenerator(self): - return self.parents - - -class NavigableString(str, PageElement): - """A Python Unicode string that is part of a parse tree. - - When Beautiful Soup parses the markup <b>penguin</b>, it will - create a NavigableString for the string "penguin". - """ - - PREFIX = '' - SUFFIX = '' - - # We can't tell just by looking at a string whether it's contained - # in an XML document or an HTML document. - - known_xml = None - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, str): - u = str.__new__(cls, value) - else: - u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - u.setup() - return u - - def __copy__(self): - """A copy of a NavigableString has the same contents and class - as the original, but it is not connected to the parse tree. - """ - return type(self)(self) - - def __getnewargs__(self): - return (str(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError( - "'%s' object has no attribute '%s'" % ( - self.__class__.__name__, attr)) - - def output_ready(self, formatter="minimal"): - """Run the string through the provided formatter. - - :param formatter: A Formatter object, or a string naming one of the standard formatters. - """ - output = self.format_string(self, formatter) - return self.PREFIX + output + self.SUFFIX - - @property - def name(self): - """Since a NavigableString is not a Tag, it has no .name. - - This property is implemented so that code like this doesn't crash - when run on a mixture of Tag and NavigableString objects: - [x.name for x in tag.children] - """ - return None - - @name.setter - def name(self, name): - """Prevent NavigableString.name from ever being set.""" - raise AttributeError("A NavigableString cannot be given a name.") - - -class PreformattedString(NavigableString): - """A NavigableString not subject to the normal formatting rules. - - This is an abstract class used for special kinds of strings such - as comments (the Comment class) and CDATA blocks (the CData - class). - """ - - PREFIX = '' - SUFFIX = '' - - def output_ready(self, formatter=None): - """Make this string ready for output by adding any subclass-specific - prefix or suffix. - - :param formatter: A Formatter object, or a string naming one - of the standard formatters. The string will be passed into the - Formatter, but only to trigger any side effects: the return - value is ignored. - - :return: The string, with any subclass-specific prefix and - suffix added on. - """ - if formatter is not None: - ignore = self.format_string(self, formatter) - return self.PREFIX + self + self.SUFFIX - -class CData(PreformattedString): - """A CDATA block.""" - PREFIX = '<![CDATA[' - SUFFIX = ']]>' - -class ProcessingInstruction(PreformattedString): - """A SGML processing instruction.""" - - PREFIX = '<?' - SUFFIX = '>' - -class XMLProcessingInstruction(ProcessingInstruction): - """An XML processing instruction.""" - PREFIX = '<?' - SUFFIX = '?>' - -class Comment(PreformattedString): - """An HTML or XML comment.""" - PREFIX = '<!--' - SUFFIX = '-->' - - -class Declaration(PreformattedString): - """An XML declaration.""" - PREFIX = '<?' - SUFFIX = '?>' - - -class Doctype(PreformattedString): - """A document type declaration.""" - @classmethod - def for_name_and_ids(cls, name, pub_id, system_id): - """Generate an appropriate document type declaration for a given - public ID and system ID. - - :param name: The name of the document's root element, e.g. 'html'. - :param pub_id: The Formal Public Identifier for this document type, - e.g. '-//W3C//DTD XHTML 1.1//EN' - :param system_id: The system identifier for this document type, - e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' - - :return: A Doctype. - """ - value = name or '' - if pub_id is not None: - value += ' PUBLIC "%s"' % pub_id - if system_id is not None: - value += ' "%s"' % system_id - elif system_id is not None: - value += ' SYSTEM "%s"' % system_id - - return Doctype(value) - - PREFIX = '<!DOCTYPE ' - SUFFIX = '>\n' - - -class Stylesheet(NavigableString): - """A NavigableString representing an stylesheet (probably - CSS). - - Used to distinguish embedded stylesheets from textual content. - """ - pass - - -class Script(NavigableString): - """A NavigableString representing an executable script (probably - Javascript). - - Used to distinguish executable code from textual content. - """ - pass - - -class TemplateString(NavigableString): - """A NavigableString representing a string found inside an HTML - template embedded in a larger document. - - Used to distinguish such strings from the main body of the document. - """ - pass - - -class Tag(PageElement): - """Represents an HTML or XML tag that is part of a parse tree, along - with its attributes and contents. - - When Beautiful Soup parses the markup <b>penguin</b>, it will - create a Tag object representing the <b> tag. - """ - - def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None, - is_xml=None, sourceline=None, sourcepos=None, - can_be_empty_element=None, cdata_list_attributes=None, - preserve_whitespace_tags=None - ): - """Basic constructor. - - :param parser: A BeautifulSoup object. - :param builder: A TreeBuilder. - :param name: The name of the tag. - :param namespace: The URI of this Tag's XML namespace, if any. - :param prefix: The prefix for this Tag's XML namespace, if any. - :param attrs: A dictionary of this Tag's attribute values. - :param parent: The PageElement to use as this Tag's parent. - :param previous: The PageElement that was parsed immediately before - this tag. - :param is_xml: If True, this is an XML tag. Otherwise, this is an - HTML tag. - :param sourceline: The line number where this tag was found in its - source document. - :param sourcepos: The character position within `sourceline` where this - tag was found. - :param can_be_empty_element: If True, this tag should be - represented as <tag/>. If False, this tag should be represented - as <tag></tag>. - :param cdata_list_attributes: A list of attributes whose values should - be treated as CDATA if they ever show up on this tag. - :param preserve_whitespace_tags: A list of tag names whose contents - should have their whitespace preserved. - """ - if parser is None: - self.parser_class = None - else: - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected. - self.parser_class = parser.__class__ - if name is None: - raise ValueError("No value provided for new tag's name.") - self.name = name - self.namespace = namespace - self.prefix = prefix - if ((not builder or builder.store_line_numbers) - and (sourceline is not None or sourcepos is not None)): - self.sourceline = sourceline - self.sourcepos = sourcepos - if attrs is None: - attrs = {} - elif attrs: - if builder is not None and builder.cdata_list_attributes: - attrs = builder._replace_cdata_list_attribute_values( - self.name, attrs) - else: - attrs = dict(attrs) - else: - attrs = dict(attrs) - - # If possible, determine ahead of time whether this tag is an - # XML tag. - if builder: - self.known_xml = builder.is_xml - else: - self.known_xml = is_xml - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - - if builder is None: - # In the absence of a TreeBuilder, use whatever values were - # passed in here. They're probably None, unless this is a copy of some - # other tag. - self.can_be_empty_element = can_be_empty_element - self.cdata_list_attributes = cdata_list_attributes - self.preserve_whitespace_tags = preserve_whitespace_tags - else: - # Set up any substitutions for this tag, such as the charset in a META tag. - builder.set_up_substitutions(self) - - # Ask the TreeBuilder whether this tag might be an empty-element tag. - self.can_be_empty_element = builder.can_be_empty_element(name) - - # Keep track of the list of attributes of this tag that - # might need to be treated as a list. - # - # For performance reasons, we store the whole data structure - # rather than asking the question of every tag. Asking would - # require building a new data structure every time, and - # (unlike can_be_empty_element), we almost never need - # to check this. - self.cdata_list_attributes = builder.cdata_list_attributes - - # Keep track of the names that might cause this tag to be treated as a - # whitespace-preserved tag. - self.preserve_whitespace_tags = builder.preserve_whitespace_tags - - parserClass = _alias("parser_class") # BS3 - - def __copy__(self): - """A copy of a Tag is a new Tag, unconnected to the parse tree. - Its contents are a copy of the old Tag's contents. - """ - clone = type(self)( - None, self.builder, self.name, self.namespace, - self.prefix, self.attrs, is_xml=self._is_xml, - sourceline=self.sourceline, sourcepos=self.sourcepos, - can_be_empty_element=self.can_be_empty_element, - cdata_list_attributes=self.cdata_list_attributes, - preserve_whitespace_tags=self.preserve_whitespace_tags - ) - for attr in ('can_be_empty_element', 'hidden'): - setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) - return clone - - @property - def is_empty_element(self): - """Is this tag an empty-element tag? (aka a self-closing tag) - - A tag that has contents is never an empty-element tag. - - A tag that has no contents may or may not be an empty-element - tag. It depends on the builder used to create the tag. If the - builder has a designated list of empty-element tags, then only - a tag whose name shows up in that list is considered an - empty-element tag. - - If the builder has no designated list of empty-element tags, - then any tag with no contents is an empty-element tag. - """ - return len(self.contents) == 0 and self.can_be_empty_element - isSelfClosing = is_empty_element # BS3 - - @property - def string(self): - """Convenience property to get the single string within this - PageElement. - - TODO It might make sense to have NavigableString.string return - itself. - - :return: If this element has a single string child, return - value is that string. If this element has one child tag, - return value is the 'string' attribute of the child tag, - recursively. If this element is itself a string, has no - children, or has more than one child, return value is None. - """ - if len(self.contents) != 1: - return None - child = self.contents[0] - if isinstance(child, NavigableString): - return child - return child.string - - @string.setter - def string(self, string): - """Replace this PageElement's contents with `string`.""" - self.clear() - self.append(string.__class__(string)) - - def _all_strings(self, strip=False, types=(NavigableString, CData)): - """Yield all strings of certain classes, possibly stripping them. - - :param strip: If True, all strings will be stripped before being - yielded. - - :types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - etc. - - :yield: A sequence of strings. - """ - for descendant in self.descendants: - if ( - (types is None and not isinstance(descendant, NavigableString)) - or - (types is not None and type(descendant) not in types)): - continue - if strip: - descendant = descendant.strip() - if len(descendant) == 0: - continue - yield descendant - - strings = property(_all_strings) - - @property - def stripped_strings(self): - """Yield all strings in the document, stripping them first. - - :yield: A sequence of stripped strings. - """ - for string in self._all_strings(True): - yield string - - def get_text(self, separator="", strip=False, - types=(NavigableString, CData)): - """Get all child strings, concatenated using the given separator. - - :param separator: Strings will be concatenated using this separator. - - :param strip: If True, strings will be stripped before being - concatenated. - - :types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - stylesheets, etc. - - :return: A string. - """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) - - def decompose(self): - """Recursively destroys this PageElement and its children. - - This element will be removed from the tree and wiped out; so - will everything beneath it. - - The behavior of a decomposed PageElement is undefined and you - should never use one for anything, but if you need to _check_ - whether an element has been decomposed, you can use the - `decomposed` property. - """ - self.extract() - i = self - while i is not None: - n = i.next_element - i.__dict__.clear() - i.contents = [] - i._decomposed = True - i = n - - def clear(self, decompose=False): - """Wipe out all children of this PageElement by calling extract() - on them. - - :param decompose: If this is True, decompose() (a more - destructive method) will be called instead of extract(). - """ - if decompose: - for element in self.contents[:]: - if isinstance(element, Tag): - element.decompose() - else: - element.extract() - else: - for element in self.contents[:]: - element.extract() - - def smooth(self): - """Smooth out this element's children by consolidating consecutive - strings. - - This makes pretty-printed output look more natural following a - lot of operations that modified the tree. - """ - # Mark the first position of every pair of children that need - # to be consolidated. Do this rather than making a copy of - # self.contents, since in most cases very few strings will be - # affected. - marked = [] - for i, a in enumerate(self.contents): - if isinstance(a, Tag): - # Recursively smooth children. - a.smooth() - if i == len(self.contents)-1: - # This is the last item in .contents, and it's not a - # tag. There's no chance it needs any work. - continue - b = self.contents[i+1] - if (isinstance(a, NavigableString) - and isinstance(b, NavigableString) - and not isinstance(a, PreformattedString) - and not isinstance(b, PreformattedString) - ): - marked.append(i) - - # Go over the marked positions in reverse order, so that - # removing items from .contents won't affect the remaining - # positions. - for i in reversed(marked): - a = self.contents[i] - b = self.contents[i+1] - b.extract() - n = NavigableString(a+b) - a.replace_with(n) - - def index(self, element): - """Find the index of a child by identity, not value. - - Avoids issues with tag.contents.index(element) getting the - index of equal elements. - - :param element: Look for this PageElement in `self.contents`. - """ - for i, child in enumerate(self.contents): - if child is element: - return i - raise ValueError("Tag.index: element not in tag") - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self.attrs.get(key, default) - - def get_attribute_list(self, key, default=None): - """The same as get(), but always returns a list. - - :param key: The attribute to look for. - :param default: Use this value if the attribute is not present - on this PageElement. - :return: A list of values, probably containing only a single - value. - """ - value = self.get(key, default) - if not isinstance(value, list): - value = [value] - return value - - def has_attr(self, key): - """Does this PageElement have an attribute with the given name?""" - return key in self.attrs - - def __hash__(self): - return str(self).__hash__() - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the Tag, - and throws an exception if it's not there.""" - return self.attrs[key] - - def __iter__(self): - "Iterating over a Tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a Tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __bool__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self.attrs[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - self.attrs.pop(key, None) - - def __call__(self, *args, **kwargs): - """Calling a Tag like a function is the same as calling its - find_all() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return self.find_all(*args, **kwargs) - - def __getattr__(self, tag): - """Calling tag.subtag is the same as calling tag.find(name="subtag")""" - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.endswith('Tag'): - # BS3: soup.aTag -> "soup.find("a") - tag_name = tag[:-3] - warnings.warn( - '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( - name=tag_name - ) - ) - return self.find(tag_name) - # We special case contents to avoid recursion. - elif not tag.startswith("__") and not tag == "contents": - return self.find(tag) - raise AttributeError( - "'%s' object has no attribute '%s'" % (self.__class__, tag)) - - def __eq__(self, other): - """Returns true iff this Tag has the same name, the same attributes, - and the same contents (recursively) as `other`.""" - if self is other: - return True - if (not hasattr(other, 'name') or - not hasattr(other, 'attrs') or - not hasattr(other, 'contents') or - self.name != other.name or - self.attrs != other.attrs or - len(self) != len(other)): - return False - for i, my_child in enumerate(self.contents): - if my_child != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this Tag is not identical to `other`, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding="unicode-escape"): - """Renders this PageElement as a string. - - :param encoding: The encoding to use (Python 2 only). - :return: Under Python 2, a bytestring; under Python 3, - a Unicode string. - """ - if PY3K: - # "The return value must be a string object", i.e. Unicode - return self.decode() - else: - # "The return value must be a string object", i.e. a bytestring. - # By convention, the return value of __repr__ should also be - # an ASCII string. - return self.encode(encoding) - - def __unicode__(self): - """Renders this PageElement as a Unicode string.""" - return self.decode() - - def __str__(self): - """Renders this PageElement as a generic string. - - :return: Under Python 2, a UTF-8 bytestring; under Python 3, - a Unicode string. - """ - if PY3K: - return self.decode() - else: - return self.encode() - - if PY3K: - __str__ = __repr__ = __unicode__ - - def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, formatter="minimal", - errors="xmlcharrefreplace"): - """Render a bytestring representation of this PageElement and its - contents. - - :param encoding: The destination encoding. - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - :param errors: An error handling strategy such as - 'xmlcharrefreplace'. This value is passed along into - encode() and its value should be one of the constants - defined by Python. - :return: A bytestring. - - """ - # Turn the data structure into Unicode, then encode the - # Unicode. - u = self.decode(indent_level, encoding, formatter) - return u.encode(encoding, errors) - - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - """ - - # First off, turn a non-Formatter `formatter` into a Formatter - # object. This will stop the lookup from happening over and - # over again. - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key - else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) - - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' - - prefix = '' - if self.prefix: - prefix = self.prefix + ":" - - if self.is_empty_element: - close = formatter.void_element_close_prefix or '' - else: - closeTag = '</%s%s>' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (' ' * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter - ) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' - if attrs: - attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s - - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed? - - Most of them should, but some (such as <pre> in HTML - documents) should not. - """ - return ( - indent_level is not None - and ( - not self.preserve_whitespace_tags - or self.name not in self.preserve_whitespace_tags - ) - ) - - def prettify(self, encoding=None, formatter="minimal"): - """Pretty-print this PageElement as a string. - - :param encoding: The eventual encoding of the string. If this is None, - a Unicode string will be returned. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - :return: A Unicode string (if encoding==None) or a bytestring - (otherwise). - """ - if encoding is None: - return self.decode(True, formatter=formatter) - else: - return self.encode(encoding, True, formatter=formatter) - - def decode_contents(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Renders the contents of this tag as a Unicode string. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - - :param eventual_encoding: The tag is destined to be - encoded into this encoding. decode_contents() is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - - :param formatter: A Formatter object, or a string naming one of - the standard Formatters. - """ - # First off, turn a string formatter into a Formatter object. This - # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - - pretty_print = (indent_level is not None) - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(formatter) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - formatter)) - preserve_whitespace = ( - self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags - ) - if text and indent_level and not preserve_whitespace: - text = text.strip() - if text: - if pretty_print and not preserve_whitespace: - s.append(" " * (indent_level - 1)) - s.append(text) - if pretty_print and not preserve_whitespace: - s.append("\n") - return ''.join(s) - - def encode_contents( - self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Renders the contents of this PageElement as a bytestring. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - - :param eventual_encoding: The bytestring will be in this encoding. - - :param formatter: A Formatter object, or a string naming one of - the standard Formatters. - - :return: A bytestring. - """ - contents = self.decode_contents(indent_level, encoding, formatter) - return contents.encode(encoding) - - # Old method for BS3 compatibility - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Deprecated method for BS3 compatibility.""" - if not prettyPrint: - indentLevel = None - return self.encode_contents( - indent_level=indentLevel, encoding=encoding) - - #Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Look in the children of this PageElement and find the first - PageElement that matches the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param recursive: If this is True, find() will perform a - recursive search of this PageElement's children. Otherwise, - only the direct children will be considered. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - r = None - l = self.find_all(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def find_all(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Look in the children of this PageElement and find all - PageElements that match the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param recursive: If this is True, find_all() will perform a - recursive search of this PageElement's children. Otherwise, - only the direct children will be considered. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - generator = self.descendants - if not recursive: - generator = self.children - return self._find_all(name, attrs, text, limit, generator, **kwargs) - findAll = find_all # BS3 - findChildren = find_all # BS2 - - #Generator methods - @property - def children(self): - """Iterate over all direct children of this PageElement. - - :yield: A sequence of PageElements. - """ - # return iter() to make the purpose of the method clear - return iter(self.contents) # XXX This seems to be untested. - - @property - def descendants(self): - """Iterate over all children of this PageElement in a - breadth-first sequence. - - :yield: A sequence of PageElements. - """ - if not len(self.contents): - return - stopNode = self._last_descendant().next_element - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next_element - - # CSS selector code - def select_one(self, selector, namespaces=None, **kwargs): - """Perform a CSS selection operation on the current element. - - :param selector: A CSS selector. - - :param namespaces: A dictionary mapping namespace prefixes - used in the CSS selector to namespace URIs. By default, - Beautiful Soup will use the prefixes it encountered while - parsing the document. - - :param kwargs: Keyword arguments to be passed into SoupSieve's - soupsieve.select() method. - - :return: A Tag. - :rtype: bs4.element.Tag - """ - value = self.select(selector, namespaces, 1, **kwargs) - if value: - return value[0] - return None - - def select(self, selector, namespaces=None, limit=None, **kwargs): - """Perform a CSS selection operation on the current element. - - This uses the SoupSieve library. - - :param selector: A string containing a CSS selector. - - :param namespaces: A dictionary mapping namespace prefixes - used in the CSS selector to namespace URIs. By default, - Beautiful Soup will use the prefixes it encountered while - parsing the document. - - :param limit: After finding this number of results, stop looking. - - :param kwargs: Keyword arguments to be passed into SoupSieve's - soupsieve.select() method. - - :return: A ResultSet of Tags. - :rtype: bs4.element.ResultSet - """ - if namespaces is None: - namespaces = self._namespaces - - if limit is None: - limit = 0 - if soupsieve is None: - raise NotImplementedError( - "Cannot execute CSS selectors because the soupsieve package is not installed." - ) - - results = soupsieve.select(selector, self, namespaces, limit, **kwargs) - - # We do this because it's more consistent and because - # ResultSet.__getattr__ has a helpful error message. - return ResultSet(None, results) - - # Old names for backwards compatibility - def childGenerator(self): - """Deprecated generator.""" - return self.children - - def recursiveChildGenerator(self): - """Deprecated generator.""" - return self.descendants - - def has_key(self, key): - """Deprecated method. This was kind of misleading because has_key() - (attributes) was different from __in__ (contents). - - has_key() is gone in Python 3, anyway. - """ - warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( - key)) - return self.has_attr(key) - -# Next, a couple classes to represent queries and their results. -class SoupStrainer(object): - """Encapsulates a number of ways of matching a markup element (tag or - string). - - This is primarily used to underpin the find_* methods, but you can - create one yourself and pass it in as `parse_only` to the - `BeautifulSoup` constructor, to parse a subset of a large - document. - """ - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - """Constructor. - - The SoupStrainer constructor takes the same arguments passed - into the find_* methods. See the online documentation for - detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - """ - self.name = self._normalize_search_value(name) - if not isinstance(attrs, dict): - # Treat a non-dict value for attrs as a search for the 'class' - # attribute. - kwargs['class'] = attrs - attrs = None - - if 'class_' in kwargs: - # Treat class_="foo" as a search for the 'class' - # attribute, overriding any non-dict value for attrs. - kwargs['class'] = kwargs['class_'] - del kwargs['class_'] - - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - normalized_attrs = {} - for key, value in list(attrs.items()): - normalized_attrs[key] = self._normalize_search_value(value) - - self.attrs = normalized_attrs - self.text = self._normalize_search_value(text) - - def _normalize_search_value(self, value): - # Leave it alone if it's a Unicode string, a callable, a - # regular expression, a boolean, or None. - if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') - or isinstance(value, bool) or value is None): - return value - - # If it's a bytestring, convert it to Unicode, treating it as UTF-8. - if isinstance(value, bytes): - return value.decode("utf8") - - # If it's listlike, convert it into a list of strings. - if hasattr(value, '__iter__'): - new_value = [] - for v in value: - if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, str)): - # This is almost certainly the user's mistake. In the - # interests of avoiding infinite loops, we'll let - # it through as-is rather than doing a recursive call. - new_value.append(v) - else: - new_value.append(self._normalize_search_value(v)) - return new_value - - # Otherwise, convert it into a Unicode string. - # The unicode(str()) thing is so this will do the same thing on Python 2 - # and Python 3. - return str(str(value)) - - def __str__(self): - """A human-readable representation of this SoupStrainer.""" - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def search_tag(self, markup_name=None, markup_attrs={}): - """Check whether a Tag with the given name and attributes would - match this SoupStrainer. - - Used prospectively to decide whether to even bother creating a Tag - object. - - :param markup_name: A tag name as found in some markup. - :param markup_attrs: A dictionary of attributes as found in some markup. - - :return: True if the prospective tag would match this SoupStrainer; - False otherwise. - """ - found = None - markup = None - if isinstance(markup_name, Tag): - markup = markup_name - markup_attrs = markup - call_function_with_tag_data = ( - isinstance(self.name, Callable) - and not isinstance(markup_name, Tag)) - - if ((not self.name) - or call_function_with_tag_data - or (markup and self._matches(markup, self.name)) - or (not markup and self._matches(markup_name, self.name))): - if call_function_with_tag_data: - match = self.name(markup_name, markup_attrs) - else: - match = True - markup_attr_map = None - for attr, match_against in list(self.attrs.items()): - if not markup_attr_map: - if hasattr(markup_attrs, 'get'): - markup_attr_map = markup_attrs - else: - markup_attr_map = {} - for k, v in markup_attrs: - markup_attr_map[k] = v - attr_value = markup_attr_map.get(attr) - if not self._matches(attr_value, match_against): - match = False - break - if match: - if markup: - found = markup - else: - found = markup_name - if found and self.text and not self._matches(found.string, self.text): - found = None - return found - - # For BS3 compatibility. - searchTag = search_tag - - def search(self, markup): - """Find all items in `markup` that match this SoupStrainer. - - Used by the core _find_all() method, which is ultimately - called by all find_* methods. - - :param markup: A PageElement or a list of them. - """ - # print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text or self.name or self.attrs: - found = self.search_tag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, str): - if not self.name and not self.attrs and self._matches(markup, self.text): - found = markup - else: - raise Exception( - "I don't know how to match against a %s" % markup.__class__) - return found - - def _matches(self, markup, match_against, already_tried=None): - # print u"Matching %s against %s" % (markup, match_against) - result = False - if isinstance(markup, list) or isinstance(markup, tuple): - # This should only happen when searching a multi-valued attribute - # like 'class'. - for item in markup: - if self._matches(item, match_against): - return True - # We didn't match any particular value of the multivalue - # attribute, but maybe we match the attribute value when - # considered as a string. - if self._matches(' '.join(markup), match_against): - return True - return False - - if match_against is True: - # True matches any non-None value. - return markup is not None - - if isinstance(match_against, Callable): - return match_against(markup) - - # Custom callables take the tag as an argument, but all - # other ways of matching match the tag name as a string. - original_markup = markup - if isinstance(markup, Tag): - markup = markup.name - - # Ensure that `markup` is either a Unicode string, or None. - markup = self._normalize_search_value(markup) - - if markup is None: - # None matches None, False, an empty string, an empty list, and so on. - return not match_against - - if (hasattr(match_against, '__iter__') - and not isinstance(match_against, str)): - # We're asked to match against an iterable of items. - # The markup must be match at least one item in the - # iterable. We'll try each one in turn. - # - # To avoid infinite recursion we need to keep track of - # items we've already seen. - if not already_tried: - already_tried = set() - for item in match_against: - if item.__hash__: - key = item - else: - key = id(item) - if key in already_tried: - continue - else: - already_tried.add(key) - if self._matches(original_markup, item, already_tried): - return True - else: - return False - - # Beyond this point we might need to run the test twice: once against - # the tag's name and once against its prefixed name. - match = False - - if not match and isinstance(match_against, str): - # Exact string match - match = markup == match_against - - if not match and hasattr(match_against, 'search'): - # Regexp match - return match_against.search(markup) - - if (not match - and isinstance(original_markup, Tag) - and original_markup.prefix): - # Try the whole thing again with the prefixed tag name. - return self._matches( - original_markup.prefix + ':' + original_markup.name, match_against - ) - - return match - - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source, result=()): - """Constructor. - - :param source: A SoupStrainer. - :param result: A list of PageElements. - """ - super(ResultSet, self).__init__(result) - self.source = source - - def __getattr__(self, key): - """Raise a helpful exception to explain a common code fix.""" - raise AttributeError( - "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key - )