guppy_basecaller: env/lib/python3.7/site-packages/bs4/element.py comparison

comparison env/lib/python3.7/site-packages/bs4/element.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"

author	shellac
date	Mon, 01 Jun 2020 08:59:25 -0400
parents	79f47841a781
children

comparison

equal deleted inserted replaced

-:79f47841a781
+:9b1c78e6ba9c
-# Use of this source code is governed by the MIT license.
-__license__ = "MIT"
-try:
-from collections.abc import Callable # Python 3.6
-except ImportError as e:
-from collections import Callable
-import re
-import sys
-import warnings
-try:
-import soupsieve
-except ImportError as e:
-soupsieve = None
-warnings.warn(
-'The soupsieve package is not installed. CSS selectors cannot be used.'
-)
-from bs4.formatter import (
-Formatter,
-HTMLFormatter,
-XMLFormatter,
-)
# Encoding handed to str.__new__ by NavigableString.__new__ when it is given
# a value that is not already a str.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)
# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")
# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
-def _alias(attr):
-"""Alias one attribute name to another for backward compatibility"""
-@property
-def alias(self):
-return getattr(self, attr)
-@alias.setter
-def alias(self):
-return setattr(self, attr)
-return alias
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces used to
    build it: the prefix ('xml'), the name ('lang') and the namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach prefix/name/namespace to it.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: The namespace, stored unchanged.
        :return: A NamespacedAttribute whose text is `prefix` alone,
            `name` alone, or `prefix:name`.
        """
        # A falsy name means the default namespace, whose name "has no
        # value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name if name else None

        if local_name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to str itself. The subclasses in this
    module (CharsetMetaAttributeValue and ContentMetaAttributeValue)
    override encode() so that the name of the target encoding is
    substituted into the value.
    """
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        """Create the value and remember the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the name of the encoding the document is being
        encoded to; that name is the new value of 'charset'.

        :param encoding: The name of the target encoding.
        :return: `encoding`, unchanged.
        """
        return encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup

     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content' attribute.
    """

    # Group 1 captures everything up to and including "charset=";
    # group 3 captures the encoding name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain str when no charset is present.

        :param original_value: The attribute value found in the markup.
        :return: A plain str if `original_value` contains no charset
            declaration; otherwise a ContentMetaAttributeValue that
            remembers `original_value`.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset name replaced.

        :param encoding: The name of the target encoding.
        :return: `original_value` with each charset name swapped for
            `encoding`.
        """
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
class PageElement(object):
    """Contains the navigational information for some part of the page:
    that is, its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """

    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements.

        :param parent: The parent of this element.
        :param previous_element: The element parsed immediately before
            this one.
        :param next_element: The element parsed immediately after
            this one.
        :param previous_sibling: The most recently encountered element
            on the same level of the parse tree as this one. When omitted
            and the parent already has contents, the parent's last child
            is used.
        :param next_sibling: The next element to be encountered
            on the same level of the parse tree as this one.
        """
        self.parent = parent

        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self

        self.next_element = next_element
        if self.next_element is not None:
            self.next_element.previous_element = self

        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self

        if (previous_sibling is None
                and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]

        self.previous_sibling = previous_sibling
        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self

    def format_string(self, s, formatter):
        """Format the given string using the given formatter.

        :param s: A string.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters. If None, `s` is returned unchanged.
        :return: The formatted string.
        """
        if formatter is None:
            return s
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)
        output = formatter.substitute(s)
        return output

    def formatter_for_name(self, formatter):
        """Look up or create a Formatter for the given identifier,
        if necessary.

        :param formatter: Can be a Formatter object (used as-is), a
            function (used as the entity substitution hook for an
            XMLFormatter or HTMLFormatter), or a string (used to look
            up an XMLFormatter or HTMLFormatter in the appropriate
            registry).
        :return: A Formatter.
        """
        if isinstance(formatter, Formatter):
            return formatter
        if self._is_xml:
            c = XMLFormatter
        else:
            c = HTMLFormatter
        if isinstance(formatter, Callable):
            return c(entity_substitution=formatter)
        return c.REGISTRY[formatter]

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used in formatter_for_name, when deciding whether an
        XMLFormatter or HTMLFormatter is more appropriate. It can be
        inefficient, but it should be called very rarely.
        """
        if self.known_xml is not None:
            # Most of the time we will have determined this when the
            # document is parsed.
            return self.known_xml

        # Otherwise, it's likely that this element was created by
        # direct invocation of the constructor from within the user's
        # Python code.
        if self.parent is None:
            # This is the top-level object. It should have .known_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Replace this PageElement with another one, keeping the rest of the
        tree the same.

        :param replace_with: A PageElement.
        :return: `self`, no longer part of the tree. (None is returned,
            and nothing changes, when `replace_with` is `self`.)
        :raises ValueError: If this element has no parent, or if
            `replace_with` is this element's parent.
        """
        if self.parent is None:
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this PageElement with its contents.

        :return: `self`, no longer part of the tree.
        :raises ValueError: If this element has no parent.
        """
        my_parent = self.parent
        if self.parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        # Insert the children back-to-front at the same index so they end
        # up in their original order. Iterate over a copy because each
        # insert removes the child from self.contents.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Wrap this PageElement inside another one.

        :param wrap_inside: A PageElement.
        :return: `wrap_inside`, occupying the position in the tree that used
            to be occupied by `self`, and with `self` inside it.
        """
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        :param _self_index: The location of this element in its parent's
            .contents, if known. Passing this in allows for a performance
            optimization.
        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
                self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if (next_element is not None
                and next_element is not self.previous_element):
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if (self.previous_sibling is not None
                and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
                and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        """Finds the last element beneath this object to be parsed.

        :param is_initialized: Has `setup` been called on this PageElement
            yet?
        :param accept_self: Is `self` an acceptable answer to the question?
        :return: The last descendant, or None when the answer would be
            `self` and `accept_self` is false.
        """
        if is_initialized and self.next_sibling is not None:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
            in `self.children` by the new PageElement.
        :param new_child: A PageElement.
        :raises ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
                and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the tree until some ancestor has a next sibling;
            # that sibling is what follows the new child in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = (
                new_childs_last_element)
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given PageElement to the contents of this one.

        :param tag: A PageElement.
        """
        self.insert(len(self.contents), tag)

    def extend(self, tags):
        """Appends the given PageElements to this one's contents.

        :param tags: A list of PageElements.
        """
        for tag in tags:
            self.append(tag)

    def insert_before(self, *args):
        """Makes the given element(s) the immediate predecessor of this one.

        All the elements will have the same parent, and the given elements
        will be immediately before this one.

        :param args: One or more PageElements.
        :raises ValueError: If this element has no parent, or if it is
            itself one of `args`.
        """
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element before itself.")
        for predecessor in args:
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(predecessor, PageElement):
                predecessor.extract()
            index = parent.index(self)
            parent.insert(index, predecessor)

    def insert_after(self, *args):
        """Makes the given element(s) the immediate successor of this one.

        The elements will have the same parent, and the given elements
        will be immediately after this one.

        :param args: One or more PageElements.
        :raises ValueError: If this element has no parent, or if it is
            itself one of `args`.
        """
        # Do all error checking before modifying the tree.
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element after itself.")

        offset = 0
        for successor in args:
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(successor, PageElement):
                successor.extract()
            index = parent.index(self)
            # `offset` keeps later successors behind earlier ones.
            parent.insert(index+1+offset, successor)
            offset += 1

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Find the first PageElement that matches the given criteria and
        appears later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Find all PageElements that match the given criteria and appear
        later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet containing PageElements.
        """
        return self._find_all(name, attrs, text, limit, self.next_elements,
                              **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Find the closest sibling to this PageElement that matches the
        given criteria and appears later in the document.

        All find_* methods take a common set of arguments. See the
        online documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_next_siblings, name, attrs, text,
                              **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Find all siblings of this PageElement that match the given criteria
        and appear later in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Look backwards in the document from this PageElement and find the
        first PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Look backwards in the document from this PageElement and find all
        PageElements that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                              **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this PageElement that matches the
        given criteria and appears earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                              **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns all siblings to this PageElement that match the
        given criteria and appear earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Find the closest parent of this PageElement that matches the given
        criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :kwargs: A dictionary of filters on attribute values.
        :return: The first matching parent, or None if nothing matches.
        """
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        matches = self.find_parents(name, attrs, 1, **kwargs)
        return matches[0] if matches else None
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Find all parents of this PageElement that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The PageElement, if any, that was parsed just after this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.next_element

    @property
    def previous(self):
        """The PageElement, if any, that was parsed just before this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.previous_element

    # These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call a find-all style `method` with a limit of 1 and return its
        first result, or None when it finds nothing.
        """
        matches = method(name, attrs, text, 1, **kwargs)
        return matches[0] if matches else None

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        :param name: A filter on tag name, or a SoupStrainer to use as-is.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
            The keyword argument `string` is accepted as a synonym.
        :param limit: Stop looking after finding this many results.
        :param generator: The iterable of PageElements to search.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of the matches.
        """
        if text is None and 'string' in kwargs:
            text = kwargs['string']
            del kwargs['string']

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
                if name.count(':') == 1:
                    # This is a name with a prefix. If this is a
                    # namespace-aware document, we need to match the local
                    # name against tag.name. If not, we need to match the
                    # fully-qualified name against tag.name.
                    prefix, local_name = name.split(':', 1)
                else:
                    prefix = None
                    local_name = name
                # The isinstance check must guard BOTH name tests, so the
                # alternatives are grouped explicitly rather than relying
                # on and/or precedence.
                result = (
                    element for element in generator
                    if isinstance(element, Tag) and (
                        element.name == name
                        or (
                            element.name == local_name
                            and (prefix is None or element.prefix == prefix)
                        )
                    )
                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These generators can be used to navigate starting from both
    # NavigableStrings and Tags.

    @property
    def next_elements(self):
        """All PageElements that were parsed after this one.

        :yield: A sequence of PageElements.
        """
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        later.

        :yield: A sequence of PageElements.
        """
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """All PageElements that were parsed before this one.

        :yield: A sequence of PageElements.
        """
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        earlier.

        :yield: A sequence of PageElements.
        """
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """All PageElements that are parents of this PageElement.

        :yield: A sequence of PageElements.
        """
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    @property
    def decomposed(self):
        """Check whether a PageElement has been decomposed.

        :rtype: bool
        """
        return getattr(self, '_decomposed', False) or False

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
    """

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            str_args = (value,)
        else:
            str_args = (value, DEFAULT_OUTPUT_ENCODING)
        instance = str.__new__(cls, *str_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string of the same class and contents that is not
        connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA block.

    PREFIX and SUFFIX are the markers that the inherited output_ready()
    places around the string.
    """
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    PREFIX and SUFFIX are the markers that the inherited output_ready()
    places around the string; note that the suffix is '>' with no '?'.
    """
    PREFIX = '<?'
    SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction.

    Differs from ProcessingInstruction only in its suffix, '?>'.
    """
    PREFIX = '<?'
    SUFFIX = '?>'
class Comment(PreformattedString):
    """An HTML or XML comment.

    PREFIX and SUFFIX are the markers that the inherited output_ready()
    places around the string.
    """
    PREFIX = '<!--'
    SUFFIX = '-->'
class Declaration(PreformattedString):
    """An XML declaration.

    PREFIX and SUFFIX are the markers that the inherited output_ready()
    places around the string.
    """
    PREFIX = '<?'
    SUFFIX = '?>'
class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            # After a public ID the system ID is written bare, without
            # the SYSTEM keyword.
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably
    CSS).

    Used to distinguish embedded stylesheets from textual content.
    The class body is empty: it adds no behavior to NavigableString.
    """
    pass
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    Used to distinguish executable code from textual content.
    The class body is empty: it adds no behavior to NavigableString.
    """
    pass
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    The class body is empty: it adds no behavior to NavigableString.
    """
    pass
-class Tag(PageElement):
-"""Represents an HTML or XML tag that is part of a parse tree, along
-with its attributes and contents.
-When Beautiful Soup parses the markup <b>penguin</b>, it will
-create a Tag object representing the <b> tag.
-"""
-def __init__(self, parser=None, builder=None, name=None, namespace=None,
-prefix=None, attrs=None, parent=None, previous=None,
-is_xml=None, sourceline=None, sourcepos=None,
-can_be_empty_element=None, cdata_list_attributes=None,
-preserve_whitespace_tags=None
-):
-"""Basic constructor.
-:param parser: A BeautifulSoup object.
-:param builder: A TreeBuilder.
-:param name: The name of the tag.
-:param namespace: The URI of this Tag's XML namespace, if any.
-:param prefix: The prefix for this Tag's XML namespace, if any.
-:param attrs: A dictionary of this Tag's attribute values.
-:param parent: The PageElement to use as this Tag's parent.
-:param previous: The PageElement that was parsed immediately before
-this tag.
-:param is_xml: If True, this is an XML tag. Otherwise, this is an
-HTML tag.
-:param sourceline: The line number where this tag was found in its
-source document.
-:param sourcepos: The character position within `sourceline` where this
-tag was found.
-:param can_be_empty_element: If True, this tag should be
-represented as <tag/>. If False, this tag should be represented
-as <tag></tag>.
-:param cdata_list_attributes: A list of attributes whose values should
-be treated as CDATA if they ever show up on this tag.
-:param preserve_whitespace_tags: A list of tag names whose contents
-should have their whitespace preserved.
-"""
-if parser is None:
-self.parser_class = None
-else:
-# We don't actually store the parser object: that lets extracted
-# chunks be garbage-collected.
-self.parser_class = parser.__class__
-if name is None:
-raise ValueError("No value provided for new tag's name.")
-self.name = name
-self.namespace = namespace
-self.prefix = prefix
-if ((not builder or builder.store_line_numbers)
-and (sourceline is not None or sourcepos is not None)):
-self.sourceline = sourceline
-self.sourcepos = sourcepos
-if attrs is None:
-attrs = {}
-elif attrs:
-if builder is not None and builder.cdata_list_attributes:
-attrs = builder._replace_cdata_list_attribute_values(
-self.name, attrs)
-else:
-attrs = dict(attrs)
-else:
-attrs = dict(attrs)
-# If possible, determine ahead of time whether this tag is an
-# XML tag.
-if builder:
-self.known_xml = builder.is_xml
-else:
-self.known_xml = is_xml
-self.attrs = attrs
-self.contents = []
-self.setup(parent, previous)
-self.hidden = False
-if builder is None:
-# In the absence of a TreeBuilder, use whatever values were
-# passed in here. They're probably None, unless this is a copy of some
-# other tag.
-self.can_be_empty_element = can_be_empty_element
-self.cdata_list_attributes = cdata_list_attributes
-self.preserve_whitespace_tags = preserve_whitespace_tags
-else:
-# Set up any substitutions for this tag, such as the charset in a META tag.
-builder.set_up_substitutions(self)
-# Ask the TreeBuilder whether this tag might be an empty-element tag.
-self.can_be_empty_element = builder.can_be_empty_element(name)
-# Keep track of the list of attributes of this tag that
-# might need to be treated as a list.
-#
-# For performance reasons, we store the whole data structure
-# rather than asking the question of every tag. Asking would
-# require building a new data structure every time, and
-# (unlike can_be_empty_element), we almost never need
-# to check this.
-self.cdata_list_attributes = builder.cdata_list_attributes
-# Keep track of the names that might cause this tag to be treated as a
-# whitespace-preserved tag.
-self.preserve_whitespace_tags = builder.preserve_whitespace_tags
-parserClass = _alias("parser_class")  # BS3
-def __copy__(self):
-"""A copy of a Tag is a new Tag, unconnected to the parse tree.
-Its contents are a copy of the old Tag's contents.
-"""
-clone = type(self)(
-None, self.builder, self.name, self.namespace,
-self.prefix, self.attrs, is_xml=self._is_xml,
-sourceline=self.sourceline, sourcepos=self.sourcepos,
-can_be_empty_element=self.can_be_empty_element,
-cdata_list_attributes=self.cdata_list_attributes,
-preserve_whitespace_tags=self.preserve_whitespace_tags
-)
-for attr in ('can_be_empty_element', 'hidden'):
-setattr(clone, attr, getattr(self, attr))
-for child in self.contents:
-clone.append(child.__copy__())
-return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    `can_be_empty_element` holds. That value is supplied by the builder
    that created the tag, or passed to the constructor directly.

    :return: False if the tag has contents, otherwise the value of
        `self.can_be_empty_element`.
    """
    return not self.contents and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    :return: If this element has a single string child, that string.
        If it has a single child tag, the `string` attribute of that
        child, recursively. If it has no children or more than one
        child, None.
    """
    if len(self.contents) == 1:
        only_child = self.contents[0]
        if isinstance(only_child, NavigableString):
            return only_child
        return only_child.string
    return None

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`."""
    self.clear()
    # Re-wrap the value in its own class, so a fresh object is appended.
    replacement = string.__class__(string)
    self.append(replacement)
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that become empty are dropped.
    :types: A tuple of NavigableString subclasses. A string is kept only
        when its exact class is in this tuple. By default that means
        NavigableString and CData, so no comments, processing
        instructions, etc. If None, every NavigableString instance is
        kept.
    :yield: A sequence of strings.
    """
    for node in self.descendants:
        if types is None:
            wanted = isinstance(node, NavigableString)
        else:
            # Exact class match on purpose: subclasses are excluded.
            wanted = type(node) in types
        if not wanted:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)
@property
def stripped_strings(self):
    """Yield all strings beneath this element, stripping them first.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.
    :param strip: If True, strings will be stripped before being
        concatenated.
    :types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.
    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor first: clearing __dict__ erases the link.
        successor = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = successor
def clear(self, decompose=False):
    """Wipe out all children of this PageElement.

    :param decompose: If this is True, child Tags are removed with
        decompose() (a more destructive method) instead of extract().
        Children that are not Tags are always removed with extract().
    """
    # Iterate over a snapshot: removing a child mutates self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def mergeable(node):
        # Plain strings may be merged; preformatted ones never are.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that has to be
    # merged, rather than copying self.contents: in most cases very few
    # strings are affected.
    merge_at = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position + 1 >= len(self.contents):
            # The last item has no right-hand neighbour to merge with.
            continue
        if mergeable(current) and mergeable(self.contents[position + 1]):
            merge_at.append(position)

    # Walk the recorded positions back to front, so that removing items
    # from .contents doesn't shift the positions still to be handled.
    for position in reversed(merge_at):
        left = self.contents[position]
        right = self.contents[position + 1]
        right.extract()
        merged = NavigableString(left + right)
        left.replace_with(merged)
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` within `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
def get(self, key, default=None):
    """Look up an attribute of this tag.

    :param key: The attribute name.
    :param default: The value to return if the tag doesn't have that
        attribute.
    :return: The attribute's value, or `default`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
def has_attr(self, key):
    """Report whether this PageElement has an attribute named `key`.

    :param key: The attribute name to look for in `self.attrs`.
    :return: True if the attribute is present, False otherwise.
    """
    attributes = self.attrs
    return key in attributes
-def __hash__(self):
-return str(self).__hash__()
-def __getitem__(self, key):
-"""tag[key] returns the value of the 'key' attribute for the Tag,
-and throws an exception if it's not there."""
-return self.attrs[key]
-def __iter__(self):
-"Iterating over a Tag iterates over its contents."
-return iter(self.contents)
-def __len__(self):
-"The length of a Tag is the length of its list of contents."
-return len(self.contents)
-def __contains__(self, x):
-return x in self.contents
-def __bool__(self):
-"A tag is non-None even if it has no contents."
-return True
-def __setitem__(self, key, value):
-"""Setting tag[key] sets the value of the 'key' attribute for the
-tag."""
-self.attrs[key] = value
-def __delitem__(self, key):
-"Deleting tag[key] deletes all 'key' attributes for the tag."
-self.attrs.pop(key, None)
-def __call__(self, *args, **kwargs):
-"""Calling a Tag like a function is the same as calling its
-find_all() method. Eg. tag('a') returns a list of all the A tags
-found within this tag."""
-return self.find_all(*args, **kwargs)
-def __getattr__(self, tag):
-"""Calling tag.subtag is the same as calling tag.find(name="subtag")"""
-#print "Getattr %s.%s" % (self.__class__, tag)
-if len(tag) > 3 and tag.endswith('Tag'):
-# BS3: soup.aTag -> "soup.find("a")
-tag_name = tag[:-3]
-warnings.warn(
-'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
-name=tag_name
-)
-)
-return self.find(tag_name)
-# We special case contents to avoid recursion.
-elif not tag.startswith("__") and not tag == "contents":
-return self.find(tag)
-raise AttributeError(
-"'%s' object has no attribute '%s'" % (self.__class__, tag))
-def __eq__(self, other):
-"""Returns true iff this Tag has the same name, the same attributes,
-and the same contents (recursively) as `other`."""
-if self is other:
-return True
-if (not hasattr(other, 'name') or
-not hasattr(other, 'attrs') or
-not hasattr(other, 'contents') or
-self.name != other.name or
-self.attrs != other.attrs or
-len(self) != len(other)):
-return False
-for i, my_child in enumerate(self.contents):
-if my_child != other.contents[i]:
-return False
-return True
-def __ne__(self, other):
-"""Returns true iff this Tag is not identical to `other`,
-as defined in __eq__."""
-return not self == other
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    # "The return value must be a string object": Unicode on Python 3,
    # and on Python 2 a bytestring, which by convention should also be
    # ASCII.
    return self.decode() if PY3K else self.encode(encoding)
-def __unicode__(self):
-"""Renders this PageElement as a Unicode string."""
-return self.decode()
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, a UTF-8 bytestring; under Python 3,
        a Unicode string.
    """
    return self.decode() if PY3K else self.encode()

# On Python 3 all three renderings are the same Unicode string, so the
# str and repr hooks are simply rebound to __unicode__.
if PY3K:
    __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first, then encode the Unicode in one step.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string.
    """
    # Resolve a formatter name into a Formatter object once, up front,
    # so the lookup isn't repeated for every nested element.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render every attribute as either `key` or `key="value"`.
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (isinstance(val, AttributeValueWithCharsetSubstitution)
              and eventual_encoding is not None):
            val = val.encode(eventual_encoding)
        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '=' + formatter.quoted_attribute_value(text))

    prefix = self.prefix + ":" if self.prefix else ''

    # An empty-element tag gets a closing marker inside the opening
    # tag; every other tag gets a separate closing tag.
    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
        close_tag = ''
    else:
        close = ''
        close_tag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    indent_space = '' if indent_level is None else ' ' * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object: only its contents show.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    attribute_string = ' ' + ' '.join(rendered_attrs) if rendered_attrs else ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and close_tag:
        pieces.append(space)
    pieces.append(close_tag)
    if indent_level is not None and close_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
-def _should_pretty_print(self, indent_level):
-"""Should this tag be pretty-printed?
-Most of them should, but some (such as <pre> in HTML
-documents) should not.
-"""
-return (
-indent_level is not None
-and (
-not self.preserve_whitespace_tags
-or self.name not in self.preserve_whitespace_tags
-)
-)
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level to switch pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: A Unicode string.
    """
    # Resolve a formatter name into a Formatter object once, up front,
    # so the lookup isn't repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    pieces = []
    for child in self:
        if not isinstance(child, NavigableString):
            # Child tags render themselves, indentation included.
            # Anything else is neither string nor tag and adds nothing.
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            continue

        text = child.output_ready(formatter)
        preserve_whitespace = (
            self.preserve_whitespace_tags
            and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        # Each string goes on its own indented line while
        # pretty-printing, unless this tag preserves whitespace.
        own_line = pretty_print and not preserve_whitespace
        if own_line:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if own_line:
            pieces.append("\n")
    return ''.join(pieces)
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    :param encoding: The encoding of the returned bytestring.
    :param prettyPrint: If this is False, `indentLevel` is ignored and
        no indentation is applied.
    :param indentLevel: The indent level to use when `prettyPrint` is
        True.
    :return: The result of encode_contents().
    """
    indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=indent, encoding=encoding)
#Soup methods
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content; passed through to
        find_all().
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
#Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # An explicit iterator is returned (not the list itself) to make
    # the purpose of the property clear.
    direct = self.contents
    return iter(direct)  # XXX This seems to be untested.
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Elements are produced by starting at the first child and following
    the `next_element` chain up to and including the last descendant.

    :yield: A sequence of PageElements.
    """
    if not self.contents:
        return
    # The element just past the last descendant marks the end of the walk.
    stop_at = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_at:
        yield node
        node = node.next_element
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return the first match only.

    :param selector: A CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: A Tag, or None if nothing matches.
    :rtype: bs4.element.Tag
    """
    # Ask select() for at most one result.
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.
    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.
    :param limit: After finding this number of results, stop looking.
    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    # A limit of 0 is what gets passed to soupsieve when the caller
    # gave none.
    limit = 0 if limit is None else limit
    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matches = soupsieve.select(selector, self, namespaces, limit, **kwargs)

    # A ResultSet is returned because it's more consistent and because
    # ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, matches)
# Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the `children` property."""
    direct_children = self.children
    return direct_children
def recursiveChildGenerator(self):
    """Deprecated generator; returns the `descendants` property."""
    everything_below = self.descendants
    return everything_below
def has_key(self, key):
    """Deprecated method; use has_attr() instead.

    This was kind of misleading because has_key() looked at attributes
    while the `in` operator looks at contents. has_key() is gone in
    Python 3, anyway.

    :param key: The attribute name to look for.
    :return: The result of has_attr(key).
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
-# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs['class_']
            del kwargs['class_']

        if kwargs:
            if attrs:
                # Merge into a copy: the caller's dict (and the shared
                # default) must never be mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in list(attrs.items()):
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form _matches() understands.

        :param value: A filter value as passed in by the caller.
        :return: `value` unchanged if it is a string, a callable, a
            regular expression, a boolean or None; a decoded string for
            a bytestring; a list for any other iterable; otherwise the
            string form of `value`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter rendered as a string if there is one,
            otherwise "<name>|<attrs>".
        """
        if self.text:
            # The text filter may be a compiled regex, True, a list or a
            # callable. __str__ must return a real string, so convert it
            # instead of handing the filter object back.
            return str(self.text)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a Tag.
        :param markup_attrs: A dictionary of attributes as found in some
            markup. An iterable of (key, value) pairs is also accepted.
        :return: The matching Tag or tag name if the prospective tag would
            match this SoupStrainer; None (or a false value returned by a
            callable name filter) otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        # A callable name filter is only invoked with (name, attrs) when
        # no Tag object exists yet.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a dict from (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None.
        :raise Exception: If `markup` is of a kind this method does not
            know how to match.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check a piece of markup against a single filter value.

        :param markup: A Tag, a string, a list/tuple of attribute values,
            or None.
        :param match_against: True, a callable, a string, a regular
            expression, or an iterable of those.
        :param already_tried: Items of an iterable filter that have
            already been tested; used to avoid infinite recursion.
        :return: A true value if `markup` matches, a false value otherwise.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    which produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        :param key: The attribute name that normal lookup failed to find.
        :raise AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)

Mercurial > repos > shellac > guppy_basecaller

comparison env/lib/python3.7/site-packages/bs4/element.py @ 5:9b1c78e6ba9c draft default tip