Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/bs4/builder/__init__.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
line wrap: on
line diff
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re
)

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    def __init__(self):
        # Maps a feature string to the builders advertising it, most
        # recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        The class is inserted at the front of every list it joins, so
        later registrations take precedence over earlier ones.

        :param treebuilder_class: A subclass of Treebuilder. its .features
           attribute should list its features.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Look up a TreeBuilder subclass with the desired features.

        A requested feature that no registered builder advertises is
        skipped rather than treated as a mismatch. The result is None
        only when no builder is registered at all, when none of the
        requested features is known, or when the known features cannot
        all be satisfied by a single builder.

        :param features: A list of features to look for. If none are
           provided, the most recently registered TreeBuilder subclass
           will be used.
        :return: A TreeBuilder subclass, or None if there's no
           registered subclass with all the requested features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if not we_have_the_feature:
                # Nobody advertises this feature; it does not narrow
                # the candidates.
                continue
            if candidates is None:
                # The first recognized feature fixes the priority
                # order of the candidates.
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # NOTE: the DEFAULT_* containers below are handed to instances by
    # reference in __init__, not copied, so mutating one through an
    # instance changes it for every builder that uses the default.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "argument not supplied". It lets None remain a
    # meaningful value for the constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None
        # The sentinel is always compared by identity, so a caller's
        # value with a custom __eq__ can never be mistaken for "not
        # supplied".
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if empty_element_tags is None or contains
         tag_name, False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # '*' holds attributes that are multi-valued on every tag.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.
    """

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward an element start to the soup object.

        :param name: The element name.
        :param attrs: A mapping of attributes. Each key is indexed
         with [1], so keys are presumably (namespace, name) pairs as
         delivered to startElementNS -- TODO confirm; a plain string
         key would be reduced to its second character.
        """
        attrs = {key[1]: value for key, value in attrs.items()}
        #print("Start %s, %r" % (name, attrs))
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print("End %s" % name)
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {"address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"}

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch performs a substitution too, so it must
            # report one.
            return True

        return False

def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of `bs4.builder`, appended to its
    `__all__`, and registered with `builder_registry`. Any other
    export is ignored.

    :param module: A module object with an `__all__` list.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for anything that is not a
        # class, so check for a class first.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

class ParserRejectedMarkup(Exception):
    """An Exception to be raised when the underlying parser simply
    refuses to parse the given markup.
    """
    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.

        :param message_or_exception: A message, or an Exception
         instance, which is rendered as "ClassName: message".
        """
        if isinstance(message_or_exception, Exception):
            e = message_or_exception
            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
        super(ParserRejectedMarkup, self).__init__(message_or_exception)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass