Mercurial > repos > guerler > hhblits
diff lib/python3.8/site-packages/pip/_vendor/html5lib/serializer.py @ 0:9e54283cc701 draft
"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author | guerler |
---|---|
date | Mon, 27 Jul 2020 03:47:31 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python3.8/site-packages/pip/_vendor/html5lib/serializer.py Mon Jul 27 03:47:31 2020 -0400 @@ -0,0 +1,409 @@ +from __future__ import absolute_import, division, unicode_literals +from pip._vendor.six import text_type + +import re + +from codecs import register_error, xmlcharrefreplace_errors + +from .constants import voidElements, booleanAttributes, spaceCharacters +from .constants import rcdataElements, entities, xmlEntities +from . import treewalkers, _utils +from xml.sax.saxutils import escape + +_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`" +_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]") +_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars + + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" + "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" + "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" + "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" + "\u3000]") + + +_encode_entity_map = {} +_is_ucs4 = len("\U0010FFFF") == 1 +for k, v in list(entities.items()): + # skip multi-character entities + if ((_is_ucs4 and len(v) > 1) or + (not _is_ucs4 and len(v) > 2)): + continue + if v != "&": + if len(v) == 2: + v = _utils.surrogatePairToCodepoint(v) + else: + v = ord(v) + if v not in _encode_entity_map or k.islower(): + # prefer < over < and similarly for &, >, etc. + _encode_entity_map[v] = k + + +def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = _encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;" % (hex(cp)[2:])) + return ("".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) + + +register_error("htmlentityreplace", htmlentityreplace_errors) + + +def serialize(input, tree="etree", encoding=None, **serializer_opts): + """Serializes the input token stream using the specified treewalker + + :arg input: the token stream to serialize + + :arg tree: the treewalker to use + + :arg encoding: the encoding to use + + :arg serializer_opts: any options to pass to the + :py:class:`html5lib.serializer.HTMLSerializer` that gets created + + :returns: the tree serialized as a string + + Example: + + >>> from html5lib.html5parser import parse + >>> from html5lib.serializer import serialize + >>> token_stream = parse('<html><body><p>Hi!</p></body></html>') + >>> serialize(token_stream, omit_optional_tags=False) + '<html><head></head><body><p>Hi!</p></body></html>' + + """ + # XXX: Should we cache this? + walker = treewalkers.getTreeWalker(tree) + s = HTMLSerializer(**serializer_opts) + return s.render(walker(input), encoding) + + +class HTMLSerializer(object): + + # attribute quoting options + quote_attr_values = "legacy" # be secure by default + quote_char = '"' + use_best_quote_char = True + + # tag syntax options + omit_optional_tags = True + minimize_boolean_attributes = True + use_trailing_solidus = False + space_before_trailing_solidus = True + + # escaping options + escape_lt_in_attrs = False + escape_rcdata = False + resolve_entities = True + + # miscellaneous options + alphabetical_attributes = False + inject_meta_charset = True + strip_whitespace = False + sanitize = False + + options = ("quote_attr_values", "quote_char", "use_best_quote_char", + "omit_optional_tags", "minimize_boolean_attributes", + "use_trailing_solidus", "space_before_trailing_solidus", + "escape_lt_in_attrs", "escape_rcdata", "resolve_entities", + "alphabetical_attributes", "inject_meta_charset", + "strip_whitespace", "sanitize") + + def __init__(self, **kwargs): + """Initialize HTMLSerializer + + :arg inject_meta_charset: Whether or not to inject the meta charset. + + Defaults to ``True``. + + :arg quote_attr_values: Whether to quote attribute values that don't + require quoting per legacy browser behavior (``"legacy"``), when + required by the standard (``"spec"``), or always (``"always"``). + + Defaults to ``"legacy"``. + + :arg quote_char: Use given quote character for attribute quoting. + + Defaults to ``"`` which will use double quotes unless attribute + value contains a double quote, in which case single quotes are + used. + + :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute + values. + + Defaults to ``False``. + + :arg escape_rcdata: Whether to escape characters that need to be + escaped within normal elements within rcdata elements such as + style. + + Defaults to ``False``. + + :arg resolve_entities: Whether to resolve named character entities that + appear in the source tree. The XML predefined entities < > + & " ' are unaffected by this setting. + + Defaults to ``True``. + + :arg strip_whitespace: Whether to remove semantically meaningless + whitespace. (This compresses all whitespace to a single space + except within ``pre``.) + + Defaults to ``False``. + + :arg minimize_boolean_attributes: Shortens boolean attributes to give + just the attribute value, for example:: + + <input disabled="disabled"> + + becomes:: + + <input disabled> + + Defaults to ``True``. + + :arg use_trailing_solidus: Includes a close-tag slash at the end of the + start tag of void elements (empty elements whose end tag is + forbidden). E.g. ``<hr/>``. + + Defaults to ``False``. + + :arg space_before_trailing_solidus: Places a space immediately before + the closing slash in a tag using a trailing solidus. E.g. + ``<hr />``. Requires ``use_trailing_solidus=True``. + + Defaults to ``True``. + + :arg sanitize: Strip all unsafe or unknown constructs from output. + See :py:class:`html5lib.filters.sanitizer.Filter`. + + Defaults to ``False``. + + :arg omit_optional_tags: Omit start/end tags that are optional. + + Defaults to ``True``. + + :arg alphabetical_attributes: Reorder attributes to be in alphabetical order. + + Defaults to ``False``. + + """ + unexpected_args = frozenset(kwargs) - frozenset(self.options) + if len(unexpected_args) > 0: + raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args))) + if 'quote_char' in kwargs: + self.use_best_quote_char = False + for attr in self.options: + setattr(self, attr, kwargs.get(attr, getattr(self, attr))) + self.errors = [] + self.strict = False + + def encode(self, string): + assert(isinstance(string, text_type)) + if self.encoding: + return string.encode(self.encoding, "htmlentityreplace") + else: + return string + + def encodeStrict(self, string): + assert(isinstance(string, text_type)) + if self.encoding: + return string.encode(self.encoding, "strict") + else: + return string + + def serialize(self, treewalker, encoding=None): + # pylint:disable=too-many-nested-blocks + self.encoding = encoding + in_cdata = False + self.errors = [] + + if encoding and self.inject_meta_charset: + from .filters.inject_meta_charset import Filter + treewalker = Filter(treewalker, encoding) + # Alphabetical attributes is here under the assumption that none of + # the later filters add or change order of attributes; it needs to be + # before the sanitizer so escaped elements come out correctly + if self.alphabetical_attributes: + from .filters.alphabeticalattributes import Filter + treewalker = Filter(treewalker) + # WhitespaceFilter should be used before OptionalTagFilter + # for maximum efficiently of this latter filter + if self.strip_whitespace: + from .filters.whitespace import Filter + treewalker = Filter(treewalker) + if self.sanitize: + from .filters.sanitizer import Filter + treewalker = Filter(treewalker) + if self.omit_optional_tags: + from .filters.optionaltags import Filter + treewalker = Filter(treewalker) + + for token in treewalker: + type = token["type"] + if type == "Doctype": + doctype = "<!DOCTYPE %s" % token["name"] + + if token["publicId"]: + doctype += ' PUBLIC "%s"' % token["publicId"] + elif token["systemId"]: + doctype += " SYSTEM" + if token["systemId"]: + if token["systemId"].find('"') >= 0: + if token["systemId"].find("'") >= 0: + self.serializeError("System identifer contains both single and double quote characters") + quote_char = "'" + else: + quote_char = '"' + doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char) + + doctype += ">" + yield self.encodeStrict(doctype) + + elif type in ("Characters", "SpaceCharacters"): + if type == "SpaceCharacters" or in_cdata: + if in_cdata and token["data"].find("</") >= 0: + self.serializeError("Unexpected </ in CDATA") + yield self.encode(token["data"]) + else: + yield self.encode(escape(token["data"])) + + elif type in ("StartTag", "EmptyTag"): + name = token["name"] + yield self.encodeStrict("<%s" % name) + if name in rcdataElements and not self.escape_rcdata: + in_cdata = True + elif in_cdata: + self.serializeError("Unexpected child element of a CDATA element") + for (_, attr_name), attr_value in token["data"].items(): + # TODO: Add namespace support here + k = attr_name + v = attr_value + yield self.encodeStrict(' ') + + yield self.encodeStrict(k) + if not self.minimize_boolean_attributes or \ + (k not in booleanAttributes.get(name, tuple()) and + k not in booleanAttributes.get("", tuple())): + yield self.encodeStrict("=") + if self.quote_attr_values == "always" or len(v) == 0: + quote_attr = True + elif self.quote_attr_values == "spec": + quote_attr = _quoteAttributeSpec.search(v) is not None + elif self.quote_attr_values == "legacy": + quote_attr = _quoteAttributeLegacy.search(v) is not None + else: + raise ValueError("quote_attr_values must be one of: " + "'always', 'spec', or 'legacy'") + v = v.replace("&", "&") + if self.escape_lt_in_attrs: + v = v.replace("<", "<") + if quote_attr: + quote_char = self.quote_char + if self.use_best_quote_char: + if "'" in v and '"' not in v: + quote_char = '"' + elif '"' in v and "'" not in v: + quote_char = "'" + if quote_char == "'": + v = v.replace("'", "'") + else: + v = v.replace('"', """) + yield self.encodeStrict(quote_char) + yield self.encode(v) + yield self.encodeStrict(quote_char) + else: + yield self.encode(v) + if name in voidElements and self.use_trailing_solidus: + if self.space_before_trailing_solidus: + yield self.encodeStrict(" /") + else: + yield self.encodeStrict("/") + yield self.encode(">") + + elif type == "EndTag": + name = token["name"] + if name in rcdataElements: + in_cdata = False + elif in_cdata: + self.serializeError("Unexpected child element of a CDATA element") + yield self.encodeStrict("</%s>" % name) + + elif type == "Comment": + data = token["data"] + if data.find("--") >= 0: + self.serializeError("Comment contains --") + yield self.encodeStrict("<!--%s-->" % token["data"]) + + elif type == "Entity": + name = token["name"] + key = name + ";" + if key not in entities: + self.serializeError("Entity %s not recognized" % name) + if self.resolve_entities and key not in xmlEntities: + data = entities[key] + else: + data = "&%s;" % name + yield self.encodeStrict(data) + + else: + self.serializeError(token["data"]) + + def render(self, treewalker, encoding=None): + """Serializes the stream from the treewalker into a string + + :arg treewalker: the treewalker to serialize + + :arg encoding: the string encoding to use + + :returns: the serialized tree + + Example: + + >>> from html5lib import parse, getTreeWalker + >>> from html5lib.serializer import HTMLSerializer + >>> token_stream = parse('<html><body>Hi!</body></html>') + >>> walker = getTreeWalker('etree') + >>> serializer = HTMLSerializer(omit_optional_tags=False) + >>> serializer.render(walker(token_stream)) + '<html><head></head><body>Hi!</body></html>' + + """ + if encoding: + return b"".join(list(self.serialize(treewalker, encoding))) + else: + return "".join(list(self.serialize(treewalker))) + + def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): + # XXX The idea is to make data mandatory. + self.errors.append(data) + if self.strict: + raise SerializeError + + +class SerializeError(Exception): + """Error in serialized tree""" + pass