diff lib/python3.8/site-packages/pip/_vendor/html5lib/serializer.py @ 1:64071f2a4cf0 draft default tip

Deleted selected files
author guerler
date Mon, 27 Jul 2020 03:55:49 -0400
parents 9e54283cc701
children
line wrap: on
line diff
--- a/lib/python3.8/site-packages/pip/_vendor/html5lib/serializer.py	Mon Jul 27 03:47:31 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,409 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-from pip._vendor.six import text_type
-
-import re
-
-from codecs import register_error, xmlcharrefreplace_errors
-
-from .constants import voidElements, booleanAttributes, spaceCharacters
-from .constants import rcdataElements, entities, xmlEntities
-from . import treewalkers, _utils
-from xml.sax.saxutils import escape
-
-_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
-_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
-_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
-                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
-                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
-                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
-                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
-                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
-                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
-                                   "\u3000]")
-
-
-_encode_entity_map = {}
-_is_ucs4 = len("\U0010FFFF") == 1
-for k, v in list(entities.items()):
-    # skip multi-character entities
-    if ((_is_ucs4 and len(v) > 1) or
-            (not _is_ucs4 and len(v) > 2)):
-        continue
-    if v != "&":
-        if len(v) == 2:
-            v = _utils.surrogatePairToCodepoint(v)
-        else:
-            v = ord(v)
-        if v not in _encode_entity_map or k.islower():
-            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
-            _encode_entity_map[v] = k
-
-
-def htmlentityreplace_errors(exc):
-    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
-        res = []
-        codepoints = []
-        skip = False
-        for i, c in enumerate(exc.object[exc.start:exc.end]):
-            if skip:
-                skip = False
-                continue
-            index = i + exc.start
-            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
-                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
-                skip = True
-            else:
-                codepoint = ord(c)
-            codepoints.append(codepoint)
-        for cp in codepoints:
-            e = _encode_entity_map.get(cp)
-            if e:
-                res.append("&")
-                res.append(e)
-                if not e.endswith(";"):
-                    res.append(";")
-            else:
-                res.append("&#x%s;" % (hex(cp)[2:]))
-        return ("".join(res), exc.end)
-    else:
-        return xmlcharrefreplace_errors(exc)
-
-
-register_error("htmlentityreplace", htmlentityreplace_errors)
-
-
-def serialize(input, tree="etree", encoding=None, **serializer_opts):
-    """Serializes the input token stream using the specified treewalker
-
-    :arg input: the token stream to serialize
-
-    :arg tree: the treewalker to use
-
-    :arg encoding: the encoding to use
-
-    :arg serializer_opts: any options to pass to the
-        :py:class:`html5lib.serializer.HTMLSerializer` that gets created
-
-    :returns: the tree serialized as a string
-
-    Example:
-
-    >>> from html5lib.html5parser import parse
-    >>> from html5lib.serializer import serialize
-    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
-    >>> serialize(token_stream, omit_optional_tags=False)
-    '<html><head></head><body><p>Hi!</p></body></html>'
-
-    """
-    # XXX: Should we cache this?
-    walker = treewalkers.getTreeWalker(tree)
-    s = HTMLSerializer(**serializer_opts)
-    return s.render(walker(input), encoding)
-
-
-class HTMLSerializer(object):
-
-    # attribute quoting options
-    quote_attr_values = "legacy"  # be secure by default
-    quote_char = '"'
-    use_best_quote_char = True
-
-    # tag syntax options
-    omit_optional_tags = True
-    minimize_boolean_attributes = True
-    use_trailing_solidus = False
-    space_before_trailing_solidus = True
-
-    # escaping options
-    escape_lt_in_attrs = False
-    escape_rcdata = False
-    resolve_entities = True
-
-    # miscellaneous options
-    alphabetical_attributes = False
-    inject_meta_charset = True
-    strip_whitespace = False
-    sanitize = False
-
-    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
-               "omit_optional_tags", "minimize_boolean_attributes",
-               "use_trailing_solidus", "space_before_trailing_solidus",
-               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
-               "alphabetical_attributes", "inject_meta_charset",
-               "strip_whitespace", "sanitize")
-
-    def __init__(self, **kwargs):
-        """Initialize HTMLSerializer
-
-        :arg inject_meta_charset: Whether or not to inject the meta charset.
-
-            Defaults to ``True``.
-
-        :arg quote_attr_values: Whether to quote attribute values that don't
-            require quoting per legacy browser behavior (``"legacy"``), when
-            required by the standard (``"spec"``), or always (``"always"``).
-
-            Defaults to ``"legacy"``.
-
-        :arg quote_char: Use given quote character for attribute quoting.
-
-            Defaults to ``"`` which will use double quotes unless attribute
-            value contains a double quote, in which case single quotes are
-            used.
-
-        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
-            values.
-
-            Defaults to ``False``.
-
-        :arg escape_rcdata: Whether to escape characters that need to be
-            escaped within normal elements within rcdata elements such as
-            style.
-
-            Defaults to ``False``.
-
-        :arg resolve_entities: Whether to resolve named character entities that
-            appear in the source tree. The XML predefined entities &lt; &gt;
-            &amp; &quot; &apos; are unaffected by this setting.
-
-            Defaults to ``True``.
-
-        :arg strip_whitespace: Whether to remove semantically meaningless
-            whitespace. (This compresses all whitespace to a single space
-            except within ``pre``.)
-
-            Defaults to ``False``.
-
-        :arg minimize_boolean_attributes: Shortens boolean attributes to give
-            just the attribute value, for example::
-
-              <input disabled="disabled">
-
-            becomes::
-
-              <input disabled>
-
-            Defaults to ``True``.
-
-        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
-            start tag of void elements (empty elements whose end tag is
-            forbidden). E.g. ``<hr/>``.
-
-            Defaults to ``False``.
-
-        :arg space_before_trailing_solidus: Places a space immediately before
-            the closing slash in a tag using a trailing solidus. E.g.
-            ``<hr />``. Requires ``use_trailing_solidus=True``.
-
-            Defaults to ``True``.
-
-        :arg sanitize: Strip all unsafe or unknown constructs from output.
-            See :py:class:`html5lib.filters.sanitizer.Filter`.
-
-            Defaults to ``False``.
-
-        :arg omit_optional_tags: Omit start/end tags that are optional.
-
-            Defaults to ``True``.
-
-        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
-
-            Defaults to ``False``.
-
-        """
-        unexpected_args = frozenset(kwargs) - frozenset(self.options)
-        if len(unexpected_args) > 0:
-            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
-        if 'quote_char' in kwargs:
-            self.use_best_quote_char = False
-        for attr in self.options:
-            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
-        self.errors = []
-        self.strict = False
-
-    def encode(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, "htmlentityreplace")
-        else:
-            return string
-
-    def encodeStrict(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, "strict")
-        else:
-            return string
-
-    def serialize(self, treewalker, encoding=None):
-        # pylint:disable=too-many-nested-blocks
-        self.encoding = encoding
-        in_cdata = False
-        self.errors = []
-
-        if encoding and self.inject_meta_charset:
-            from .filters.inject_meta_charset import Filter
-            treewalker = Filter(treewalker, encoding)
-        # Alphabetical attributes is here under the assumption that none of
-        # the later filters add or change order of attributes; it needs to be
-        # before the sanitizer so escaped elements come out correctly
-        if self.alphabetical_attributes:
-            from .filters.alphabeticalattributes import Filter
-            treewalker = Filter(treewalker)
-        # WhitespaceFilter should be used before OptionalTagFilter
-        # for maximum efficiently of this latter filter
-        if self.strip_whitespace:
-            from .filters.whitespace import Filter
-            treewalker = Filter(treewalker)
-        if self.sanitize:
-            from .filters.sanitizer import Filter
-            treewalker = Filter(treewalker)
-        if self.omit_optional_tags:
-            from .filters.optionaltags import Filter
-            treewalker = Filter(treewalker)
-
-        for token in treewalker:
-            type = token["type"]
-            if type == "Doctype":
-                doctype = "<!DOCTYPE %s" % token["name"]
-
-                if token["publicId"]:
-                    doctype += ' PUBLIC "%s"' % token["publicId"]
-                elif token["systemId"]:
-                    doctype += " SYSTEM"
-                if token["systemId"]:
-                    if token["systemId"].find('"') >= 0:
-                        if token["systemId"].find("'") >= 0:
-                            self.serializeError("System identifer contains both single and double quote characters")
-                        quote_char = "'"
-                    else:
-                        quote_char = '"'
-                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
-
-                doctype += ">"
-                yield self.encodeStrict(doctype)
-
-            elif type in ("Characters", "SpaceCharacters"):
-                if type == "SpaceCharacters" or in_cdata:
-                    if in_cdata and token["data"].find("</") >= 0:
-                        self.serializeError("Unexpected </ in CDATA")
-                    yield self.encode(token["data"])
-                else:
-                    yield self.encode(escape(token["data"]))
-
-            elif type in ("StartTag", "EmptyTag"):
-                name = token["name"]
-                yield self.encodeStrict("<%s" % name)
-                if name in rcdataElements and not self.escape_rcdata:
-                    in_cdata = True
-                elif in_cdata:
-                    self.serializeError("Unexpected child element of a CDATA element")
-                for (_, attr_name), attr_value in token["data"].items():
-                    # TODO: Add namespace support here
-                    k = attr_name
-                    v = attr_value
-                    yield self.encodeStrict(' ')
-
-                    yield self.encodeStrict(k)
-                    if not self.minimize_boolean_attributes or \
-                        (k not in booleanAttributes.get(name, tuple()) and
-                         k not in booleanAttributes.get("", tuple())):
-                        yield self.encodeStrict("=")
-                        if self.quote_attr_values == "always" or len(v) == 0:
-                            quote_attr = True
-                        elif self.quote_attr_values == "spec":
-                            quote_attr = _quoteAttributeSpec.search(v) is not None
-                        elif self.quote_attr_values == "legacy":
-                            quote_attr = _quoteAttributeLegacy.search(v) is not None
-                        else:
-                            raise ValueError("quote_attr_values must be one of: "
-                                             "'always', 'spec', or 'legacy'")
-                        v = v.replace("&", "&amp;")
-                        if self.escape_lt_in_attrs:
-                            v = v.replace("<", "&lt;")
-                        if quote_attr:
-                            quote_char = self.quote_char
-                            if self.use_best_quote_char:
-                                if "'" in v and '"' not in v:
-                                    quote_char = '"'
-                                elif '"' in v and "'" not in v:
-                                    quote_char = "'"
-                            if quote_char == "'":
-                                v = v.replace("'", "&#39;")
-                            else:
-                                v = v.replace('"', "&quot;")
-                            yield self.encodeStrict(quote_char)
-                            yield self.encode(v)
-                            yield self.encodeStrict(quote_char)
-                        else:
-                            yield self.encode(v)
-                if name in voidElements and self.use_trailing_solidus:
-                    if self.space_before_trailing_solidus:
-                        yield self.encodeStrict(" /")
-                    else:
-                        yield self.encodeStrict("/")
-                yield self.encode(">")
-
-            elif type == "EndTag":
-                name = token["name"]
-                if name in rcdataElements:
-                    in_cdata = False
-                elif in_cdata:
-                    self.serializeError("Unexpected child element of a CDATA element")
-                yield self.encodeStrict("</%s>" % name)
-
-            elif type == "Comment":
-                data = token["data"]
-                if data.find("--") >= 0:
-                    self.serializeError("Comment contains --")
-                yield self.encodeStrict("<!--%s-->" % token["data"])
-
-            elif type == "Entity":
-                name = token["name"]
-                key = name + ";"
-                if key not in entities:
-                    self.serializeError("Entity %s not recognized" % name)
-                if self.resolve_entities and key not in xmlEntities:
-                    data = entities[key]
-                else:
-                    data = "&%s;" % name
-                yield self.encodeStrict(data)
-
-            else:
-                self.serializeError(token["data"])
-
-    def render(self, treewalker, encoding=None):
-        """Serializes the stream from the treewalker into a string
-
-        :arg treewalker: the treewalker to serialize
-
-        :arg encoding: the string encoding to use
-
-        :returns: the serialized tree
-
-        Example:
-
-        >>> from html5lib import parse, getTreeWalker
-        >>> from html5lib.serializer import HTMLSerializer
-        >>> token_stream = parse('<html><body>Hi!</body></html>')
-        >>> walker = getTreeWalker('etree')
-        >>> serializer = HTMLSerializer(omit_optional_tags=False)
-        >>> serializer.render(walker(token_stream))
-        '<html><head></head><body>Hi!</body></html>'
-
-        """
-        if encoding:
-            return b"".join(list(self.serialize(treewalker, encoding)))
-        else:
-            return "".join(list(self.serialize(treewalker)))
-
-    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
-        # XXX The idea is to make data mandatory.
-        self.errors.append(data)
-        if self.strict:
-            raise SerializeError
-
-
-class SerializeError(Exception):
-    """Error in serialized tree"""
-    pass