diff env/lib/python3.7/site-packages/bleach/sanitizer.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
--- a/env/lib/python3.7/site-packages/bleach/sanitizer.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,624 +0,0 @@
-from __future__ import unicode_literals
-
-from itertools import chain
-import re
-
-import six
-from six.moves.urllib.parse import urlparse
-from xml.sax.saxutils import unescape
-
-from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode
-
-
-#: List of allowed tags
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-
-#: Map of allowed attributes by tag
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-#: List of allowed styles
-ALLOWED_STYLES = []
-
-#: List of allowed protocols
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-#: Invisible characters--0 through 31 inclusive, except 9 (tab), 10 (lf), and 13 (cr)
-INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])
-
-#: Regexp for characters that are invisible
-INVISIBLE_CHARACTERS_RE = re.compile(
-    '[' + INVISIBLE_CHARACTERS + ']',
-    re.UNICODE
-)
-
-#: String to replace invisible characters with. This can be a character, a
-#: string, or even a function that takes a Python re matchobj
-INVISIBLE_REPLACEMENT_CHAR = '?'
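# Illustrative sketch, not part of the vendored file above: how these
# constants behave. Each control character other than tab/LF/CR is replaced
# with one '?', the same substitution sanitize_characters() applies below.
from bleach.sanitizer import INVISIBLE_CHARACTERS_RE, INVISIBLE_REPLACEMENT_CHAR

dirty = 'a\x00b\tc\x1fd'
print(INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, dirty))
# expected: 'a?b\tc?d' -- tab (9) survives, NUL (0) and 0x1f become '?'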
-
-
-class Cleaner(object):
-    """Cleaner for cleaning HTML fragments of malicious content
-
-    This cleaner is a security-focused function whose sole purpose is to remove
-    malicious content from a string such that it can be displayed as content in
-    a web page.
-
-    To use::
-
-        from bleach.sanitizer import Cleaner
-
-        cleaner = Cleaner()
-
-        for text in all_the_yucky_things:
-            sanitized = cleaner.clean(text)
-
-    .. Note::
-
-       This cleaner is not designed to transform content for use in
-       non-web-page contexts.
-
-    .. Warning::
-
-       This cleaner is not thread-safe--the html parser has internal state.
-       Create a separate cleaner per thread!
-
-
-    """
-
-    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
-                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
-                 strip_comments=True, filters=None):
-        """Initializes a Cleaner
-
-        :arg list tags: allowed list of tags; defaults to
-            ``bleach.sanitizer.ALLOWED_TAGS``
-
-        :arg dict attributes: allowed attributes; can be a callable, list or dict;
-            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
-
-        :arg list styles: allowed list of css styles; defaults to
-            ``bleach.sanitizer.ALLOWED_STYLES``
-
-        :arg list protocols: allowed list of protocols for links; defaults
-            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
-
-        :arg bool strip: whether or not to strip disallowed elements
-
-        :arg bool strip_comments: whether or not to strip HTML comments
-
-        :arg list filters: list of html5lib Filter classes to pass streamed content through
-
-            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
-
-            .. Warning::
-
-               Using filters changes the output of ``bleach.Cleaner.clean``.
-               Make sure the way the filters change the output is secure.
-
-        """
-        self.tags = tags
-        self.attributes = attributes
-        self.styles = styles
-        self.protocols = protocols
-        self.strip = strip
-        self.strip_comments = strip_comments
-        self.filters = filters or []
-
-        self.parser = html5lib_shim.BleachHTMLParser(
-            tags=self.tags,
-            strip=self.strip,
-            consume_entities=False,
-            namespaceHTMLElements=False
-        )
-        self.walker = html5lib_shim.getTreeWalker('etree')
-        self.serializer = html5lib_shim.BleachHTMLSerializer(
-            quote_attr_values='always',
-            omit_optional_tags=False,
-            escape_lt_in_attrs=True,
-
-            # We want to leave entities as they are without escaping or
-            # resolving or expanding
-            resolve_entities=False,
-
-            # Bleach has its own sanitizer, so don't use the html5lib one
-            sanitize=False,
-
-            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
-            alphabetical_attributes=False,
-        )
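# Illustrative sketch, not part of the vendored file: constructing a Cleaner
# with a custom allow-list. The tags, attributes and protocols below are
# example choices, not the defaults.
from bleach.sanitizer import Cleaner

cleaner = Cleaner(
    tags=['a', 'p'],
    attributes={'a': ['href', 'title']},
    protocols=['https'],
    strip=True,            # drop disallowed tags instead of escaping them
    strip_comments=True,
)
print(cleaner.clean('<p onclick="evil()">hi <a href="https://example.com">x</a></p>'))
# expected: <p>hi <a href="https://example.com">x</a></p>  (onclick is dropped)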
-
-    def clean(self, text):
-        """Cleans text and returns sanitized result as unicode
-
-        :arg str text: text to be cleaned
-
-        :returns: sanitized text as unicode
-
-        :raises TypeError: if ``text`` is not a text type
-
-        """
-        if not isinstance(text, six.string_types):
-            message = "argument cannot be of '{name}' type, must be of text type".format(
-                name=text.__class__.__name__)
-            raise TypeError(message)
-
-        if not text:
-            return ''
-
-        text = force_unicode(text)
-
-        dom = self.parser.parseFragment(text)
-        filtered = BleachSanitizerFilter(
-            source=self.walker(dom),
-
-            # Bleach-sanitizer-specific things
-            attributes=self.attributes,
-            strip_disallowed_elements=self.strip,
-            strip_html_comments=self.strip_comments,
-
-            # html5lib-sanitizer things
-            allowed_elements=self.tags,
-            allowed_css_properties=self.styles,
-            allowed_protocols=self.protocols,
-            allowed_svg_properties=[],
-        )
-
-        # Apply any filters after the BleachSanitizerFilter
-        for filter_class in self.filters:
-            filtered = filter_class(source=filtered)
-
-        return self.serializer.render(filtered)
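# Illustrative sketch, not part of the vendored file: clean() accepts text
# only and escapes anything disallowed by default. Expected outputs follow
# the bleach 3.x behaviour of the code above.
from bleach.sanitizer import Cleaner

cleaner = Cleaner()
print(cleaner.clean('an <script>evil()</script> example'))
# expected: an &lt;script&gt;evil()&lt;/script&gt; example

try:
    cleaner.clean(b'bytes are rejected')
except TypeError as exc:
    print(exc)  # argument cannot be of 'bytes' type, must be of text type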
-
-
-def attribute_filter_factory(attributes):
-    """Generates attribute filter function for the given attributes value
-
-    The attributes value can take one of several shapes. This returns a filter
-    function appropriate to the attributes value. One nice thing about this is
-    that there are fewer if/then shenanigans in the ``allow_token`` method.
-
-    """
-    if callable(attributes):
-        return attributes
-
-    if isinstance(attributes, dict):
-        def _attr_filter(tag, attr, value):
-            if tag in attributes:
-                attr_val = attributes[tag]
-                if callable(attr_val):
-                    return attr_val(tag, attr, value)
-
-                if attr in attr_val:
-                    return True
-
-            if '*' in attributes:
-                attr_val = attributes['*']
-                if callable(attr_val):
-                    return attr_val(tag, attr, value)
-
-                return attr in attr_val
-
-            return False
-
-        return _attr_filter
-
-    if isinstance(attributes, list):
-        def _attr_filter(tag, attr, value):
-            return attr in attributes
-
-        return _attr_filter
-
-    raise ValueError('attributes needs to be a callable, a list or a dict')
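# Illustrative sketch, not part of the vendored file: the three shapes of
# the attributes value and the filter each one produces.
from bleach.sanitizer import attribute_filter_factory

# list: the same attribute names are allowed on every tag
allow = attribute_filter_factory(['title'])
assert allow('a', 'title', 'x') and not allow('a', 'href', 'x')

# dict: per-tag lists, with '*' applying to any tag
allow = attribute_filter_factory({'a': ['href'], '*': ['title']})
assert allow('a', 'href', 'x') and allow('img', 'title', 'x')
assert not allow('img', 'src', 'x')

# callable: full control; it receives (tag, attribute name, value)
allow = attribute_filter_factory(lambda tag, attr, value: attr.startswith('data-'))
assert allow('div', 'data-id', '1') and not allow('div', 'id', '1')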
-
-
-class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
-    """html5lib Filter that sanitizes text
-
-    This filter can be used anywhere html5lib filters can be used.
-
-    """
-    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
-                 strip_disallowed_elements=False, strip_html_comments=True,
-                 **kwargs):
-        """Creates a BleachSanitizerFilter instance
-
-        :arg Treewalker source: stream
-
-        :arg list tags: allowed list of tags; defaults to
-            ``bleach.sanitizer.ALLOWED_TAGS``
-
-        :arg dict attributes: allowed attributes; can be a callable, list or dict;
-            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
-
-        :arg list styles: allowed list of css styles; defaults to
-            ``bleach.sanitizer.ALLOWED_STYLES``
-
-        :arg list protocols: allowed list of protocols for links; defaults
-            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
-
-        :arg bool strip_disallowed_elements: whether or not to strip disallowed
-            elements
-
-        :arg bool strip_html_comments: whether or not to strip HTML comments
-
-        """
-        self.attr_filter = attribute_filter_factory(attributes)
-        self.strip_disallowed_elements = strip_disallowed_elements
-        self.strip_html_comments = strip_html_comments
-
-        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
-
-    def sanitize_stream(self, token_iterator):
-        for token in token_iterator:
-            ret = self.sanitize_token(token)
-
-            if not ret:
-                continue
-
-            if isinstance(ret, list):
-                for subtoken in ret:
-                    yield subtoken
-            else:
-                yield ret
-
-    def merge_characters(self, token_iterator):
-        """Merge consecutive Characters tokens in a stream"""
-        characters_buffer = []
-
-        for token in token_iterator:
-            if characters_buffer:
-                if token['type'] == 'Characters':
-                    characters_buffer.append(token)
-                    continue
-                else:
-                    # Merge all the characters tokens together into one and then
-                    # operate on it.
-                    new_token = {
-                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
-                        'type': 'Characters'
-                    }
-                    characters_buffer = []
-                    yield new_token
-
-            elif token['type'] == 'Characters':
-                characters_buffer.append(token)
-                continue
-
-            yield token
-
-        new_token = {
-            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
-            'type': 'Characters'
-        }
-        yield new_token
-
-    def __iter__(self):
-        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))
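# Illustrative sketch, not part of the vendored file: what merge_characters()
# does to a token stream. The method never touches self, so None stands in
# for the filter instance purely for illustration.
from bleach.sanitizer import BleachSanitizerFilter

tokens = [
    {'type': 'Characters', 'data': 'a'},
    {'type': 'Characters', 'data': ' & '},
    {'type': 'Characters', 'data': 'b'},
    {'type': 'StartTag', 'name': 'em', 'data': {}},
]
print(list(BleachSanitizerFilter.merge_characters(None, iter(tokens))))
# expected: the three Characters tokens collapse into one 'a & b' token,
# followed by the StartTag and a trailing (possibly empty) Characters token.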
-
-    def sanitize_token(self, token):
-        """Sanitize a token either by HTML-encoding or dropping.
-
-        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
-        ['attribute', 'pairs'], 'tag': callable}.
-
-        Here callable is a function that takes the tag, the attribute name, and
-        the attribute value, and returns True or False.
-
-        Also gives the option to strip tags instead of encoding.
-
-        :arg dict token: token to sanitize
-
-        :returns: token or list of tokens
-
-        """
-        token_type = token['type']
-        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
-            if token['name'] in self.allowed_elements:
-                return self.allow_token(token)
-
-            elif self.strip_disallowed_elements:
-                return None
-
-            else:
-                if 'data' in token:
-                    # Alphabetize the attributes before calling .disallowed_token()
-                    # so that the resulting string is stable
-                    token['data'] = alphabetize_attributes(token['data'])
-                return self.disallowed_token(token)
-
-        elif token_type == 'Comment':
-            if not self.strip_html_comments:
-                return token
-            else:
-                return None
-
-        elif token_type == 'Characters':
-            return self.sanitize_characters(token)
-
-        else:
-            return token
-
-    def sanitize_characters(self, token):
-        """Handles Characters tokens
-
-        Our overridden tokenizer doesn't do anything with entities. However,
-        that means that the serializer will convert all ``&`` in Characters
-        tokens to ``&amp;``.
-
-        Since we don't want that, we extract entities here and convert them to
-        Entity tokens so the serializer will let them be.
-
-        :arg token: the Characters token to work on
-
-        :returns: a list of tokens
-
-        """
-        data = token.get('data', '')
-
-        if not data:
-            return token
-
-        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
-        token['data'] = data
-
-        # If there isn't a & in the data, we can return now
-        if '&' not in data:
-            return token
-
-        new_tokens = []
-
-        # For each possible entity that starts with a "&", we try to extract an
-        # actual entity and re-tokenize accordingly
-        for part in html5lib_shim.next_possible_entity(data):
-            if not part:
-                continue
-
-            if part.startswith('&'):
-                entity = html5lib_shim.match_entity(part)
-                if entity is not None:
-                    if entity == 'amp':
-                        # LinkifyFilter can't match urls across token boundaries
-                        # which is problematic with &amp; since that shows up in
-                        # querystrings all the time. This special-cases &amp;
-                        # and converts it to a & character and sticks it in as a
-                        # Characters token. It'll get merged with surrounding
-                        # tokens in the BleachSanitizerFilter.__iter__ and
-                        # escaped in the serializer.
-                        new_tokens.append({'type': 'Characters', 'data': '&'})
-                    else:
-                        new_tokens.append({'type': 'Entity', 'name': entity})
-
-                    # Length of the entity plus 2--one for & at the beginning
-                    # and one for ; at the end
-                    remainder = part[len(entity) + 2:]
-                    if remainder:
-                        new_tokens.append({'type': 'Characters', 'data': remainder})
-                    continue
-
-            new_tokens.append({'type': 'Characters', 'data': part})
-
-        return new_tokens
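# Illustrative sketch, not part of the vendored file: the net effect of the
# entity handling above, seen end to end through Cleaner. Expected outputs
# reflect what this bleach 3.x code is intended to produce.
from bleach.sanitizer import Cleaner

cleaner = Cleaner()
print(cleaner.clean('x & y'))             # expected: x &amp; y   (bare & is escaped)
print(cleaner.clean('x &amp; y'))         # expected: x &amp; y   (not double-escaped)
print(cleaner.clean('price &curren; 5'))  # expected: price &curren; 5 (entity kept as-is)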
-
-    def sanitize_uri_value(self, value, allowed_protocols):
-        """Checks a uri value to see if it's allowed
-
-        :arg value: the uri value to sanitize
-        :arg allowed_protocols: list of allowed protocols
-
-        :returns: allowed value or None
-
-        """
-        # NOTE(willkg): This transforms the value into one that's easier to
-        # match and verify, but shouldn't get returned since it's vastly
-        # different than the original value.
-
-        # Convert all character entities in the value
-        new_value = html5lib_shim.convert_entities(value)
-
-        # Nix backtick, space characters, and control characters
-        new_value = re.sub(
-            r"[`\000-\040\177-\240\s]+",
-            '',
-            new_value
-        )
-
-        # Remove REPLACEMENT characters
-        new_value = new_value.replace('\ufffd', '')
-
-        # Lowercase it--this breaks the value, but makes it easier to match
-        # against
-        new_value = new_value.lower()
-
-        try:
-            # Drop attributes with uri values that have protocols that aren't
-            # allowed
-            parsed = urlparse(new_value)
-        except ValueError:
-            # URI is impossible to parse, therefore it's not allowed
-            return None
-
-        if parsed.scheme:
-            # If urlparse found a scheme, check that
-            if parsed.scheme in allowed_protocols:
-                return value
-
-        else:
-            # Allow uris that are just an anchor
-            if new_value.startswith('#'):
-                return value
-
-            # Handle protocols that urlparse doesn't recognize like "myprotocol"
-            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
-                return value
-
-            # If there's no protocol/scheme specified, then assume it's "http"
-            # and see if that's allowed
-            if 'http' in allowed_protocols:
-                return value
-
-        return None
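# Illustrative sketch, not part of the vendored file: driving
# sanitize_uri_value() directly. It never touches self, so None stands in
# here; in normal use it is called from allow_token().
from bleach.sanitizer import BleachSanitizerFilter

check = BleachSanitizerFilter.sanitize_uri_value
allowed = ['http', 'https']

print(check(None, 'https://example.com/', allowed))  # expected: the value (scheme allowed)
print(check(None, '/relative/path', allowed))        # expected: the value (no scheme, treated as http)
print(check(None, '#fragment', allowed))             # expected: the value (bare anchor)
print(check(None, 'javascript:alert(1)', allowed))   # expected: None (disallowed scheme)
print(check(None, 'jav&#x09;ascript:alert(1)', allowed))
# expected: None -- the entity is decoded and the control char stripped before checking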
-
-    def allow_token(self, token):
-        """Handles the case where we're allowing the tag"""
-        if 'data' in token:
-            # Loop through all the attributes and drop the ones that are not
-            # allowed, are unsafe or break other rules. Additionally, fix
-            # attribute values that need fixing.
-            #
-            # At the end of this loop, we have the final set of attributes
-            # we're keeping.
-            attrs = {}
-            for namespaced_name, val in token['data'].items():
-                namespace, name = namespaced_name
-
-                # Drop attributes that are not explicitly allowed
-                #
-                # NOTE(willkg): We pass in the attribute name--not a namespaced
-                # name.
-                if not self.attr_filter(token['name'], name, val):
-                    continue
-
-                # Drop attributes with uri values that use a disallowed protocol
-                # Sanitize attributes with uri values
-                if namespaced_name in self.attr_val_is_uri:
-                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
-                    if new_value is None:
-                        continue
-                    val = new_value
-
-                # Drop values in svg attrs with non-local IRIs
-                if namespaced_name in self.svg_attr_val_allows_ref:
-                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                     ' ',
-                                     unescape(val))
-                    new_val = new_val.strip()
-                    if not new_val:
-                        continue
-
-                    else:
-                        # Replace the val with the unescaped version because
-                        # it's an IRI
-                        val = new_val
-
-                # Drop href and xlink:href attr for svg elements with non-local IRIs
-                if (None, token['name']) in self.svg_allow_local_href:
-                    if namespaced_name in [
-                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
-                    ]:
-                        if re.search(r'^\s*[^#\s]', val):
-                            continue
-
-                # If it's a style attribute, sanitize it
-                if namespaced_name == (None, 'style'):
-                    val = self.sanitize_css(val)
-
-                # At this point, we want to keep the attribute, so add it in
-                attrs[namespaced_name] = val
-
-            token['data'] = alphabetize_attributes(attrs)
-
-        return token
-
-    def disallowed_token(self, token):
-        token_type = token["type"]
-        if token_type == "EndTag":
-            token["data"] = "</%s>" % token["name"]
-
-        elif token["data"]:
-            assert token_type in ("StartTag", "EmptyTag")
-            attrs = []
-            for (ns, name), v in token["data"].items():
-                # If we end up with a namespace, but no name, switch them so we
-                # have a valid name to use.
-                if ns and not name:
-                    ns, name = name, ns
-
-                # Figure out namespaced name if the namespace is appropriate
-                # and exists; if the ns isn't in prefixes, then drop it.
-                if ns is None or ns not in html5lib_shim.prefixes:
-                    namespaced_name = name
-                else:
-                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)
-
-                attrs.append(' %s="%s"' % (
-                    namespaced_name,
-                    # NOTE(willkg): HTMLSerializer escapes attribute values
-                    # already, so if we do it here (like HTMLSerializer does),
-                    # then we end up double-escaping.
-                    v)
-                )
-            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
-
-        else:
-            token["data"] = "<%s>" % token["name"]
-
-        if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
-
-        token["type"] = "Characters"
-
-        del token["name"]
-        return token
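# Illustrative sketch, not part of the vendored file: what disallowed_token()
# means in practice. With strip=False (the default) a disallowed tag is
# re-serialized as text and escaped; with strip=True it is dropped.
from bleach.sanitizer import Cleaner

keep = Cleaner(tags=['b'], strip=False)
drop = Cleaner(tags=['b'], strip=True)

print(keep.clean('<span class="x">hi</span> <b>ok</b>'))
# expected: &lt;span class="x"&gt;hi&lt;/span&gt; <b>ok</b>
print(drop.clean('<span class="x">hi</span> <b>ok</b>'))
# expected: hi <b>ok</b>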
-
-    def sanitize_css(self, style):
-        """Sanitizes css in style tags"""
-        # Convert entities in the style so that it can be parsed as CSS
-        style = html5lib_shim.convert_entities(style)
-
-        # Drop any url values before we do anything else
-        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # The gauntlet of sanitization
-
-        # Validate the css in the style tag and if it's not valid, then drop
-        # the whole thing.
-        parts = style.split(';')
-        gauntlet = re.compile(
-            r"""^(  # consider a style attribute value as composed of:
-[/:,#%!.\s\w]    # a non-newline character
-|\w-\w           # 3 characters in the form \w-\w
-|'[\s\w]+'\s*    # a single quoted string of [\s\w]+ with trailing space
-|"[\s\w]+"       # a double quoted string of [\s\w]+
-|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
-)*$""",
-            flags=re.U | re.VERBOSE
-        )
-
-        for part in parts:
-            if not gauntlet.match(part):
-                return ''
-
-        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
-            if not value:
-                continue
-
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
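# Illustrative sketch, not part of the vendored file: style sanitization end
# to end. 'style' must be an allowed attribute and each CSS property must be
# listed in styles= for it to survive sanitize_css().
from bleach.sanitizer import Cleaner

cleaner = Cleaner(
    tags=['p'],
    attributes={'p': ['style']},
    styles=['color'],
)
print(cleaner.clean('<p style="color: red; font-weight: bold">x</p>'))
# expected: <p style="color: red;">x</p>  (font-weight is not in the allow-list)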