diff env/lib/python3.7/site-packages/bleach/sanitizer.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.7/site-packages/bleach/sanitizer.py	Sat May 02 07:14:21 2020 -0400
@@ -0,0 +1,624 @@

from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.
+ + """ + self.tags = tags + self.attributes = attributes + self.styles = styles + self.protocols = protocols + self.strip = strip + self.strip_comments = strip_comments + self.filters = filters or [] + + self.parser = html5lib_shim.BleachHTMLParser( + tags=self.tags, + strip=self.strip, + consume_entities=False, + namespaceHTMLElements=False + ) + self.walker = html5lib_shim.getTreeWalker('etree') + self.serializer = html5lib_shim.BleachHTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + escape_lt_in_attrs=True, + + # We want to leave entities as they are without escaping or + # resolving or expanding + resolve_entities=False, + + # Bleach has its own sanitizer, so don't use the html5lib one + sanitize=False, + + # Bleach sanitizer alphabetizes already, so don't use the html5lib one + alphabetical_attributes=False, + ) + + def clean(self, text): + """Cleans text and returns sanitized result as unicode + + :arg str text: text to be cleaned + + :returns: sanitized text as unicode + + :raises TypeError: if ``text`` is not a text type + + """ + if not isinstance(text, six.string_types): + message = "argument cannot be of '{name}' type, must be of text type".format( + name=text.__class__.__name__) + raise TypeError(message) + + if not text: + return '' + + text = force_unicode(text) + + dom = self.parser.parseFragment(text) + filtered = BleachSanitizerFilter( + source=self.walker(dom), + + # Bleach-sanitizer-specific things + attributes=self.attributes, + strip_disallowed_elements=self.strip, + strip_html_comments=self.strip_comments, + + # html5lib-sanitizer things + allowed_elements=self.tags, + allowed_css_properties=self.styles, + allowed_protocols=self.protocols, + allowed_svg_properties=[], + ) + + # Apply any filters after the BleachSanitizerFilter + for filter_class in self.filters: + filtered = filter_class(source=filtered) + + return self.serializer.render(filtered) + + +def attribute_filter_factory(attributes): + """Generates attribute filter function for the given attributes value + + The attributes value can take one of several shapes. This returns a filter + function appropriate to the attributes value. One nice thing about this is + that there's less if/then shenanigans in the ``allow_token`` method. + + """ + if callable(attributes): + return attributes + + if isinstance(attributes, dict): + def _attr_filter(tag, attr, value): + if tag in attributes: + attr_val = attributes[tag] + if callable(attr_val): + return attr_val(tag, attr, value) + + if attr in attr_val: + return True + + if '*' in attributes: + attr_val = attributes['*'] + if callable(attr_val): + return attr_val(tag, attr, value) + + return attr in attr_val + + return False + + return _attr_filter + + if isinstance(attributes, list): + def _attr_filter(tag, attr, value): + return attr in attributes + + return _attr_filter + + raise ValueError('attributes needs to be a callable, a list or a dict') + + +class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): + """html5lib Filter that sanitizes text + + This filter can be used anywhere html5lib filters can be used. 
+ + """ + def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, + strip_disallowed_elements=False, strip_html_comments=True, + **kwargs): + """Creates a BleachSanitizerFilter instance + + :arg Treewalker source: stream + + :arg list tags: allowed list of tags; defaults to + ``bleach.sanitizer.ALLOWED_TAGS`` + + :arg dict attributes: allowed attributes; can be a callable, list or dict; + defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` + + :arg list styles: allowed list of css styles; defaults to + ``bleach.sanitizer.ALLOWED_STYLES`` + + :arg list protocols: allowed list of protocols for links; defaults + to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` + + :arg bool strip_disallowed_elements: whether or not to strip disallowed + elements + + :arg bool strip_html_comments: whether or not to strip HTML comments + + """ + self.attr_filter = attribute_filter_factory(attributes) + self.strip_disallowed_elements = strip_disallowed_elements + self.strip_html_comments = strip_html_comments + + return super(BleachSanitizerFilter, self).__init__(source, **kwargs) + + def sanitize_stream(self, token_iterator): + for token in token_iterator: + ret = self.sanitize_token(token) + + if not ret: + continue + + if isinstance(ret, list): + for subtoken in ret: + yield subtoken + else: + yield ret + + def merge_characters(self, token_iterator): + """Merge consecutive Characters tokens in a stream""" + characters_buffer = [] + + for token in token_iterator: + if characters_buffer: + if token['type'] == 'Characters': + characters_buffer.append(token) + continue + else: + # Merge all the characters tokens together into one and then + # operate on it. + new_token = { + 'data': ''.join([char_token['data'] for char_token in characters_buffer]), + 'type': 'Characters' + } + characters_buffer = [] + yield new_token + + elif token['type'] == 'Characters': + characters_buffer.append(token) + continue + + yield token + + new_token = { + 'data': ''.join([char_token['data'] for char_token in characters_buffer]), + 'type': 'Characters' + } + yield new_token + + def __iter__(self): + return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self))) + + def sanitize_token(self, token): + """Sanitize a token either by HTML-encoding or dropping. + + Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': + ['attribute', 'pairs'], 'tag': callable}. + + Here callable is a function with two arguments of attribute name and + value. It should return true of false. + + Also gives the option to strip tags instead of encoding. + + :arg dict token: token to sanitize + + :returns: token or list of tokens + + """ + token_type = token['type'] + if token_type in ['StartTag', 'EndTag', 'EmptyTag']: + if token['name'] in self.allowed_elements: + return self.allow_token(token) + + elif self.strip_disallowed_elements: + return None + + else: + if 'data' in token: + # Alphabetize the attributes before calling .disallowed_token() + # so that the resulting string is stable + token['data'] = alphabetize_attributes(token['data']) + return self.disallowed_token(token) + + elif token_type == 'Comment': + if not self.strip_html_comments: + return token + else: + return None + + elif token_type == 'Characters': + return self.sanitize_characters(token) + + else: + return token + + def sanitize_characters(self, token): + """Handles Characters tokens + + Our overridden tokenizer doesn't do anything with entities. However, + that means that the serializer will convert all ``&`` in Characters + tokens to ``&``. 
        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an iri
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
              [/:,#%!.\s\w]       # a non-newline character
              |\w-\w              # 3 characters in the form \w-\w
              |'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
              |"[\s\w]+"          # a double quoted string of [\s\w]+
              |\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            )*$""",
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
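
As context for the file above: the ``attributes`` value accepted by ``Cleaner`` (and routed through ``attribute_filter_factory``) can be a list, a dict keyed by tag name with ``'*'`` as a fallback, or a callable taking ``(tag, attr, value)``. The sketch below exercises the dict-with-callable shape; it is illustrative only and not part of the diff. The tag list, the ``allow_footnote_links`` helper, and the sample input are assumptions made up for the example.

    # Usage sketch (illustrative, not part of the file above). The helper
    # name and the chosen tags/attributes are hypothetical examples.
    from bleach.sanitizer import Cleaner


    def allow_footnote_links(tag, attr, value):
        # Callable form: receives (tag, attr, value) and returns True to
        # keep the attribute, False to drop it.
        return attr == 'href' and value.startswith('#fn-')


    cleaner = Cleaner(
        tags=['a', 'p', 'sup'],
        attributes={
            'a': allow_footnote_links,  # per-tag callable decides for <a>
            '*': ['class'],             # consulted for tags without their own entry
        },
        strip=True,  # drop disallowed tags instead of escaping them
    )

    # 'onclick' is rejected by the filter; the anchor-only href '#fn-1'
    # passes sanitize_uri_value, so it survives.
    print(cleaner.clean('<a href="#fn-1" onclick="alert(1)">1</a>'))
    # Expected output: <a href="#fn-1">1</a>

Note that in the dict form a per-tag callable decides outright for its tag; the ``'*'`` entry is only reached when the tag's own entry didn't already allow the attribute, as the ``_attr_filter`` closure in ``attribute_filter_factory`` shows.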