diff env/lib/python3.7/site-packages/bleach/sanitizer.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
| author | shellac |
| --- | --- |
| date | Mon, 01 Jun 2020 08:59:25 -0400 |
| parents | 79f47841a781 |
| children | |
--- a/env/lib/python3.7/site-packages/bleach/sanitizer.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,624 +0,0 @@
-from __future__ import unicode_literals
-
-from itertools import chain
-import re
-
-import six
-from six.moves.urllib.parse import urlparse
-from xml.sax.saxutils import unescape
-
-from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode
-
-
-#: List of allowed tags
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-
-#: Map of allowed attributes by tag
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-#: List of allowed styles
-ALLOWED_STYLES = []
-
-#: List of allowed protocols
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
-INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])
-
-#: Regexp for characters that are invisible
-INVISIBLE_CHARACTERS_RE = re.compile(
-    '[' + INVISIBLE_CHARACTERS + ']',
-    re.UNICODE
-)
-
-#: String to replace invisible characters with. This can be a character, a
-#: string, or even a function that takes a Python re matchobj
-INVISIBLE_REPLACEMENT_CHAR = '?'
-
-
-class Cleaner(object):
-    """Cleaner for cleaning HTML fragments of malicious content
-
-    This cleaner is a security-focused function whose sole purpose is to remove
-    malicious content from a string such that it can be displayed as content in
-    a web page.
-
-    To use::
-
-        from bleach.sanitizer import Cleaner
-
-        cleaner = Cleaner()
-
-        for text in all_the_yucky_things:
-            sanitized = cleaner.clean(text)
-
-    .. Note::
-
-       This cleaner is not designed to be used to transform content for
-       non-web-page contexts.
-
-    .. Warning::
-
-       This cleaner is not thread-safe--the html parser has internal state.
-       Create a separate cleaner per thread!
-
-
-    """
-
-    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
-                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
-                 strip_comments=True, filters=None):
-        """Initializes a Cleaner
-
-        :arg list tags: allowed list of tags; defaults to
-            ``bleach.sanitizer.ALLOWED_TAGS``
-
-        :arg dict attributes: allowed attributes; can be a callable, list or dict;
-            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
-
-        :arg list styles: allowed list of css styles; defaults to
-            ``bleach.sanitizer.ALLOWED_STYLES``
-
-        :arg list protocols: allowed list of protocols for links; defaults
-            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
-
-        :arg bool strip: whether or not to strip disallowed elements
-
-        :arg bool strip_comments: whether or not to strip HTML comments
-
-        :arg list filters: list of html5lib Filter classes to pass streamed content through
-
-            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
-
-            .. Warning::
-
-               Using filters changes the output of ``bleach.Cleaner.clean``.
-               Make sure the way the filters change the output is secure.
- - """ - self.tags = tags - self.attributes = attributes - self.styles = styles - self.protocols = protocols - self.strip = strip - self.strip_comments = strip_comments - self.filters = filters or [] - - self.parser = html5lib_shim.BleachHTMLParser( - tags=self.tags, - strip=self.strip, - consume_entities=False, - namespaceHTMLElements=False - ) - self.walker = html5lib_shim.getTreeWalker('etree') - self.serializer = html5lib_shim.BleachHTMLSerializer( - quote_attr_values='always', - omit_optional_tags=False, - escape_lt_in_attrs=True, - - # We want to leave entities as they are without escaping or - # resolving or expanding - resolve_entities=False, - - # Bleach has its own sanitizer, so don't use the html5lib one - sanitize=False, - - # Bleach sanitizer alphabetizes already, so don't use the html5lib one - alphabetical_attributes=False, - ) - - def clean(self, text): - """Cleans text and returns sanitized result as unicode - - :arg str text: text to be cleaned - - :returns: sanitized text as unicode - - :raises TypeError: if ``text`` is not a text type - - """ - if not isinstance(text, six.string_types): - message = "argument cannot be of '{name}' type, must be of text type".format( - name=text.__class__.__name__) - raise TypeError(message) - - if not text: - return '' - - text = force_unicode(text) - - dom = self.parser.parseFragment(text) - filtered = BleachSanitizerFilter( - source=self.walker(dom), - - # Bleach-sanitizer-specific things - attributes=self.attributes, - strip_disallowed_elements=self.strip, - strip_html_comments=self.strip_comments, - - # html5lib-sanitizer things - allowed_elements=self.tags, - allowed_css_properties=self.styles, - allowed_protocols=self.protocols, - allowed_svg_properties=[], - ) - - # Apply any filters after the BleachSanitizerFilter - for filter_class in self.filters: - filtered = filter_class(source=filtered) - - return self.serializer.render(filtered) - - -def attribute_filter_factory(attributes): - """Generates attribute filter function for the given attributes value - - The attributes value can take one of several shapes. This returns a filter - function appropriate to the attributes value. One nice thing about this is - that there's less if/then shenanigans in the ``allow_token`` method. - - """ - if callable(attributes): - return attributes - - if isinstance(attributes, dict): - def _attr_filter(tag, attr, value): - if tag in attributes: - attr_val = attributes[tag] - if callable(attr_val): - return attr_val(tag, attr, value) - - if attr in attr_val: - return True - - if '*' in attributes: - attr_val = attributes['*'] - if callable(attr_val): - return attr_val(tag, attr, value) - - return attr in attr_val - - return False - - return _attr_filter - - if isinstance(attributes, list): - def _attr_filter(tag, attr, value): - return attr in attributes - - return _attr_filter - - raise ValueError('attributes needs to be a callable, a list or a dict') - - -class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): - """html5lib Filter that sanitizes text - - This filter can be used anywhere html5lib filters can be used. 
- - """ - def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, - strip_disallowed_elements=False, strip_html_comments=True, - **kwargs): - """Creates a BleachSanitizerFilter instance - - :arg Treewalker source: stream - - :arg list tags: allowed list of tags; defaults to - ``bleach.sanitizer.ALLOWED_TAGS`` - - :arg dict attributes: allowed attributes; can be a callable, list or dict; - defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` - - :arg list styles: allowed list of css styles; defaults to - ``bleach.sanitizer.ALLOWED_STYLES`` - - :arg list protocols: allowed list of protocols for links; defaults - to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` - - :arg bool strip_disallowed_elements: whether or not to strip disallowed - elements - - :arg bool strip_html_comments: whether or not to strip HTML comments - - """ - self.attr_filter = attribute_filter_factory(attributes) - self.strip_disallowed_elements = strip_disallowed_elements - self.strip_html_comments = strip_html_comments - - return super(BleachSanitizerFilter, self).__init__(source, **kwargs) - - def sanitize_stream(self, token_iterator): - for token in token_iterator: - ret = self.sanitize_token(token) - - if not ret: - continue - - if isinstance(ret, list): - for subtoken in ret: - yield subtoken - else: - yield ret - - def merge_characters(self, token_iterator): - """Merge consecutive Characters tokens in a stream""" - characters_buffer = [] - - for token in token_iterator: - if characters_buffer: - if token['type'] == 'Characters': - characters_buffer.append(token) - continue - else: - # Merge all the characters tokens together into one and then - # operate on it. - new_token = { - 'data': ''.join([char_token['data'] for char_token in characters_buffer]), - 'type': 'Characters' - } - characters_buffer = [] - yield new_token - - elif token['type'] == 'Characters': - characters_buffer.append(token) - continue - - yield token - - new_token = { - 'data': ''.join([char_token['data'] for char_token in characters_buffer]), - 'type': 'Characters' - } - yield new_token - - def __iter__(self): - return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self))) - - def sanitize_token(self, token): - """Sanitize a token either by HTML-encoding or dropping. - - Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': - ['attribute', 'pairs'], 'tag': callable}. - - Here callable is a function with two arguments of attribute name and - value. It should return true of false. - - Also gives the option to strip tags instead of encoding. - - :arg dict token: token to sanitize - - :returns: token or list of tokens - - """ - token_type = token['type'] - if token_type in ['StartTag', 'EndTag', 'EmptyTag']: - if token['name'] in self.allowed_elements: - return self.allow_token(token) - - elif self.strip_disallowed_elements: - return None - - else: - if 'data' in token: - # Alphabetize the attributes before calling .disallowed_token() - # so that the resulting string is stable - token['data'] = alphabetize_attributes(token['data']) - return self.disallowed_token(token) - - elif token_type == 'Comment': - if not self.strip_html_comments: - return token - else: - return None - - elif token_type == 'Characters': - return self.sanitize_characters(token) - - else: - return token - - def sanitize_characters(self, token): - """Handles Characters tokens - - Our overridden tokenizer doesn't do anything with entities. However, - that means that the serializer will convert all ``&`` in Characters - tokens to ``&``. 
-
-        Since we don't want that, we extract entities here and convert them to
-        Entity tokens so the serializer will let them be.
-
-        :arg token: the Characters token to work on
-
-        :returns: a list of tokens
-
-        """
-        data = token.get('data', '')
-
-        if not data:
-            return token
-
-        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
-        token['data'] = data
-
-        # If there isn't a & in the data, we can return now
-        if '&' not in data:
-            return token
-
-        new_tokens = []
-
-        # For each possible entity that starts with a "&", we try to extract an
-        # actual entity and re-tokenize accordingly
-        for part in html5lib_shim.next_possible_entity(data):
-            if not part:
-                continue
-
-            if part.startswith('&'):
-                entity = html5lib_shim.match_entity(part)
-                if entity is not None:
-                    if entity == 'amp':
-                        # LinkifyFilter can't match urls across token boundaries
-                        # which is problematic with &amp; since that shows up in
-                        # querystrings all the time. This special-cases &amp;
-                        # and converts it to a & and sticks it in as a
-                        # Characters token. It'll get merged with surrounding
-                        # tokens in the BleachSanitizerfilter.__iter__ and
-                        # escaped in the serializer.
-                        new_tokens.append({'type': 'Characters', 'data': '&'})
-                    else:
-                        new_tokens.append({'type': 'Entity', 'name': entity})
-
-                    # Length of the entity plus 2--one for & at the beginning
-                    # and one for ; at the end
-                    remainder = part[len(entity) + 2:]
-                    if remainder:
-                        new_tokens.append({'type': 'Characters', 'data': remainder})
-                    continue
-
-            new_tokens.append({'type': 'Characters', 'data': part})
-
-        return new_tokens
-
-    def sanitize_uri_value(self, value, allowed_protocols):
-        """Checks a uri value to see if it's allowed
-
-        :arg value: the uri value to sanitize
-        :arg allowed_protocols: list of allowed protocols
-
-        :returns: allowed value or None
-
-        """
-        # NOTE(willkg): This transforms the value into one that's easier to
-        # match and verify, but shouldn't get returned since it's vastly
-        # different than the original value.
-
-        # Convert all character entities in the value
-        new_value = html5lib_shim.convert_entities(value)
-
-        # Nix backtick, space characters, and control characters
-        new_value = re.sub(
-            r"[`\000-\040\177-\240\s]+",
-            '',
-            new_value
-        )
-
-        # Remove REPLACEMENT characters
-        new_value = new_value.replace('\ufffd', '')
-
-        # Lowercase it--this breaks the value, but makes it easier to match
-        # against
-        new_value = new_value.lower()
-
-        try:
-            # Drop attributes with uri values that have protocols that aren't
-            # allowed
-            parsed = urlparse(new_value)
-        except ValueError:
-            # URI is impossible to parse, therefore it's not allowed
-            return None
-
-        if parsed.scheme:
-            # If urlparse found a scheme, check that
-            if parsed.scheme in allowed_protocols:
-                return value
-
-        else:
-            # Allow uris that are just an anchor
-            if new_value.startswith('#'):
-                return value
-
-            # Handle protocols that urlparse doesn't recognize like "myprotocol"
-            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
-                return value
-
-            # If there's no protocol/scheme specified, then assume it's "http"
-            # and see if that's allowed
-            if 'http' in allowed_protocols:
-                return value
-
-        return None
-
-    def allow_token(self, token):
-        """Handles the case where we're allowing the tag"""
-        if 'data' in token:
-            # Loop through all the attributes and drop the ones that are not
-            # allowed, are unsafe or break other rules. Additionally, fix
-            # attribute values that need fixing.
-            #
-            # At the end of this loop, we have the final set of attributes
-            # we're keeping.
-            attrs = {}
-            for namespaced_name, val in token['data'].items():
-                namespace, name = namespaced_name
-
-                # Drop attributes that are not explicitly allowed
-                #
-                # NOTE(willkg): We pass in the attribute name--not a namespaced
-                # name.
-                if not self.attr_filter(token['name'], name, val):
-                    continue
-
-                # Drop attributes with uri values that use a disallowed protocol
-                # Sanitize attributes with uri values
-                if namespaced_name in self.attr_val_is_uri:
-                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
-                    if new_value is None:
-                        continue
-                    val = new_value
-
-                # Drop values in svg attrs with non-local IRIs
-                if namespaced_name in self.svg_attr_val_allows_ref:
-                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                     ' ',
-                                     unescape(val))
-                    new_val = new_val.strip()
-                    if not new_val:
-                        continue
-
-                    else:
-                        # Replace the val with the unescaped version because
-                        # it's an iri
-                        val = new_val
-
-                # Drop href and xlink:href attr for svg elements with non-local IRIs
-                if (None, token['name']) in self.svg_allow_local_href:
-                    if namespaced_name in [
-                        (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
-                    ]:
-                        if re.search(r'^\s*[^#\s]', val):
-                            continue
-
-                # If it's a style attribute, sanitize it
-                if namespaced_name == (None, 'style'):
-                    val = self.sanitize_css(val)
-
-                # At this point, we want to keep the attribute, so add it in
-                attrs[namespaced_name] = val
-
-            token['data'] = alphabetize_attributes(attrs)
-
-        return token
-
-    def disallowed_token(self, token):
-        token_type = token["type"]
-        if token_type == "EndTag":
-            token["data"] = "</%s>" % token["name"]
-
-        elif token["data"]:
-            assert token_type in ("StartTag", "EmptyTag")
-            attrs = []
-            for (ns, name), v in token["data"].items():
-                # If we end up with a namespace, but no name, switch them so we
-                # have a valid name to use.
-                if ns and not name:
-                    ns, name = name, ns
-
-                # Figure out namespaced name if the namespace is appropriate
-                # and exists; if the ns isn't in prefixes, then drop it.
-                if ns is None or ns not in html5lib_shim.prefixes:
-                    namespaced_name = name
-                else:
-                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)
-
-                attrs.append(' %s="%s"' % (
-                    namespaced_name,
-                    # NOTE(willkg): HTMLSerializer escapes attribute values
-                    # already, so if we do it here (like HTMLSerializer does),
-                    # then we end up double-escaping.
-                    v)
-                )
-            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
-
-        else:
-            token["data"] = "<%s>" % token["name"]
-
-        if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
-
-        token["type"] = "Characters"
-
-        del token["name"]
-        return token
-
-    def sanitize_css(self, style):
-        """Sanitizes css in style tags"""
-        # Convert entities in the style so that it can be parsed as CSS
-        style = html5lib_shim.convert_entities(style)
-
-        # Drop any url values before we do anything else
-        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # The gauntlet of sanitization
-
-        # Validate the css in the style tag and if it's not valid, then drop
-        # the whole thing.
-        parts = style.split(';')
-        gauntlet = re.compile(
-            r"""^(  # consider a style attribute value as composed of:
-[/:,#%!.\s\w]       # a non-newline character
-|\w-\w              # 3 characters in the form \w-\w
-|'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
-|"[\s\w]+"          # a double quoted string of [\s\w]+
-|\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
-)*$""",
-            flags=re.U | re.VERBOSE
-        )
-
-        for part in parts:
-            if not gauntlet.match(part):
-                return ''
-
-        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
-            if not value:
-                continue
-
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
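
For context, here is a minimal usage sketch of the API this deleted module provided; the sketch is mine, not part of the changeset. The dict and callable `attributes` shapes and the `(tag, attr, value)` filter signature come from `attribute_filter_factory` above, and the `filters=[partial(LinkifyFilter, ...)]` pattern is the one the `filters` docstring points at. The tag names, sample markup, and the `allow_safe_img_src` helper are illustrative, not from the diff.

```python
from functools import partial

from bleach.linkifier import LinkifyFilter
from bleach.sanitizer import Cleaner


def allow_safe_img_src(tag, attr, value):
    # Callable attribute filter with the (tag, attr, value) -> bool
    # signature that attribute_filter_factory expects: keep only http(s)
    # image sources, drop every other <img> attribute.
    return attr == 'src' and value.startswith(('http:', 'https:'))


cleaner = Cleaner(
    tags=['a', 'img', 'p', 'pre'],
    attributes={
        'a': ['href', 'title'],     # list form: per-tag allow-list
        'img': allow_safe_img_src,  # callable form
    },
    protocols=['http', 'https'],    # e.g. javascript: hrefs get dropped
    strip=True,                     # strip disallowed tags instead of escaping them
    # Streamed filters run after BleachSanitizerFilter; LinkifyFilter is
    # bleach's own filter for turning bare URLs into links.
    filters=[partial(LinkifyFilter, skip_tags=['pre'])],
)

# onclick is not in the allow-list for <p>, so it is dropped; the https
# img src passes the callable filter and survives.
print(cleaner.clean('<p onclick="evil()">hi <img src="https://example.com/x.png"></p>'))
```

Per the warning in the `Cleaner` docstring, instances are not thread-safe, so code like this should build one `Cleaner` per thread.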