from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.
        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

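        For example, with the default configuration a disallowed tag is
        escaped rather than removed::

            cleaner = Cleaner()
            cleaner.clean('<script>alert(1)</script>')
            # '&lt;script&gt;alert(1)&lt;/script&gt;'
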
        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

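    For example, each of these is a valid attributes value (the specific
    names are illustrative)::

        # dict: attribute lists or callables keyed by tag; '*' matches any tag
        {'a': ['href', 'title'], '*': ['class']}

        # list: attribute names allowed on every tag
        ['href', 'title']

        # callable: called as fn(tag, attr, value); returns True to keep
        lambda tag, attr, value: attr == 'title'
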
    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
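        # For example, adjacent tokens {'type': 'Characters', 'data': 'a'} and
        # {'type': 'Characters', 'data': 'b'} come out as a single
        # {'type': 'Characters', 'data': 'ab'} token.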
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        # Flush anything remaining in the buffer at the end of the stream; an
        # empty Characters token here is harmless to the serializer
        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token

    def __iter__(self):
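        # Pipeline: walk the source tokens (html5lib_shim.Filter.__iter__),
        # sanitize each token, then merge adjacent Characters tokens so
        # downstream filters and the serializer see contiguous text.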
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, attribute name, and
        attribute value, and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

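        For example (illustrative), data of ``'a &lt; b'`` comes back as a
        Characters token for ``'a '``, an Entity token named ``lt``, and a
        Characters token for ``' b'``; ``&amp;`` is special-cased below and
        becomes a plain ``&`` Characters token instead.
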
        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

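        For example (illustrative), with ``allowed_protocols=['http', 'https']``,
        ``'https://example.com/'`` is returned as-is, ``'/relative/path'`` is
        allowed via the assume-it's-http fallback, and ``'javascript:alert(1)'``
        returns None.
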
        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
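        """Turns a disallowed tag back into text

        For example (illustrative), a disallowed ``script`` StartTag token
        becomes a Characters token with data ``'<script>'``, which the
        serializer then escapes on output.
        """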
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
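        # For example (illustrative), with allowed_css_properties=['color'] and
        # no allowed svg properties, the value 'color: red; width: 50px' comes
        # back as 'color: red;'.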
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
            [/:,#%!.\s\w]     # a non-newline character
            |\w-\w            # 3 characters in the form \w-\w
            |'[\s\w]+'\s*     # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+"        # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\)  # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            )*$""",
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)