from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.
        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

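        For example, with the default configuration a disallowed tag is
        escaped rather than removed::

            cleaner = Cleaner()
            cleaner.clean('<script>alert(1)</script>')
            # '&lt;script&gt;alert(1)&lt;/script&gt;'
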
        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

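    For example, each of these is a valid attributes value (the specific
    names are illustrative)::

        # dict: attribute lists or callables keyed by tag; '*' matches any tag
        {'a': ['href', 'title'], '*': ['class']}

        # list: attribute names allowed on every tag
        ['href', 'title']

        # callable: called as fn(tag, attr, value); returns True to keep
        lambda tag, attr, value: attr == 'title'
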
    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
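        # For example, adjacent tokens {'type': 'Characters', 'data': 'a'} and
        # {'type': 'Characters', 'data': 'b'} come out as a single
        # {'type': 'Characters', 'data': 'ab'} token.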
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        # Flush anything remaining in the buffer at the end of the stream; an
        # empty Characters token here is harmless to the serializer
        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token

    def __iter__(self):
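        # Pipeline: walk the source tokens (html5lib_shim.Filter.__iter__),
        # sanitize each token, then merge adjacent Characters tokens so
        # downstream filters and the serializer see contiguous text.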
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, attribute name, and
        attribute value, and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

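        For example (illustrative), data of ``'a &lt; b'`` comes back as a
        Characters token for ``'a '``, an Entity token named ``lt``, and a
        Characters token for ``' b'``; ``&amp;`` is special-cased below and
        becomes a plain ``&`` Characters token instead.
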
        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

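        For example (illustrative), with ``allowed_protocols=['http', 'https']``,
        ``'https://example.com/'`` is returned as-is, ``'/relative/path'`` is
        allowed via the assume-it's-http fallback, and ``'javascript:alert(1)'``
        returns None.
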
        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
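        """Turns a disallowed tag back into text

        For example (illustrative), a disallowed ``script`` StartTag token
        becomes a Characters token with data ``'<script>'``, which the
        serializer then escapes on output.
        """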
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
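        # For example (illustrative), with allowed_css_properties=['color'] and
        # no allowed svg properties, the value 'color: red; width: 50px' comes
        # back as 'color: red;'.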
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
            [/:,#%!.\s\w]     # a non-newline character
            |\w-\w            # 3 characters in the form \w-\w
            |'[\s\w]+'\s*     # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+"        # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\)  # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            )*$""",
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)