from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--code points 0 through 31, except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'
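
# A hedged doctest-style sketch (illustrative, not executed): the two
# constants above combine to replace invisible control characters in text.
#
#     >>> INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, 'a\x08b')
#     'a?b'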


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

        This cleaner is not designed to transform content for use in
        non-web-page contexts.

    .. Warning::

        This cleaner is not thread-safe--the html parser has internal state.
        Create a separate cleaner per thread!

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

            Using filters changes the output of ``bleach.Cleaner.clean``.
            Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )
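
    # A hedged sketch of the ``filters`` hook (illustrative, not part of this
    # module): each filter class gets wrapped around the sanitized token
    # stream in clean(), so bleach's own LinkifyFilter, for example, can
    # linkify text after sanitizing.
    #
    #     from bleach.linkifier import LinkifyFilter
    #     cleaner = Cleaner(tags=['a', 'p'], filters=[LinkifyFilter])
    #     cleaner.clean('<p>see http://example.com</p>')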

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
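
# A hedged doctest-style sketch of Cleaner.clean with the defaults
# (illustrative, not executed): a disallowed tag is escaped rather than
# stripped, and bytes input raises TypeError.
#
#     >>> cleaner = Cleaner()
#     >>> cleaner.clean('<script>evil()</script>')
#     '&lt;script&gt;evil()&lt;/script&gt;'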


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')
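
# Hedged examples of the three shapes ``attributes`` can take (illustrative;
# ``only_https`` is a hypothetical callable, not part of this module):
#
#     attribute_filter_factory(['href', 'title'])             # list: any tag
#     attribute_filter_factory({'a': ['href'], '*': ['id']})  # dict: per tag,
#                                                             # '*' = all tags
#
#     def only_https(tag, attr, value):
#         return attr == 'href' and value.startswith('https:')
#
#     attribute_filter_factory(only_https)                    # callable: used as-is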


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token
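
    # A hedged sketch of the merge (illustrative token dicts): a stream like
    #
    #     [{'type': 'Characters', 'data': 'a'},
    #      {'type': 'Characters', 'data': 'b'},
    #      {'type': 'StartTag', 'name': 'em', ...}]
    #
    # yields one merged Characters token with data 'ab' before the StartTag,
    # plus a final Characters token for whatever remains in the buffer when
    # the stream ends.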

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'other_tag': callable}.

        Here callable is a function that takes the tag, attribute name, and
        attribute value, and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token
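
    # A hedged doctest-style sketch of the escape-vs-strip split above
    # (illustrative, not executed):
    #
    #     >>> Cleaner().clean('<foo>bar</foo>')
    #     '&lt;foo&gt;bar&lt;/foo&gt;'
    #     >>> Cleaner(strip=True).clean('<foo>bar</foo>')
    #     'bar'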

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with ``&amp;`` since that shows up
                        # in querystrings all the time. This special-cases
                        # ``&amp;`` and converts it to a ``&`` and sticks it in
                        # as a Characters token. It'll get merged with
                        # surrounding tokens in BleachSanitizerFilter.__iter__
                        # and escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
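
    # A hedged sketch of the re-tokenizing above (illustrative): a Characters
    # token with data 'x &amp; y &lt; z' comes back as
    #
    #     [{'type': 'Characters', 'data': 'x '},
    #      {'type': 'Characters', 'data': '&'},
    #      {'type': 'Characters', 'data': ' y '},
    #      {'type': 'Entity', 'name': 'lt'},
    #      {'type': 'Characters', 'data': ' z'}]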

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None
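
    # Hedged examples with the default ALLOWED_PROTOCOLS (illustrative):
    #
    #     'http://example.com'    -> allowed (scheme is in the list)
    #     '#fragment'             -> allowed (bare anchor)
    #     '/relative/path'        -> allowed ('http' is assumed and allowed)
    #     'javascript:alert(1)'   -> None (disallowed scheme)
    #     'jav&#x09;ascript:x'    -> None (entities converted and control
    #                                characters nixed before matching)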

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
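
    # A hedged sketch of disallowed_token (illustrative): a disallowed
    # StartTag for ``<foo bar="baz">`` becomes
    #
    #     {'type': 'Characters', 'data': '<foo bar="baz">'}
    #
    # which the serializer then escapes to ``&lt;foo bar="baz"&gt;``.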

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
            [/:,#%!.\s\w]     # a safe character: slash, colon, comma, hash, percent, bang, dot, whitespace, or word character
            |\w-\w            # 3 characters in the form \w-\w
            |'[\s\w]+'\s*     # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+"        # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\)  # a parenthesized string of one or more digits, commas, periods, percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            )*$""",
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
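
    # A hedged sketch of sanitize_css (illustrative): with
    # allowed_css_properties=['color'], the value
    # 'color: red; behavior: url(evil.htc)' comes back as 'color: red;' --
    # url() values are dropped up front and disallowed properties are
    # filtered out of the result.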