env/lib/python3.7/site-packages/bleach/html5lib_shim.py @ 0:26e78fe6e8c4 (draft)

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author: shellac
date: Sat, 02 May 2020 07:14:21 -0400

# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
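
# Illustrative note (an assumption about html5lib's entities table, not part
# of the original file): ENTITIES holds both spellings where HTML allows a
# semicolon-less form, e.g. ENTITIES['amp'] == ENTITIES['amp;'] == '&', and
# ENTITIES_TRIE lets match_entity() below run prefix queries such as
# ENTITIES_TRIE.has_keys_with_prefix('am') while scanning a stream.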

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag']
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']
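

# Illustrative sketch (not part of the original module; 'py-tag' is a
# made-up tag name): how the wrapper's buffer tracks the raw text of a tag
# between start_tag() and get_tag().
#
#   stream = InputStreamWithMemory(HTMLInputStream('<py-tag attr=1>'))
#   stream.char()            # consumes '<'
#   stream.start_tag()       # buffer reset to ['<']
#   stream.charsUntil('>')   # consumes 'py-tag attr=1' into the buffer
#   stream.get_tag()         # -> '<py-tag attr=1'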


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token
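
    # Illustrative note (not part of the original module): the pairing above
    # lets a ParseError adjust the token that follows it. For example,
    # html5lib reports 'expected-closing-tag-but-got-char' for input like
    # '</ sarcasm>' and would turn it into a comment; if 'sarcasm' is not an
    # allowed tag, the branch above rewrites that token into Characters
    # holding the raw '</ sarcasm>' text instead.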

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
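

# Illustrative note (not part of the original module): with
# consume_entities=False, tokenizing 'AT&amp;T' leaves the literal five
# characters '&amp;' in the emitted character data instead of converting
# them to '&', and emitCurrentToken() turns a disallowed tag like
# '<py-tag>' (a made-up name) into a Characters token holding the raw
# '<py-tag>' text recovered via InputStreamWithMemory.get_tag().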


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
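

# Illustrative usage sketch (not in the original module; keyword arguments
# other than tags/strip/consume_entities pass through to html5lib's
# HTMLParser, and 'py-tag' is a made-up tag name):
#
#   parser = BleachHTMLParser(
#       tags=['p', 'a'],          # everything else is stripped or escaped
#       strip=False,              # escape disallowed tags instead of dropping
#       consume_entities=False,   # leave '&amp;' and friends untouched
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<p>an <py-tag>example</py-tag></p>')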


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)
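

# For example (illustrative, not in the original source):
#
#   convert_entity('#x41')    # -> 'A'   (hex numeric entity)
#   convert_entity('#65')     # -> 'A'   (decimal numeric entity)
#   convert_entity('amp')     # -> '&'   (named entity)
#   convert_entity('nosuch')  # -> None  (ambiguous ampersand)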


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)
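

# For example (illustrative, not in the original source):
#
#   convert_entities('AT&amp;T')  # -> 'AT&T'
#   convert_entities('a & b')     # -> 'a & b' (bare & is left alone)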


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None
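

# For example (illustrative, not in the original source):
#
#   match_entity('&amp; rest')  # -> 'amp'
#   match_entity('&amp rest')   # -> None  (no terminating ';')
#   match_entity('&#x41;')      # -> '#x41'
#   match_entity('xyz')         # raises ValueError (must start with '&')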


AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part
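

# For example (illustrative, not in the original source):
#
#   list(next_possible_entity('this &amp; that'))
#   # -> ['this ', '&amp; that']
#
# The odd-indexed parts of the regex split are the '&' delimiters
# themselves; they're skipped and re-attached to the part that follows.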


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')
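
    # For example (illustrative, not in the original source):
    #
    #   ''.join(serializer.escape_base_amp('a &amp; b & c'))
    #   # -> 'a &amp; b &amp; c'
    #
    # The pre-escaped '&amp;' is first undone to '&'; any '&' that does not
    # begin an unambiguous entity is then re-escaped to '&amp;'.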

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
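

# End-to-end usage sketch (illustrative, not part of the original module; it
# assumes the vendored html5lib ships the standard 'etree' tree walker and
# serializer options). bleach.clean() additionally runs a sanitizer filter
# between the walker and the serializer; without one, entities left
# unconsumed by the tokenizer would be escaped a second time on output,
# which is why this sketch uses consume_entities=True.
#
#   parser = BleachHTMLParser(
#       tags=['p'], strip=False, consume_entities=True,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<p>AT&amp;T</p>')
#   walker = getTreeWalker('etree')
#   serializer = BleachHTMLSerializer(
#       quote_attr_values='always', omit_optional_tags=False,
#   )
#   serializer.render(walker(dom))  # -> '<p>AT&amp;T</p>'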