env/lib/python3.7/site-packages/bleach/html5lib_shim.py @ 0:26e78fe6e8c4 (draft)
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

author:   shellac
date:     Sat, 02 May 2020 07:14:21 -0400
parents:  (none)
children: (none)

# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag']
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']

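# Illustrative sketch (not part of the original module), assuming html5lib's
# standard entity table, where names are keyed with their trailing ";" and
# legacy names appear without it:
#
#     ENTITIES['amp;']                          # -> '&'
#     ENTITIES.get('amp')                       # -> '&' (legacy, no semicolon)
#     ENTITIES_TRIE.has_keys_with_prefix('co')  # -> True ('copy;', ...)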

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']

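# Illustrative sketch (not part of the original module) of how the buffer
# shadows the stream; the input text is hypothetical:
#
#     stream = InputStreamWithMemory(HTMLInputStream('<div class="x">y'))
#     stream.char()           # tokenizer consumes '<'
#     stream.start_tag()      # tagOpenState() resets the buffer to ['<']
#     stream.charsUntil('>')  # 'div class="x' is consumed and buffered
#     stream.get_tag()        # -> '<div class="x'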

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

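    # Illustrative note (not in the original source): with
    # consume_entities=False, hypothetical input 'AT&amp;T' is emitted as the
    # characters 'AT', '&', 'amp;T' -- the entity is left as-is in the token
    # stream, and BleachHTMLSerializer.escape_base_amp below decides later
    # whether an '&' marks a real entity.
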
    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()

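# Illustrative sketch (not part of the original module): how emitCurrentToken
# treats a disallowed tag, assuming a hypothetical allowed-tag list ['p']:
#
#     parser = BleachHTMLParser(tags=['p'], strip=False, consume_entities=False)
#     parser.parseFragment('<p>hi</p><evil>x</evil>')
#     # '<evil>' is not allowed, so the tokenizer emits it as a Characters
#     # token holding the raw '<evil>' text recovered from the stream; with
#     # strip=True it would emit an empty string instead.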

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()

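# Hedged usage sketch (not part of the original module); the tag list and
# input are hypothetical:
#
#     parser = BleachHTMLParser(tags=['a', 'p'], strip=True,
#                               consume_entities=False)
#     dom = parser.parseFragment('<p>keep</p><script>drop()</script>')
#     # parseFragment comes from the vendored html5lib HTMLParser; _parse
#     # above ensures it runs with BleachHTMLTokenizer underneath.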

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)

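# Illustrative examples (not in the original source):
#
#     convert_entity('#x26')   # -> '&' (hex numeric entity)
#     convert_entity('#38')    # -> '&' (decimal numeric entity)
#     convert_entity('amp')    # -> '&' (named entity)
#     convert_entity('fake')   # -> None (ambiguous ampersand)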

def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)

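# Illustrative examples (not in the original source); inputs are hypothetical:
#
#     convert_entities('AT&amp;T &copy; 2020')  # -> 'AT&T © 2020'
#     convert_entities('100% &fake')            # unknown entity left as-is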

def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None

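# Illustrative examples (not in the original source):
#
#     match_entity('&amp; more')  # -> 'amp'  (terminated by ';')
#     match_entity('&#x27;')      # -> '#x27' (hex numeric form)
#     match_entity('&amp more')   # -> None   (no ';', so ambiguous)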

AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part

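# Illustrative example (not in the original source):
#
#     list(next_possible_entity('a&b&c'))  # -> ['a', '&b', '&c']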

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')

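    # Illustrative example (not in the original source); the attribute value
    # is hypothetical:
    #
    #     ''.join(serializer.escape_base_amp('?a=&copy;&b=2'))
    #     # -> '?a=&copy;&amp;b=2'  (real entity kept, bare & escaped)
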
    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
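

# Hedged end-to-end sketch (illustrative only, not part of the module); the
# input and serializer options are hypothetical:
#
#     parser = BleachHTMLParser(tags=['a'], strip=False, consume_entities=False)
#     dom = parser.parseFragment('<a href="?x=1&y=2">link</a>')
#     serializer = BleachHTMLSerializer(quote_attr_values='always',
#                                       omit_optional_tags=False)
#     walker = getTreeWalker('etree')
#     serializer.render(walker(dom))
#     # -> '<a href="?x=1&amp;y=2">link</a>' (bare & escaped in the attribute)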