comparison planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
1 from __future__ import unicode_literals
2 import re
3 import six
4
5 from bleach import callbacks as linkify_callbacks
6 from bleach import html5lib_shim
7 from bleach.utils import alphabetize_attributes, force_unicode
8
9
10 #: List of default callbacks
11 DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
12
13
14 TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
15 ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
16 cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
17 dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
18 gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
19 im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
20 kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
21 ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
22 net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
23 pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
24 sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
25 tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
26 xn xxx ye yt yu za zm zw""".split()
27
28 # Make sure that .com doesn't get matched by .co first
29 TLDS.reverse()
30
31
32 def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
33 """Builds the url regex used by linkifier
34
35 If you want a different set of tlds or allowed protocols, pass those in
36 and stomp on the existing ``url_re``::
37
38 from bleach import linkifier
39
40 my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
41
42 linker = LinkifyFilter(url_re=my_url_re)
43
44 """
45 return re.compile(
46 r"""\(* # Match any opening parentheses.
47 \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
48 ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
49 (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
50 # /path/zz (excluding "unsafe" chars from RFC 1738,
51 # except for # and ~, which happen in practice)
52 """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))),
53 re.IGNORECASE | re.VERBOSE | re.UNICODE)
54
55
56 URL_RE = build_url_re()
57
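# A minimal usage sketch (editorial illustration, not part of the upstream module;
# the TLD list and input string are arbitrary): build a narrower url regex and hand
# it to the Linker class defined below:
#
#     only_com_or_org = build_url_re(tlds=['com', 'org'])
#     Linker(url_re=only_com_or_org).linkify('see example.com')
#     # roughly -> 'see <a href="http://example.com" rel="nofollow">example.com</a>'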
58
59 PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
60
61
62 def build_email_re(tlds=TLDS):
63 """Builds the email regex used by linkifier
64
65 If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
66
67 from bleach import linkifier
68
69 my_email_re = linkifier.build_email_re(my_tlds_list)
70
71 linker = LinkifyFilter(email_re=my_email_re)
72
73 """
74 # opening and closing braces are doubled below for the format string
75 return re.compile(
76 r"""(?<!//)
77 (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
78 (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom
79 |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
80 |\\[\001-\011\013\014\016-\177])*" # quoted-string
81 )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain
82 """.format('|'.join(tlds)),
83 re.IGNORECASE | re.MULTILINE | re.VERBOSE)
84
85
86 EMAIL_RE = build_email_re()
87
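# A minimal usage sketch (editorial illustration; the TLD list and address are
# arbitrary). A custom email regex only takes effect when parse_email is enabled:
#
#     only_com = build_email_re(tlds=['com'])
#     linker = Linker(parse_email=True, email_re=only_com)
#     linker.linkify('mail jane@example.com')
#     # roughly -> 'mail <a href="mailto:jane@example.com">jane@example.com</a>'
#     # (the default nofollow callback leaves mailto: links untouched)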
88
89 class Linker(object):
90 """Convert URL-like strings in an HTML fragment to links
91
92 This class converts strings that look like URLs, domain names and email
93 addresses in text that may be an HTML fragment to links, while preserving:
94
95 1. links already in the string
96 2. urls found in attributes
97 3. email addresses
98
99 linkify takes a best-effort approach and tries to recover from bad
100 situations caused by malformed text.
101
102 """
103 def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
104 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
105 """Creates a Linker instance
106
107 :arg list callbacks: list of callbacks to run when adjusting tag attributes;
108 defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
109
110 :arg list skip_tags: list of tags that you don't want to linkify the
111 contents of; for example, you could set this to ``['pre']`` to skip
112 linkifying contents of ``pre`` tags
113
114 :arg bool parse_email: whether or not to linkify email addresses
115
116 :arg re url_re: url matching regex
117
118 :arg re email_re: email matching regex
119
120 :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
121 everything else gets escaped
122
123
125 """
126 self.callbacks = callbacks
127 self.skip_tags = skip_tags
128 self.parse_email = parse_email
129 self.url_re = url_re
130 self.email_re = email_re
131
132 # Create a parser/tokenizer that allows all HTML tags and escapes
133 # anything not in that list.
134 self.parser = html5lib_shim.BleachHTMLParser(
135 tags=recognized_tags,
136 strip=False,
137 consume_entities=True,
138 namespaceHTMLElements=False,
139 )
140 self.walker = html5lib_shim.getTreeWalker('etree')
141 self.serializer = html5lib_shim.BleachHTMLSerializer(
142 quote_attr_values='always',
143 omit_optional_tags=False,
144
145 # linkify does not sanitize
146 sanitize=False,
147
148 # linkify alphabetizes attributes itself, so the serializer does not
149 alphabetical_attributes=False,
150 )
151
152 def linkify(self, text):
153 """Linkify specified text
154
155 :arg str text: the text to add links to
156
157 :returns: linkified text as unicode
158
159 :raises TypeError: if ``text`` is not a text type
160
161 """
162 if not isinstance(text, six.string_types):
163 raise TypeError('argument must be of text type')
164
165 text = force_unicode(text)
166
167 if not text:
168 return ''
169
170 dom = self.parser.parseFragment(text)
171 filtered = LinkifyFilter(
172 source=self.walker(dom),
173 callbacks=self.callbacks,
174 skip_tags=self.skip_tags,
175 parse_email=self.parse_email,
176 url_re=self.url_re,
177 email_re=self.email_re,
178 )
179 return self.serializer.render(filtered)
180
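# A usage sketch for the class above (editorial illustration; ``set_target`` is a
# hypothetical callback, not part of bleach):
#
#     def set_target(attrs, new=False):
#         attrs[(None, 'target')] = '_blank'
#         return attrs
#
#     linker = Linker(callbacks=DEFAULT_CALLBACKS + [set_target], skip_tags=['pre'])
#     linker.linkify('see example.com')
#     # roughly -> 'see <a href="http://example.com" rel="nofollow" target="_blank">example.com</a>'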
181
182 class LinkifyFilter(html5lib_shim.Filter):
183 """html5lib filter that linkifies text
184
185 This will do the following:
186
187 * convert email addresses into links
188 * convert urls into links
189 * edit existing links by running them through callbacks--the default is to
190 add a ``rel="nofollow"``
191
192 This filter can be used anywhere html5lib filters can be used.
193
194 """
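# Sketch of standalone use of this filter (editorial illustration; the setup
# mirrors Linker.__init__ above and the input string is arbitrary):
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=html5lib_shim.HTML_TAGS, strip=False, consume_entities=True,
#         namespaceHTMLElements=False)
#     walker = html5lib_shim.getTreeWalker('etree')
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values='always', omit_optional_tags=False, sanitize=False,
#         alphabetical_attributes=False)
#     dom = parser.parseFragment('see example.com')
#     filtered = LinkifyFilter(source=walker(dom), callbacks=DEFAULT_CALLBACKS)
#     serializer.render(filtered)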
195 def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
196 url_re=URL_RE, email_re=EMAIL_RE):
197 """Creates a LinkifyFilter instance
198
199 :arg TreeWalker source: stream
200
201 :arg list callbacks: list of callbacks to run when adjusting tag attributes;
202 defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
203
204 :arg list skip_tags: list of tags that you don't want to linkify the
205 contents of; for example, you could set this to ``['pre']`` to skip
206 linkifying contents of ``pre`` tags
207
208 :arg bool parse_email: whether or not to linkify email addresses
209
210 :arg re url_re: url matching regex
211
212 :arg re email_re: email matching regex
213
214 """
215 super(LinkifyFilter, self).__init__(source)
216
217 self.callbacks = callbacks or []
218 self.skip_tags = skip_tags or []
219 self.parse_email = parse_email
220
221 self.url_re = url_re
222 self.email_re = email_re
223
224 def apply_callbacks(self, attrs, is_new):
225 """Given an attrs dict and an is_new bool, runs through callbacks
226
227 Callbacks can return an adjusted attrs dict or ``None``. In the case of
228 ``None``, we stop going through the callbacks, return ``None``, and the
229 link gets dropped.
230
231 :arg dict attrs: map of ``(namespace, name)`` -> ``value``
232
233 :arg bool is_new: whether or not this link was added by linkify
234
235 :returns: adjusted attrs dict or ``None``
236
237 """
238 for cb in self.callbacks:
239 attrs = cb(attrs, is_new)
240 if attrs is None:
241 return None
242 return attrs
243
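# For example (editorial sketch): a callback passed through apply_callbacks() above
# that drops every link linkify itself created, while leaving pre-existing links
# alone, could look like:
#
#     def drop_new_links(attrs, new=False):
#         return None if new else attrs
#
# and would be passed in via ``callbacks=[drop_new_links]``.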
244 def extract_character_data(self, token_list):
245 """Extracts and squashes character sequences in a token stream"""
246 # FIXME(willkg): This is a terrible idea. What it does is drop all the
247 # tags from the token list and merge the Characters and SpaceCharacters
248 # tokens into a single text.
249 #
250 # So something like this::
251 #
252 # "<span>" "<b>" "some text" "</b>" "</span>"
253 #
254 # gets converted to "some text".
255 #
256 # This gets used to figure out the ``_text`` fauxttribute value for
257 # linkify callables.
258 #
259 # I'm not really sure how else to support that ``_text`` fauxttribute and
260 # maintain some modicum of backwards compatibility with previous versions
261 # of Bleach.
262
263 out = []
264 for token in token_list:
265 token_type = token['type']
266 if token_type in ['Characters', 'SpaceCharacters']:
267 out.append(token['data'])
268
269 return ''.join(out)
270
271 def handle_email_addresses(self, src_iter):
272 """Handle email addresses in character tokens"""
273 for token in src_iter:
274 if token['type'] == 'Characters':
275 text = token['data']
276 new_tokens = []
277 end = 0
278
279 # For each email address we find in the text
280 for match in self.email_re.finditer(text):
281 if match.start() > end:
282 new_tokens.append(
283 {'type': 'Characters', 'data': text[end:match.start()]}
284 )
285
286 # Run attributes through the callbacks to see what we
287 # should do with this match
288 attrs = {
289 (None, 'href'): 'mailto:%s' % match.group(0),
290 '_text': match.group(0)
291 }
292 attrs = self.apply_callbacks(attrs, True)
293
294 if attrs is None:
295 # Just add the text--but not as a link
296 new_tokens.append(
297 {'type': 'Characters', 'data': match.group(0)}
298 )
299
300 else:
301 # Add an "a" tag for the new link
302 _text = attrs.pop('_text', '')
303 attrs = alphabetize_attributes(attrs)
304 new_tokens.extend([
305 {'type': 'StartTag', 'name': 'a', 'data': attrs},
306 {'type': 'Characters', 'data': force_unicode(_text)},
307 {'type': 'EndTag', 'name': 'a'}
308 ])
309 end = match.end()
310
311 if new_tokens:
312 # Yield the adjusted set of tokens and then continue
313 # through the loop
314 if end < len(text):
315 new_tokens.append({'type': 'Characters', 'data': text[end:]})
316
317 for new_token in new_tokens:
318 yield new_token
319
320 continue
321
322 yield token
323
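# Worked sketch of the method above (assuming the callbacks keep the link): a token
# {'type': 'Characters', 'data': 'mail jane@example.com'} is replaced by four tokens:
# the leading text 'mail ', a StartTag for "a" with href='mailto:jane@example.com',
# the matched address as Characters, and the matching EndTag.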
324 def strip_non_url_bits(self, fragment):
325 """Strips non-url bits from the url
326
327 This accounts for over-eager matching by the regex.
328
329 """
330 prefix = suffix = ''
331
332 while fragment:
333 # Try removing ( from the beginning and, if it's balanced, from the
334 # end, too
335 if fragment.startswith('('):
336 prefix = prefix + '('
337 fragment = fragment[1:]
338
339 if fragment.endswith(')'):
340 suffix = ')' + suffix
341 fragment = fragment[:-1]
342 continue
343
344 # Now try removing extraneous characters from the end. For example, sometimes we
345 # pick up ) at the end of a url, but the url is in a parenthesized
346 # phrase like:
347 #
348 # "i looked at the site (at http://example.com)"
349
350 if fragment.endswith(')') and '(' not in fragment:
351 fragment = fragment[:-1]
352 suffix = ')' + suffix
353 continue
354
355 # Handle commas
356 if fragment.endswith(','):
357 fragment = fragment[:-1]
358 suffix = ',' + suffix
359 continue
360
361 # Handle periods
362 if fragment.endswith('.'):
363 fragment = fragment[:-1]
364 suffix = '.' + suffix
365 continue
366
367 # Nothing matched, so we're done
368 break
369
370 return fragment, prefix, suffix
371
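# Worked example for the helper above (editorial sketch):
#
#     self.strip_non_url_bits('(example.com),')
#     # -> ('example.com', '(', '),')
#
# The prefix and suffix are later emitted as plain Characters tokens around the
# generated link instead of becoming part of the href.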
372 def handle_links(self, src_iter):
373 """Handle links in character tokens"""
374 in_a = False # only becomes True when parse_email=True and an email link was created
375 for token in src_iter:
376 if in_a:
377 if token['type'] == 'EndTag' and token['name'] == 'a':
378 in_a = False
379 yield token
380 continue
381 elif token['type'] == 'StartTag' and token['name'] == 'a':
382 in_a = True
383 yield token
384 continue
385 if token['type'] == 'Characters':
386 text = token['data']
387 new_tokens = []
388 end = 0
389
390 for match in self.url_re.finditer(text):
391 if match.start() > end:
392 new_tokens.append(
393 {'type': 'Characters', 'data': text[end:match.start()]}
394 )
395
396 url = match.group(0)
397 prefix = suffix = ''
398
399 # Sometimes we pick up too much in the url match, so look for
400 # bits we should drop and remove them from the match
401 url, prefix, suffix = self.strip_non_url_bits(url)
402
403 # If there's no protocol, add one
404 if PROTO_RE.search(url):
405 href = url
406 else:
407 href = 'http://%s' % url
408
409 attrs = {
410 (None, 'href'): href,
411 '_text': url
412 }
413 attrs = self.apply_callbacks(attrs, True)
414
415 if attrs is None:
416 # Just add the text
417 new_tokens.append(
418 {'type': 'Characters', 'data': prefix + url + suffix}
419 )
420
421 else:
422 # Add the "a" tag!
423 if prefix:
424 new_tokens.append(
425 {'type': 'Characters', 'data': prefix}
426 )
427
428 _text = attrs.pop('_text', '')
429 attrs = alphabetize_attributes(attrs)
430
431 new_tokens.extend([
432 {'type': 'StartTag', 'name': 'a', 'data': attrs},
433 {'type': 'Characters', 'data': force_unicode(_text)},
434 {'type': 'EndTag', 'name': 'a'},
435 ])
436
437 if suffix:
438 new_tokens.append(
439 {'type': 'Characters', 'data': suffix}
440 )
441
442 end = match.end()
443
444 if new_tokens:
445 # Yield the adjusted set of tokens and then continue
446 # through the loop
447 if end < len(text):
448 new_tokens.append({'type': 'Characters', 'data': text[end:]})
449
450 for new_token in new_tokens:
451 yield new_token
452
453 continue
454
455 yield token
456
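# Worked sketch of handle_links() above (assuming the callbacks keep the link): for
# a Characters token 'see http://example.com/x, ok' the regex match is
# 'http://example.com/x,'; strip_non_url_bits() peels the comma off as a suffix,
# PROTO_RE matches so the href is used as-is, and the emitted tokens are
# Characters('see '), the "a" start tag, Characters('http://example.com/x'), the
# "a" end tag, Characters(','), and finally Characters(' ok').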
457 def handle_a_tag(self, token_buffer):
458 """Handle the "a" tag
459
460 This could adjust the link or drop it altogether depending on what the
461 callbacks return.
462
463 This yields the new set of tokens.
464
465 """
466 a_token = token_buffer[0]
467 if a_token['data']:
468 attrs = a_token['data']
469 else:
470 attrs = {}
471 text = self.extract_character_data(token_buffer)
472 attrs['_text'] = text
473
474 attrs = self.apply_callbacks(attrs, False)
475
476 if attrs is None:
477 # We're dropping the "a" tag and everything else and replacing
478 # it with character data. So emit that token.
479 yield {'type': 'Characters', 'data': text}
480
481 else:
482 new_text = attrs.pop('_text', '')
483 a_token['data'] = alphabetize_attributes(attrs)
484
485 if text == new_text:
486 # The callbacks didn't change the text, so we yield the new "a"
487 # token, then whatever else was there, then the end "a" token
488 yield a_token
489 for mem in token_buffer[1:]:
490 yield mem
491
492 else:
493 # If the callbacks changed the text, then we're going to drop
494 # all the tokens between the start and end "a" tags and replace
495 # it with the new text
496 yield a_token
497 yield {'type': 'Characters', 'data': force_unicode(new_text)}
498 yield token_buffer[-1]
499
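# Sketch for handle_a_tag() above: for input '<a href="/page">link</a>' the buffered
# tokens go through the callbacks with is_new=False; with the default nofollow
# callback the start tag comes back with rel="nofollow" added, and because the
# callbacks did not change the "_text", the original inner tokens are re-emitted
# unchanged (the text == new_text branch).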
500 def __iter__(self):
501 in_a = False
502 in_skip_tag = None
503
504 token_buffer = []
505
506 for token in super(LinkifyFilter, self).__iter__():
507 if in_a:
508 # Handle the case where we're in an "a" tag--we want to buffer tokens
509 # until we hit an end "a" tag.
510 if token['type'] == 'EndTag' and token['name'] == 'a':
511 # Add the end tag to the token buffer and then handle them
512 # and yield anything returned
513 token_buffer.append(token)
514 for new_token in self.handle_a_tag(token_buffer):
515 yield new_token
516
517 # Clear "a" related state and continue since we've yielded all
518 # the tokens we're going to yield
519 in_a = False
520 token_buffer = []
521 else:
522 token_buffer.append(token)
523 continue
524
525 if token['type'] in ['StartTag', 'EmptyTag']:
526 if token['name'] in self.skip_tags:
527 # Skip tags start a "special mode" where we don't linkify
528 # anything until the end tag.
529 in_skip_tag = token['name']
530
531 elif token['name'] == 'a':
532 # The "a" tag is special--we switch to a slurp mode and
533 # slurp all the tokens until the end "a" tag and then
534 # figure out what to do with them there.
535 in_a = True
536 token_buffer.append(token)
537
538 # We buffer the start tag, so we don't want to yield it just
539 # yet
540 continue
541
542 elif in_skip_tag and self.skip_tags:
543 # NOTE(willkg): We put this clause here since in_a (and
544 # switching in and out of in_a) takes precedence.
545 if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
546 in_skip_tag = None
547
548 elif not in_a and not in_skip_tag and token['type'] == 'Characters':
549 new_stream = iter([token])
550 if self.parse_email:
551 new_stream = self.handle_email_addresses(new_stream)
552
553 new_stream = self.handle_links(new_stream)
554
555 for token in new_stream:
556 yield token
557
558 # We've already yielded this token, so continue
559 continue
560
561 yield token