guerler/springsuite (Mercurial): planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author: guerler
date:   Fri, 31 Jul 2020 00:18:57 -0400

from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()

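# A minimal usage sketch (editor's illustration, not part of the original
# module): build a narrower URL regex and hand it to a Linker instance.
# ``my_tlds`` and ``only_https`` are hypothetical names.
#
#   from bleach.linkifier import Linker, build_url_re
#
#   my_tlds = ['com', 'org']
#   only_https = build_url_re(tlds=my_tlds, protocols=['https'])
#   Linker(url_re=only_https).linkify('see https://example.com and ftp://host.org')
#   # expected: only the https URL comes back wrapped in an <a> tag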

PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # opening and closing braces are doubled below for the format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format('|'.join(tlds)),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE)


EMAIL_RE = build_email_re()

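# A minimal usage sketch (editor's illustration, not part of the original
# module): restrict email linkification to a hypothetical TLD list.
#
#   from bleach.linkifier import Linker, build_email_re
#
#   com_only_email_re = build_email_re(tlds=['com'])
#   linker = Linker(parse_email=True, email_re=com_only_email_re)
#   linker.linkify('write to sales@example.com')
#   # expected: the address comes back as <a href="mailto:sales@example.com">...</a>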

class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations caused by odd or malformed text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes attributes itself, so the serializer
            # doesn't need to
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return ''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)

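# A minimal end-to-end sketch of Linker (editor's illustration, not part of
# the original module). ``set_target`` is a hypothetical callback; callbacks
# receive the attrs dict plus a "new link" flag and return an adjusted dict
# or None, as apply_callbacks() below expects.
#
#   from bleach.linkifier import Linker
#
#   def set_target(attrs, new=False):
#       attrs[(None, 'target')] = '_blank'
#       return attrs
#
#   linker = Linker(callbacks=[set_target], skip_tags=['pre'])
#   linker.linkify('docs live at example.com/manual')
#   # expected: example.com/manual comes back wrapped in
#   # <a href="http://example.com/manual" target="_blank">...</a>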

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default
      callbacks used by ``bleach.linkify`` add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: the source token stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to no callbacks (an empty list)

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. If a callback
        returns ``None``, we stop running callbacks, return ``None``, and the
        link gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

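    # Sketch of the attrs dict that apply_callbacks() hands to each callback
    # (editor's illustration, not part of the original module). Keys are
    # ``(namespace, name)`` tuples plus the ``_text`` key; a callback that
    # returns None drops the link entirely:
    #
    #   attrs = {(None, 'href'): 'http://example.com', '_text': 'example.com'}
    #
    #   def no_new_links(attrs, new=False):
    #       return None if new else attrs
    #
    #   # LinkifyFilter(..., callbacks=[no_new_links]) would then emit newly
    #   # found urls as plain text and leave existing <a> tags alone.
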
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return ''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, 'href'): 'mailto:%s' % match.group(0),
                        '_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {'type': 'Characters', 'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

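    # Rough before/after sketch of what handle_email_addresses() does to one
    # Characters token (editor's illustration, not part of the original
    # module), assuming no callback drops the link:
    #
    #   {'type': 'Characters', 'data': 'mail jane@example.com today'}
    #
    # becomes
    #
    #   {'type': 'Characters', 'data': 'mail '}
    #   {'type': 'StartTag', 'name': 'a',
    #    'data': {(None, 'href'): 'mailto:jane@example.com'}}
    #   {'type': 'Characters', 'data': 'jane@example.com'}
    #   {'type': 'EndTag', 'name': 'a'}
    #   {'type': 'Characters', 'data': ' today'}
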
    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith('('):
                prefix = prefix + '('
                fragment = fragment[1:]

                if fragment.endswith(')'):
                    suffix = ')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try stripping extraneous characters from the end. For
            # example, sometimes we pick up ) at the end of a url, but the
            # url is in a parenthesized phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(')') and '(' not in fragment:
                fragment = fragment[:-1]
                suffix = ')' + suffix
                continue

            # Handle commas
            if fragment.endswith(','):
                fragment = fragment[:-1]
                suffix = ',' + suffix
                continue

            # Handle periods
            if fragment.endswith('.'):
                fragment = fragment[:-1]
                suffix = '.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

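    # Worked example (editor's illustration, not part of the original module),
    # for a url matched inside a parenthesized phrase:
    #
    #   self.strip_non_url_bits('(http://example.com/foo)')
    #   # -> ('http://example.com/foo', '(', ')')
    #
    # handle_links() below linkifies only the middle piece and re-emits the
    # prefix and suffix as plain character tokens.
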
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = 'http://%s' % url

                    attrs = {
                        (None, 'href'): href,
                        '_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': prefix}
                            )

                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # them with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
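

# A rough sketch of using LinkifyFilter directly in a plain html5lib pipeline
# (editor's illustration, not part of the original module). The Linker class
# above wires up Bleach's own parser and serializer instead, which is the
# usual way to use this filter.
#
#   import html5lib
#   from bleach.linkifier import LinkifyFilter
#
#   dom = html5lib.parseFragment('<p>see example.com</p>', treebuilder='etree')
#   walker = html5lib.getTreeWalker('etree')
#   stream = LinkifyFilter(walker(dom), parse_email=True)
#   serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False)
#   serializer.render(stream)
#   # expected: example.com comes back wrapped in an <a href="http://example.com"> tag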