comparison planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
1 from __future__ import unicode_literals
2 import re
3 import six
4
5 from bleach import callbacks as linkify_callbacks
6 from bleach import html5lib_shim
7 from bleach.utils import alphabetize_attributes, force_unicode
8
9
10 #: List of default callbacks
11 DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
12
13
14 TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
15 ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
16 cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
17 dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
18 gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
19 im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
20 kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
21 ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
22 net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
23 pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
24 sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
25 tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
26 xn xxx ye yt yu za zm zw""".split()
27
28 # Make sure that .com doesn't get matched by .co first
29 TLDS.reverse()
30
31
32 def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
33 """Builds the url regex used by linkifier
34
35 If you want a different set of tlds or allowed protocols, pass those in
36 and stomp on the existing ``url_re``::
37
38 from bleach import linkifier
39
40 my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
41
42 linker = LinkifyFilter(url_re=my_url_re)
43
44 """
45 return re.compile(
46 r"""\(* # Match any opening parentheses.
47 \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
48 ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
49 (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
50 # /path/zz (excluding "unsafe" chars from RFC 1738,
51 # except for # and ~, which happen in practice)
52 """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))),
53 re.IGNORECASE | re.VERBOSE | re.UNICODE)
54
55
56 URL_RE = build_url_re()
57
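# A minimal usage sketch (editorial illustration, not part of the upstream module;
# the TLD list and input string are arbitrary): build a narrower url regex and hand
# it to the Linker class defined below:
#
#     only_com_or_org = build_url_re(tlds=['com', 'org'])
#     Linker(url_re=only_com_or_org).linkify('see example.com')
#     # roughly -> 'see <a href="http://example.com" rel="nofollow">example.com</a>'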
58
59 PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
60
61
62 def build_email_re(tlds=TLDS):
63 """Builds the email regex used by linkifier
64
65 If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
66
67 from bleach import linkifier
68
69 my_email_re = linkifier.build_email_re(my_tlds_list)
70
71 linker = LinkifyFilter(email_re=my_email_re)
72
73 """
74 # opening and closing braces are doubled below for the format string
75 return re.compile(
76 r"""(?<!//)
77 (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
78 (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom
79 |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
80 |\\[\001-\011\013\014\016-\177])*" # quoted-string
81 )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain
82 """.format('|'.join(tlds)),
83 re.IGNORECASE | re.MULTILINE | re.VERBOSE)
84
85
86 EMAIL_RE = build_email_re()
87
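# A minimal usage sketch (editorial illustration; the TLD list and address are
# arbitrary). A custom email regex only takes effect when parse_email is enabled:
#
#     only_com = build_email_re(tlds=['com'])
#     linker = Linker(parse_email=True, email_re=only_com)
#     linker.linkify('mail jane@example.com')
#     # roughly -> 'mail <a href="mailto:jane@example.com">jane@example.com</a>'
#     # (the default nofollow callback leaves mailto: links untouched)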
88
89 class Linker(object):
90 """Convert URL-like strings in an HTML fragment to links
91
92 This class converts strings that look like URLs, domain names and email
93 addresses in text that may be an HTML fragment to links, while preserving:
94
95 1. links already in the string
96 2. urls found in attributes
97 3. email addresses
98
99 linkify takes a best-effort approach and tries to recover from bad
100 situations caused by malformed text.
101
102 """
103 def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
104 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
105 """Creates a Linker instance
106
107 :arg list callbacks: list of callbacks to run when adjusting tag attributes;
108 defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
109
110 :arg list skip_tags: list of tags that you don't want to linkify the
111 contents of; for example, you could set this to ``['pre']`` to skip
112 linkifying contents of ``pre`` tags
113
114 :arg bool parse_email: whether or not to linkify email addresses
115
116 :arg re url_re: url matching regex
117
118 :arg re email_re: email matching regex
119
120 :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
121 everything else gets escaped
122
123
125 """
126 self.callbacks = callbacks
127 self.skip_tags = skip_tags
128 self.parse_email = parse_email
129 self.url_re = url_re
130 self.email_re = email_re
131
132 # Create a parser/tokenizer that allows all HTML tags and escapes
133 # anything not in that list.
134 self.parser = html5lib_shim.BleachHTMLParser(
135 tags=recognized_tags,
136 strip=False,
137 consume_entities=True,
138 namespaceHTMLElements=False,
139 )
140 self.walker = html5lib_shim.getTreeWalker('etree')
141 self.serializer = html5lib_shim.BleachHTMLSerializer(
142 quote_attr_values='always',
143 omit_optional_tags=False,
144
145 # linkify does not sanitize
146 sanitize=False,
147
148 # linkify alphabetizes attributes itself, so the serializer does not
149 alphabetical_attributes=False,
150 )
151
152 def linkify(self, text):
153 """Linkify specified text
154
155 :arg str text: the text to add links to
156
157 :returns: linkified text as unicode
158
159 :raises TypeError: if ``text`` is not a text type
160
161 """
162 if not isinstance(text, six.string_types):
163 raise TypeError('argument must be of text type')
164
165 text = force_unicode(text)
166
167 if not text:
168 return ''
169
170 dom = self.parser.parseFragment(text)
171 filtered = LinkifyFilter(
172 source=self.walker(dom),
173 callbacks=self.callbacks,
174 skip_tags=self.skip_tags,
175 parse_email=self.parse_email,
176 url_re=self.url_re,
177 email_re=self.email_re,
178 )
179 return self.serializer.render(filtered)
180
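# A usage sketch for the class above (editorial illustration; ``set_target`` is a
# hypothetical callback, not part of bleach):
#
#     def set_target(attrs, new=False):
#         attrs[(None, 'target')] = '_blank'
#         return attrs
#
#     linker = Linker(callbacks=DEFAULT_CALLBACKS + [set_target], skip_tags=['pre'])
#     linker.linkify('see example.com')
#     # roughly -> 'see <a href="http://example.com" rel="nofollow" target="_blank">example.com</a>'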
181
182 class LinkifyFilter(html5lib_shim.Filter):
183 """html5lib filter that linkifies text
184
185 This will do the following:
186
187 * convert email addresses into links
188 * convert urls into links
189 * edit existing links by running them through callbacks--the default is to
190 add a ``rel="nofollow"``
191
192 This filter can be used anywhere html5lib filters can be used.
193
194 """
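# Sketch of standalone use of this filter (editorial illustration; the setup
# mirrors Linker.__init__ above and the input string is arbitrary):
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=html5lib_shim.HTML_TAGS, strip=False, consume_entities=True,
#         namespaceHTMLElements=False)
#     walker = html5lib_shim.getTreeWalker('etree')
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values='always', omit_optional_tags=False, sanitize=False,
#         alphabetical_attributes=False)
#     dom = parser.parseFragment('see example.com')
#     filtered = LinkifyFilter(source=walker(dom), callbacks=DEFAULT_CALLBACKS)
#     serializer.render(filtered)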
195 def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
196 url_re=URL_RE, email_re=EMAIL_RE):
197 """Creates a LinkifyFilter instance
198
199 :arg TreeWalker source: stream
200
201 :arg list callbacks: list of callbacks to run when adjusting tag attributes;
202 defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
203
204 :arg list skip_tags: list of tags that you don't want to linkify the
205 contents of; for example, you could set this to ``['pre']`` to skip
206 linkifying contents of ``pre`` tags
207
208 :arg bool parse_email: whether or not to linkify email addresses
209
210 :arg re url_re: url matching regex
211
212 :arg re email_re: email matching regex
213
214 """
215 super(LinkifyFilter, self).__init__(source)
216
217 self.callbacks = callbacks or []
218 self.skip_tags = skip_tags or []
219 self.parse_email = parse_email
220
221 self.url_re = url_re
222 self.email_re = email_re
223
224 def apply_callbacks(self, attrs, is_new):
225 """Given an attrs dict and an is_new bool, runs through callbacks
226
227 Callbacks can return an adjusted attrs dict or ``None``. In the case of
228 ``None``, we stop going through the callbacks, return ``None``, and the
229 link gets dropped.
230
231 :arg dict attrs: map of ``(namespace, name)`` -> ``value``
232
233 :arg bool is_new: whether or not this link was added by linkify
234
235 :returns: adjusted attrs dict or ``None``
236
237 """
238 for cb in self.callbacks:
239 attrs = cb(attrs, is_new)
240 if attrs is None:
241 return None
242 return attrs
243
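# For example (editorial sketch): a callback passed through apply_callbacks() above
# that drops every link linkify itself created, while leaving pre-existing links
# alone, could look like:
#
#     def drop_new_links(attrs, new=False):
#         return None if new else attrs
#
# and would be passed in via ``callbacks=[drop_new_links]``.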
244 def extract_character_data(self, token_list):
245 """Extracts and squashes character sequences in a token stream"""
246 # FIXME(willkg): This is a terrible idea. What it does is drop all the
247 # tags from the token list and merge the Characters and SpaceCharacters
248 # tokens into a single text.
249 #
250 # So something like this::
251 #
252 # "<span>" "<b>" "some text" "</b>" "</span>"
253 #
254 # gets converted to "some text".
255 #
256 # This gets used to figure out the ``_text`` fauxttribute value for
257 # linkify callables.
258 #
259 # I'm not really sure how else to support that ``_text`` fauxttribute and
260 # maintain some modicum of backwards compatibility with previous versions
261 # of Bleach.
262
263 out = []
264 for token in token_list:
265 token_type = token['type']
266 if token_type in ['Characters', 'SpaceCharacters']:
267 out.append(token['data'])
268
269 return ''.join(out)
270
271 def handle_email_addresses(self, src_iter):
272 """Handle email addresses in character tokens"""
273 for token in src_iter:
274 if token['type'] == 'Characters':
275 text = token['data']
276 new_tokens = []
277 end = 0
278
279 # For each email address we find in the text
280 for match in self.email_re.finditer(text):
281 if match.start() > end:
282 new_tokens.append(
283 {'type': 'Characters', 'data': text[end:match.start()]}
284 )
285
286 # Run attributes through the callbacks to see what we
287 # should do with this match
288 attrs = {
289 (None, 'href'): 'mailto:%s' % match.group(0),
290 '_text': match.group(0)
291 }
292 attrs = self.apply_callbacks(attrs, True)
293
294 if attrs is None:
295 # Just add the text--but not as a link
296 new_tokens.append(
297 {'type': 'Characters', 'data': match.group(0)}
298 )
299
300 else:
301 # Add an "a" tag for the new link
302 _text = attrs.pop('_text', '')
303 attrs = alphabetize_attributes(attrs)
304 new_tokens.extend([
305 {'type': 'StartTag', 'name': 'a', 'data': attrs},
306 {'type': 'Characters', 'data': force_unicode(_text)},
307 {'type': 'EndTag', 'name': 'a'}
308 ])
309 end = match.end()
310
311 if new_tokens:
312 # Yield the adjusted set of tokens and then continue
313 # through the loop
314 if end < len(text):
315 new_tokens.append({'type': 'Characters', 'data': text[end:]})
316
317 for new_token in new_tokens:
318 yield new_token
319
320 continue
321
322 yield token
323
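# Worked sketch of the method above (assuming the callbacks keep the link): a token
# {'type': 'Characters', 'data': 'mail jane@example.com'} is replaced by four tokens:
# the leading text 'mail ', a StartTag for "a" with href='mailto:jane@example.com',
# the matched address as Characters, and the matching EndTag.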
324 def strip_non_url_bits(self, fragment):
325 """Strips non-url bits from the url
326
327 This accounts for over-eager matching by the regex.
328
329 """
330 prefix = suffix = ''
331
332 while fragment:
333 # Try removing ( from the beginning and, if it's balanced, from the
334 # end, too
335 if fragment.startswith('('):
336 prefix = prefix + '('
337 fragment = fragment[1:]
338
339 if fragment.endswith(')'):
340 suffix = ')' + suffix
341 fragment = fragment[:-1]
342 continue
343
344 # Now try removing extraneous characters from the end. For example, sometimes we
345 # pick up ) at the end of a url, but the url is in a parenthesized
346 # phrase like:
347 #
348 # "i looked at the site (at http://example.com)"
349
350 if fragment.endswith(')') and '(' not in fragment:
351 fragment = fragment[:-1]
352 suffix = ')' + suffix
353 continue
354
355 # Handle commas
356 if fragment.endswith(','):
357 fragment = fragment[:-1]
358 suffix = ',' + suffix
359 continue
360
361 # Handle periods
362 if fragment.endswith('.'):
363 fragment = fragment[:-1]
364 suffix = '.' + suffix
365 continue
366
367 # Nothing matched, so we're done
368 break
369
370 return fragment, prefix, suffix
371
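# Worked example for the helper above (editorial sketch):
#
#     self.strip_non_url_bits('(example.com),')
#     # -> ('example.com', '(', '),')
#
# The prefix and suffix are later emitted as plain Characters tokens around the
# generated link instead of becoming part of the href.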
372 def handle_links(self, src_iter):
373 """Handle links in character tokens"""
374 in_a = False # only becomes True when parse_email=True and an email link was created
375 for token in src_iter:
376 if in_a:
377 if token['type'] == 'EndTag' and token['name'] == 'a':
378 in_a = False
379 yield token
380 continue
381 elif token['type'] == 'StartTag' and token['name'] == 'a':
382 in_a = True
383 yield token
384 continue
385 if token['type'] == 'Characters':
386 text = token['data']
387 new_tokens = []
388 end = 0
389
390 for match in self.url_re.finditer(text):
391 if match.start() > end:
392 new_tokens.append(
393 {'type': 'Characters', 'data': text[end:match.start()]}
394 )
395
396 url = match.group(0)
397 prefix = suffix = ''
398
399 # Sometimes we pick up too much in the url match, so look for
400 # bits we should drop and remove them from the match
401 url, prefix, suffix = self.strip_non_url_bits(url)
402
403 # If there's no protocol, add one
404 if PROTO_RE.search(url):
405 href = url
406 else:
407 href = 'http://%s' % url
408
409 attrs = {
410 (None, 'href'): href,
411 '_text': url
412 }
413 attrs = self.apply_callbacks(attrs, True)
414
415 if attrs is None:
416 # Just add the text
417 new_tokens.append(
418 {'type': 'Characters', 'data': prefix + url + suffix}
419 )
420
421 else:
422 # Add the "a" tag!
423 if prefix:
424 new_tokens.append(
425 {'type': 'Characters', 'data': prefix}
426 )
427
428 _text = attrs.pop('_text', '')
429 attrs = alphabetize_attributes(attrs)
430
431 new_tokens.extend([
432 {'type': 'StartTag', 'name': 'a', 'data': attrs},
433 {'type': 'Characters', 'data': force_unicode(_text)},
434 {'type': 'EndTag', 'name': 'a'},
435 ])
436
437 if suffix:
438 new_tokens.append(
439 {'type': 'Characters', 'data': suffix}
440 )
441
442 end = match.end()
443
444 if new_tokens:
445 # Yield the adjusted set of tokens and then continue
446 # through the loop
447 if end < len(text):
448 new_tokens.append({'type': 'Characters', 'data': text[end:]})
449
450 for new_token in new_tokens:
451 yield new_token
452
453 continue
454
455 yield token
456
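# Worked sketch of handle_links() above (assuming the callbacks keep the link): for
# a Characters token 'see http://example.com/x, ok' the regex match is
# 'http://example.com/x,'; strip_non_url_bits() peels the comma off as a suffix,
# PROTO_RE matches so the href is used as-is, and the emitted tokens are
# Characters('see '), the "a" start tag, Characters('http://example.com/x'), the
# "a" end tag, Characters(','), and finally Characters(' ok').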
457 def handle_a_tag(self, token_buffer):
458 """Handle the "a" tag
459
460 This could adjust the link or drop it altogether depending on what the
461 callbacks return.
462
463 This yields the new set of tokens.
464
465 """
466 a_token = token_buffer[0]
467 if a_token['data']:
468 attrs = a_token['data']
469 else:
470 attrs = {}
471 text = self.extract_character_data(token_buffer)
472 attrs['_text'] = text
473
474 attrs = self.apply_callbacks(attrs, False)
475
476 if attrs is None:
477 # We're dropping the "a" tag and everything else and replacing
478 # it with character data. So emit that token.
479 yield {'type': 'Characters', 'data': text}
480
481 else:
482 new_text = attrs.pop('_text', '')
483 a_token['data'] = alphabetize_attributes(attrs)
484
485 if text == new_text:
486 # The callbacks didn't change the text, so we yield the new "a"
487 # token, then whatever else was there, then the end "a" token
488 yield a_token
489 for mem in token_buffer[1:]:
490 yield mem
491
492 else:
493 # If the callbacks changed the text, then we're going to drop
494 # all the tokens between the start and end "a" tags and replace
495 # it with the new text
496 yield a_token
497 yield {'type': 'Characters', 'data': force_unicode(new_text)}
498 yield token_buffer[-1]
499
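# Sketch for handle_a_tag() above: for input '<a href="/page">link</a>' the buffered
# tokens go through the callbacks with is_new=False; with the default nofollow
# callback the start tag comes back with rel="nofollow" added, and because the
# callbacks did not change the "_text", the original inner tokens are re-emitted
# unchanged (the text == new_text branch).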
500 def __iter__(self):
501 in_a = False
502 in_skip_tag = None
503
504 token_buffer = []
505
506 for token in super(LinkifyFilter, self).__iter__():
507 if in_a:
508 # Handle the case where we're in an "a" tag--we want to buffer tokens
509 # until we hit an end "a" tag.
510 if token['type'] == 'EndTag' and token['name'] == 'a':
511 # Add the end tag to the token buffer and then handle them
512 # and yield anything returned
513 token_buffer.append(token)
514 for new_token in self.handle_a_tag(token_buffer):
515 yield new_token
516
517 # Clear "a" related state and continue since we've yielded all
518 # the tokens we're going to yield
519 in_a = False
520 token_buffer = []
521 else:
522 token_buffer.append(token)
523 continue
524
525 if token['type'] in ['StartTag', 'EmptyTag']:
526 if token['name'] in self.skip_tags:
527 # Skip tags start a "special mode" where we don't linkify
528 # anything until the end tag.
529 in_skip_tag = token['name']
530
531 elif token['name'] == 'a':
532 # The "a" tag is special--we switch to a slurp mode and
533 # slurp all the tokens until the end "a" tag and then
534 # figure out what to do with them there.
535 in_a = True
536 token_buffer.append(token)
537
538 # We buffer the start tag, so we don't want to yield it just
539 # yet
540 continue
541
542 elif in_skip_tag and self.skip_tags:
543 # NOTE(willkg): We put this clause here since in_a (and
544 # switching in and out of in_a) takes precedence.
545 if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
546 in_skip_tag = None
547
548 elif not in_a and not in_skip_tag and token['type'] == 'Characters':
549 new_stream = iter([token])
550 if self.parse_email:
551 new_stream = self.handle_email_addresses(new_stream)
552
553 new_stream = self.handle_links(new_stream)
554
555 for token in new_stream:
556 yield token
557
558 # We've already yielded this token, so continue
559 continue
560
561 yield token