Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo/lib/python3.7/site-packages/bleach/linkifier.py Fri Jul 31 00:18:57 2020 -0400 @@ -0,0 +1,561 @@ +from __future__ import unicode_literals +import re +import six + +from bleach import callbacks as linkify_callbacks +from bleach import html5lib_shim +from bleach.utils import alphabetize_attributes, force_unicode + + +#: List of default callbacks +DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] + + +TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az + ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat + cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk + dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg + gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il + im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp + kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk + ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne + net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post + pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl + sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to + tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws + xn xxx ye yt yu za zm zw""".split() + +# Make sure that .com doesn't get matched by .co first +TLDS.reverse() + + +def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols): + """Builds the url regex used by linkifier + + If you want a different set of tlds or allowed protocols, pass those in + and stomp on the existing ``url_re``:: + + from bleach import linkifier + + my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols) + + linker = LinkifyFilter(url_re=my_url_re) + + """ + return re.compile( + r"""\(* # Match any opening parentheses. + \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http:// + ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)? + (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))), + re.IGNORECASE | re.VERBOSE | re.UNICODE) + + +URL_RE = build_url_re() + + +PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) + + +def build_email_re(tlds=TLDS): + """Builds the email regex used by linkifier + + If you want a different set of tlds, pass those in and stomp on the existing ``email_re``:: + + from bleach import linkifier + + my_email_re = linkifier.build_email_re(my_tlds_list) + + linker = LinkifyFilter(email_re=my_url_re) + + """ + # open and closing braces doubled below for format string + return re.compile( + r"""(?<!//) + (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+ + (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom + |^"([\001-\010\013\014\016-\037!#-\[\]-\177] + |\\[\001-\011\013\014\016-\177])*" # quoted-string + )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain + """.format('|'.join(tlds)), + re.IGNORECASE | re.MULTILINE | re.VERBOSE) + + +EMAIL_RE = build_email_re() + + +class Linker(object): + """Convert URL-like strings in an HTML fragment to links + + This function converts strings that look like URLs, domain names and email + addresses in text that may be an HTML fragment to links, while preserving: + + 1. links already in the string + 2. urls found in attributes + 3. email addresses + + linkify does a best-effort approach and tries to recover from bad + situations due to crazy text. + + """ + def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False, + url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS): + """Creates a Linker instance + + :arg list callbacks: list of callbacks to run when adjusting tag attributes; + defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg re email_re: email matching regex + + :arg list-of-strings recognized_tags: the list of tags that linkify knows about; + everything else gets escaped + + :returns: linkified text as unicode + + """ + self.callbacks = callbacks + self.skip_tags = skip_tags + self.parse_email = parse_email + self.url_re = url_re + self.email_re = email_re + + # Create a parser/tokenizer that allows all HTML tags and escapes + # anything not in that list. + self.parser = html5lib_shim.BleachHTMLParser( + tags=recognized_tags, + strip=False, + consume_entities=True, + namespaceHTMLElements=False, + ) + self.walker = html5lib_shim.getTreeWalker('etree') + self.serializer = html5lib_shim.BleachHTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + + # linkify does not sanitize + sanitize=False, + + # linkify alphabetizes + alphabetical_attributes=False, + ) + + def linkify(self, text): + """Linkify specified text + + :arg str text: the text to add links to + + :returns: linkified text as unicode + + :raises TypeError: if ``text`` is not a text type + + """ + if not isinstance(text, six.string_types): + raise TypeError('argument must be of text type') + + text = force_unicode(text) + + if not text: + return '' + + dom = self.parser.parseFragment(text) + filtered = LinkifyFilter( + source=self.walker(dom), + callbacks=self.callbacks, + skip_tags=self.skip_tags, + parse_email=self.parse_email, + url_re=self.url_re, + email_re=self.email_re, + ) + return self.serializer.render(filtered) + + +class LinkifyFilter(html5lib_shim.Filter): + """html5lib filter that linkifies text + + This will do the following: + + * convert email addresses into links + * convert urls into links + * edit existing links by running them through callbacks--the default is to + add a ``rel="nofollow"`` + + This filter can be used anywhere html5lib filters can be used. + + """ + def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False, + url_re=URL_RE, email_re=EMAIL_RE): + """Creates a LinkifyFilter instance + + :arg TreeWalker source: stream + + :arg list callbacks: list of callbacks to run when adjusting tag attributes; + defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` + + :arg list skip_tags: list of tags that you don't want to linkify the + contents of; for example, you could set this to ``['pre']`` to skip + linkifying contents of ``pre`` tags + + :arg bool parse_email: whether or not to linkify email addresses + + :arg re url_re: url matching regex + + :arg re email_re: email matching regex + + """ + super(LinkifyFilter, self).__init__(source) + + self.callbacks = callbacks or [] + self.skip_tags = skip_tags or [] + self.parse_email = parse_email + + self.url_re = url_re + self.email_re = email_re + + def apply_callbacks(self, attrs, is_new): + """Given an attrs dict and an is_new bool, runs through callbacks + + Callbacks can return an adjusted attrs dict or ``None``. In the case of + ``None``, we stop going through callbacks and return that and the link + gets dropped. + + :arg dict attrs: map of ``(namespace, name)`` -> ``value`` + + :arg bool is_new: whether or not this link was added by linkify + + :returns: adjusted attrs dict or ``None`` + + """ + for cb in self.callbacks: + attrs = cb(attrs, is_new) + if attrs is None: + return None + return attrs + + def extract_character_data(self, token_list): + """Extracts and squashes character sequences in a token stream""" + # FIXME(willkg): This is a terrible idea. What it does is drop all the + # tags from the token list and merge the Characters and SpaceCharacters + # tokens into a single text. + # + # So something like this:: + # + # "<span>" "<b>" "some text" "</b>" "</span>" + # + # gets converted to "some text". + # + # This gets used to figure out the ``_text`` fauxttribute value for + # linkify callables. + # + # I'm not really sure how else to support that ``_text`` fauxttribute and + # maintain some modicum of backwards compatibility with previous versions + # of Bleach. + + out = [] + for token in token_list: + token_type = token['type'] + if token_type in ['Characters', 'SpaceCharacters']: + out.append(token['data']) + + return ''.join(out) + + def handle_email_addresses(self, src_iter): + """Handle email addresses in character tokens""" + for token in src_iter: + if token['type'] == 'Characters': + text = token['data'] + new_tokens = [] + end = 0 + + # For each email address we find in the text + for match in self.email_re.finditer(text): + if match.start() > end: + new_tokens.append( + {'type': 'Characters', 'data': text[end:match.start()]} + ) + + # Run attributes through the callbacks to see what we + # should do with this match + attrs = { + (None, 'href'): 'mailto:%s' % match.group(0), + '_text': match.group(0) + } + attrs = self.apply_callbacks(attrs, True) + + if attrs is None: + # Just add the text--but not as a link + new_tokens.append( + {'type': 'Characters', 'data': match.group(0)} + ) + + else: + # Add an "a" tag for the new link + _text = attrs.pop('_text', '') + attrs = alphabetize_attributes(attrs) + new_tokens.extend([ + {'type': 'StartTag', 'name': 'a', 'data': attrs}, + {'type': 'Characters', 'data': force_unicode(_text)}, + {'type': 'EndTag', 'name': 'a'} + ]) + end = match.end() + + if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop + if end < len(text): + new_tokens.append({'type': 'Characters', 'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def strip_non_url_bits(self, fragment): + """Strips non-url bits from the url + + This accounts for over-eager matching by the regex. + + """ + prefix = suffix = '' + + while fragment: + # Try removing ( from the beginning and, if it's balanced, from the + # end, too + if fragment.startswith('('): + prefix = prefix + '(' + fragment = fragment[1:] + + if fragment.endswith(')'): + suffix = ')' + suffix + fragment = fragment[:-1] + continue + + # Now try extraneous things from the end. For example, sometimes we + # pick up ) at the end of a url, but the url is in a parenthesized + # phrase like: + # + # "i looked at the site (at http://example.com)" + + if fragment.endswith(')') and '(' not in fragment: + fragment = fragment[:-1] + suffix = ')' + suffix + continue + + # Handle commas + if fragment.endswith(','): + fragment = fragment[:-1] + suffix = ',' + suffix + continue + + # Handle periods + if fragment.endswith('.'): + fragment = fragment[:-1] + suffix = '.' + suffix + continue + + # Nothing matched, so we're done + break + + return fragment, prefix, suffix + + def handle_links(self, src_iter): + """Handle links in character tokens""" + in_a = False # happens, if parse_email=True and if a mail was found + for token in src_iter: + if in_a: + if token['type'] == 'EndTag' and token['name'] == 'a': + in_a = False + yield token + continue + elif token['type'] == 'StartTag' and token['name'] == 'a': + in_a = True + yield token + continue + if token['type'] == 'Characters': + text = token['data'] + new_tokens = [] + end = 0 + + for match in self.url_re.finditer(text): + if match.start() > end: + new_tokens.append( + {'type': 'Characters', 'data': text[end:match.start()]} + ) + + url = match.group(0) + prefix = suffix = '' + + # Sometimes we pick up too much in the url match, so look for + # bits we should drop and remove them from the match + url, prefix, suffix = self.strip_non_url_bits(url) + + # If there's no protocol, add one + if PROTO_RE.search(url): + href = url + else: + href = 'http://%s' % url + + attrs = { + (None, 'href'): href, + '_text': url + } + attrs = self.apply_callbacks(attrs, True) + + if attrs is None: + # Just add the text + new_tokens.append( + {'type': 'Characters', 'data': prefix + url + suffix} + ) + + else: + # Add the "a" tag! + if prefix: + new_tokens.append( + {'type': 'Characters', 'data': prefix} + ) + + _text = attrs.pop('_text', '') + attrs = alphabetize_attributes(attrs) + + new_tokens.extend([ + {'type': 'StartTag', 'name': 'a', 'data': attrs}, + {'type': 'Characters', 'data': force_unicode(_text)}, + {'type': 'EndTag', 'name': 'a'}, + ]) + + if suffix: + new_tokens.append( + {'type': 'Characters', 'data': suffix} + ) + + end = match.end() + + if new_tokens: + # Yield the adjusted set of tokens and then continue + # through the loop + if end < len(text): + new_tokens.append({'type': 'Characters', 'data': text[end:]}) + + for new_token in new_tokens: + yield new_token + + continue + + yield token + + def handle_a_tag(self, token_buffer): + """Handle the "a" tag + + This could adjust the link or drop it altogether depending on what the + callbacks return. + + This yields the new set of tokens. + + """ + a_token = token_buffer[0] + if a_token['data']: + attrs = a_token['data'] + else: + attrs = {} + text = self.extract_character_data(token_buffer) + attrs['_text'] = text + + attrs = self.apply_callbacks(attrs, False) + + if attrs is None: + # We're dropping the "a" tag and everything else and replacing + # it with character data. So emit that token. + yield {'type': 'Characters', 'data': text} + + else: + new_text = attrs.pop('_text', '') + a_token['data'] = alphabetize_attributes(attrs) + + if text == new_text: + # The callbacks didn't change the text, so we yield the new "a" + # token, then whatever else was there, then the end "a" token + yield a_token + for mem in token_buffer[1:]: + yield mem + + else: + # If the callbacks changed the text, then we're going to drop + # all the tokens between the start and end "a" tags and replace + # it with the new text + yield a_token + yield {'type': 'Characters', 'data': force_unicode(new_text)} + yield token_buffer[-1] + + def __iter__(self): + in_a = False + in_skip_tag = None + + token_buffer = [] + + for token in super(LinkifyFilter, self).__iter__(): + if in_a: + # Handle the case where we're in an "a" tag--we want to buffer tokens + # until we hit an end "a" tag. + if token['type'] == 'EndTag' and token['name'] == 'a': + # Add the end tag to the token buffer and then handle them + # and yield anything returned + token_buffer.append(token) + for new_token in self.handle_a_tag(token_buffer): + yield new_token + + # Clear "a" related state and continue since we've yielded all + # the tokens we're going to yield + in_a = False + token_buffer = [] + else: + token_buffer.append(token) + continue + + if token['type'] in ['StartTag', 'EmptyTag']: + if token['name'] in self.skip_tags: + # Skip tags start a "special mode" where we don't linkify + # anything until the end tag. + in_skip_tag = token['name'] + + elif token['name'] == 'a': + # The "a" tag is special--we switch to a slurp mode and + # slurp all the tokens until the end "a" tag and then + # figure out what to do with them there. + in_a = True + token_buffer.append(token) + + # We buffer the start tag, so we don't want to yield it, + # yet + continue + + elif in_skip_tag and self.skip_tags: + # NOTE(willkg): We put this clause here since in_a and + # switching in and out of in_a takes precedence. + if token['type'] == 'EndTag' and token['name'] == in_skip_tag: + in_skip_tag = None + + elif not in_a and not in_skip_tag and token['type'] == 'Characters': + new_stream = iter([token]) + if self.parse_email: + new_stream = self.handle_email_addresses(new_stream) + + new_stream = self.handle_links(new_stream) + + for token in new_stream: + yield token + + # We've already yielded this token, so continue + continue + + yield token