diff env/lib/python3.7/site-packages/bleach/linkifier.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
--- a/env/lib/python3.7/site-packages/bleach/linkifier.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,561 +0,0 @@
-from __future__ import unicode_literals
-import re
-import six
-
-from bleach import callbacks as linkify_callbacks
-from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode
-
-
-#: List of default callbacks
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
-
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-
-def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
-    """Builds the url regex used by linkifier
-
-    If you want a different set of tlds or allowed protocols, pass those in
-    and stomp on the existing ``url_re``::
-
-       from bleach import linkifier
-
-       my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
-
-       linker = LinkifyFilter(url_re=my_url_re)
-
-    """
-    return re.compile(
-        r"""\(*  # Match any opening parentheses.
-        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-            # /path/zz (excluding "unsafe" chars from RFC 1738,
-            # except for # and ~, which happen in practice)
-        """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))),
-        re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-
-URL_RE = build_url_re()
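# Editor's note: a minimal usage sketch (not part of the original module) showing
# how build_url_re() feeds into the Linker class defined further down in this
# file. The TLD and protocol lists here are arbitrary examples.
from bleach.linkifier import Linker, build_url_re

my_url_re = build_url_re(tlds=['com', 'org'], protocols=['http', 'https'])
linker = Linker(url_re=my_url_re)

# "example.com" is linkified; "example.io" is left alone because "io" is not
# in the custom TLD list.
html = linker.linkify('see example.com and example.io')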
-
-
-PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-
-def build_email_re(tlds=TLDS):
-    """Builds the email regex used by linkifier
-
-    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
-
-       from bleach import linkifier
-
-       my_email_re = linkifier.build_email_re(my_tlds_list)
-
-       linker = LinkifyFilter(email_re=my_email_re)
-
-    """
-    # opening and closing braces are doubled below because of the format string
-    return re.compile(
-        r"""(?<!//)
-        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
-            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
-        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
-            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
-        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
-        """.format('|'.join(tlds)),
-        re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-
-
-EMAIL_RE = build_email_re()
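# Editor's note: an analogous sketch for build_email_re() (not part of the
# original module); email linkification also requires parse_email=True.
from bleach.linkifier import Linker, build_email_re

my_email_re = build_email_re(tlds=['com', 'org'])
linker = Linker(parse_email=True, email_re=my_email_re)

# "help@example.com" becomes a mailto: link; plain URLs are still handled by URL_RE.
html = linker.linkify('contact help@example.com')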
-
-
-class Linker(object):
-    """Convert URL-like strings in an HTML fragment to links
-
-    This converts strings that look like URLs, domain names, and email
-    addresses in text that may be an HTML fragment into links, while
-    preserving:
-
-    1. links already in the string
-    2. urls found in attributes
-    3. email addresses
-
-    linkify takes a best-effort approach and tries to recover from badly
-    formed or unusual text.
-
-    """
-    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
-                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
-        """Creates a Linker instance
-
-        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
-            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
-
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
-
-        :arg bool parse_email: whether or not to linkify email addresses
-
-        :arg re url_re: url matching regex
-
-        :arg re email_re: email matching regex
-
-        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
-            everything else gets escaped
-
-        """
-        self.callbacks = callbacks
-        self.skip_tags = skip_tags
-        self.parse_email = parse_email
-        self.url_re = url_re
-        self.email_re = email_re
-
-        # Create a parser/tokenizer that allows all HTML tags and escapes
-        # anything not in that list.
-        self.parser = html5lib_shim.BleachHTMLParser(
-            tags=recognized_tags,
-            strip=False,
-            consume_entities=True,
-            namespaceHTMLElements=False,
-        )
-        self.walker = html5lib_shim.getTreeWalker('etree')
-        self.serializer = html5lib_shim.BleachHTMLSerializer(
-            quote_attr_values='always',
-            omit_optional_tags=False,
-
-            # linkify does not sanitize
-            sanitize=False,
-
-            # linkify alphabetizes attributes itself (via alphabetize_attributes),
-            # so the serializer doesn't reorder them
-            alphabetical_attributes=False,
-        )
-
-    def linkify(self, text):
-        """Linkify specified text
-
-        :arg str text: the text to add links to
-
-        :returns: linkified text as unicode
-
-        :raises TypeError: if ``text`` is not a text type
-
-        """
-        if not isinstance(text, six.string_types):
-            raise TypeError('argument must be of text type')
-
-        text = force_unicode(text)
-
-        if not text:
-            return ''
-
-        dom = self.parser.parseFragment(text)
-        filtered = LinkifyFilter(
-            source=self.walker(dom),
-            callbacks=self.callbacks,
-            skip_tags=self.skip_tags,
-            parse_email=self.parse_email,
-            url_re=self.url_re,
-            email_re=self.email_re,
-        )
-        return self.serializer.render(filtered)
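# Editor's note: a short usage sketch for Linker (not part of the original
# module). skip_tags and parse_email are the options most callers touch.
from bleach.linkifier import Linker

linker = Linker(skip_tags=['pre'], parse_email=True)

# URLs, bare domains and email addresses in character data get wrapped in
# <a> tags; text inside <pre> is left untouched.
html = linker.linkify('docs at example.com <pre>example.org</pre>')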
-
-
-class LinkifyFilter(html5lib_shim.Filter):
-    """html5lib filter that linkifies text
-
-    This will do the following:
-
-    * convert email addresses into links
-    * convert urls into links
-    * edit existing links by running them through callbacks--the default is to
-      add a ``rel="nofollow"``
-
-    This filter can be used anywhere html5lib filters can be used.
-
-    """
-    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
-                 url_re=URL_RE, email_re=EMAIL_RE):
-        """Creates a LinkifyFilter instance
-
-        :arg TreeWalker source: stream
-
-        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
-            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
-
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
-
-        :arg bool parse_email: whether or not to linkify email addresses
-
-        :arg re url_re: url matching regex
-
-        :arg re email_re: email matching regex
-
-        """
-        super(LinkifyFilter, self).__init__(source)
-
-        self.callbacks = callbacks or []
-        self.skip_tags = skip_tags or []
-        self.parse_email = parse_email
-
-        self.url_re = url_re
-        self.email_re = email_re
-
-    def apply_callbacks(self, attrs, is_new):
-        """Given an attrs dict and an is_new bool, runs through callbacks
-
-        Callbacks can return an adjusted attrs dict or ``None``. If a callback
-        returns ``None``, we stop running the remaining callbacks and return
-        ``None``, and the link gets dropped.
-
-        :arg dict attrs: map of ``(namespace, name)`` -> ``value``
-
-        :arg bool is_new: whether or not this link was added by linkify
-
-        :returns: adjusted attrs dict or ``None``
-
-        """
-        for cb in self.callbacks:
-            attrs = cb(attrs, is_new)
-            if attrs is None:
-                return None
-        return attrs
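# Editor's note: two illustrative callbacks (not part of the original module).
# As apply_callbacks shows, each callback receives the attrs dict (keys are
# (namespace, name) tuples plus the special '_text' key) and the is_new flag,
# and returning None drops the link entirely.
def target_blank(attrs, new):
    # Only touch links that linkify itself created.
    if new:
        attrs[(None, 'target')] = '_blank'
    return attrs

def drop_mailto(attrs, new):
    # Returning None makes apply_callbacks abandon the link.
    if attrs.get((None, 'href'), '').startswith('mailto:'):
        return None
    return attrs

# Usage: Linker(callbacks=[target_blank, drop_mailto]).linkify(text)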
-
-    def extract_character_data(self, token_list):
-        """Extracts and squashes character sequences in a token stream"""
-        # FIXME(willkg): This is a terrible idea. What it does is drop all the
-        # tags from the token list and merge the Characters and SpaceCharacters
-        # tokens into a single text.
-        #
-        # So something like this::
-        #
-        #     "<span>" "<b>" "some text" "</b>" "</span>"
-        #
-        # gets converted to "some text".
-        #
-        # This gets used to figure out the ``_text`` fauxttribute value for
-        # linkify callables.
-        #
-        # I'm not really sure how else to support that ``_text`` fauxttribute and
-        # maintain some modicum of backwards compatibility with previous versions
-        # of Bleach.
-
-        out = []
-        for token in token_list:
-            token_type = token['type']
-            if token_type in ['Characters', 'SpaceCharacters']:
-                out.append(token['data'])
-
-        return ''.join(out)
-
-    def handle_email_addresses(self, src_iter):
-        """Handle email addresses in character tokens"""
-        for token in src_iter:
-            if token['type'] == 'Characters':
-                text = token['data']
-                new_tokens = []
-                end = 0
-
-                # For each email address we find in the text
-                for match in self.email_re.finditer(text):
-                    if match.start() > end:
-                        new_tokens.append(
-                            {'type': 'Characters', 'data': text[end:match.start()]}
-                        )
-
-                    # Run attributes through the callbacks to see what we
-                    # should do with this match
-                    attrs = {
-                        (None, 'href'): 'mailto:%s' % match.group(0),
-                        '_text': match.group(0)
-                    }
-                    attrs = self.apply_callbacks(attrs, True)
-
-                    if attrs is None:
-                        # Just add the text--but not as a link
-                        new_tokens.append(
-                            {'type': 'Characters', 'data': match.group(0)}
-                        )
-
-                    else:
-                        # Add an "a" tag for the new link
-                        _text = attrs.pop('_text', '')
-                        attrs = alphabetize_attributes(attrs)
-                        new_tokens.extend([
-                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
-                            {'type': 'Characters', 'data': force_unicode(_text)},
-                            {'type': 'EndTag', 'name': 'a'}
-                        ])
-                    end = match.end()
-
-                if new_tokens:
-                    # Yield the adjusted set of tokens and then continue
-                    # through the loop
-                    if end < len(text):
-                        new_tokens.append({'type': 'Characters', 'data': text[end:]})
-
-                    for new_token in new_tokens:
-                        yield new_token
-
-                    continue
-
-            yield token
-
-    def strip_non_url_bits(self, fragment):
-        """Strips non-url bits from the url
-
-        This accounts for over-eager matching by the regex.
-
-        """
-        prefix = suffix = ''
-
-        while fragment:
-            # Try removing ( from the beginning and, if it's balanced, from the
-            # end, too
-            if fragment.startswith('('):
-                prefix = prefix + '('
-                fragment = fragment[1:]
-
-                if fragment.endswith(')'):
-                    suffix = ')' + suffix
-                    fragment = fragment[:-1]
-                continue
-
-            # Now try extraneous things from the end. For example, sometimes we
-            # pick up ) at the end of a url, but the url is in a parenthesized
-            # phrase like:
-            #
-            #     "i looked at the site (at http://example.com)"
-
-            if fragment.endswith(')') and '(' not in fragment:
-                fragment = fragment[:-1]
-                suffix = ')' + suffix
-                continue
-
-            # Handle commas
-            if fragment.endswith(','):
-                fragment = fragment[:-1]
-                suffix = ',' + suffix
-                continue
-
-            # Handle periods
-            if fragment.endswith('.'):
-                fragment = fragment[:-1]
-                suffix = '.' + suffix
-                continue
-
-            # Nothing matched, so we're done
-            break
-
-        return fragment, prefix, suffix
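# Editor's note: an illustrative call to strip_non_url_bits (not part of the
# original module). The method keeps no state, so a throwaway LinkifyFilter
# with an empty source is enough to demonstrate how punctuation from a
# parenthesized phrase is peeled off into prefix/suffix.
filt = LinkifyFilter(source=iter([]))
fragment, prefix, suffix = filt.strip_non_url_bits('(http://example.com).')
assert (prefix, fragment, suffix) == ('(', 'http://example.com', ').')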
-
-    def handle_links(self, src_iter):
-        """Handle links in character tokens"""
-        # in_a is True while we're inside an "a" tag that handle_email_addresses
-        # already emitted (only possible when parse_email=True)
-        in_a = False
-        for token in src_iter:
-            if in_a:
-                if token['type'] == 'EndTag' and token['name'] == 'a':
-                    in_a = False
-                yield token
-                continue
-            elif token['type'] == 'StartTag' and token['name'] == 'a':
-                in_a = True
-                yield token
-                continue
-            if token['type'] == 'Characters':
-                text = token['data']
-                new_tokens = []
-                end = 0
-
-                for match in self.url_re.finditer(text):
-                    if match.start() > end:
-                        new_tokens.append(
-                            {'type': 'Characters', 'data': text[end:match.start()]}
-                        )
-
-                    url = match.group(0)
-                    prefix = suffix = ''
-
-                    # Sometimes we pick up too much in the url match, so look for
-                    # bits we should drop and remove them from the match
-                    url, prefix, suffix = self.strip_non_url_bits(url)
-
-                    # If there's no protocol, add one
-                    if PROTO_RE.search(url):
-                        href = url
-                    else:
-                        href = 'http://%s' % url
-
-                    attrs = {
-                        (None, 'href'): href,
-                        '_text': url
-                    }
-                    attrs = self.apply_callbacks(attrs, True)
-
-                    if attrs is None:
-                        # Just add the text
-                        new_tokens.append(
-                            {'type': 'Characters', 'data': prefix + url + suffix}
-                        )
-
-                    else:
-                        # Add the "a" tag!
-                        if prefix:
-                            new_tokens.append(
-                                {'type': 'Characters', 'data': prefix}
-                            )
-
-                        _text = attrs.pop('_text', '')
-                        attrs = alphabetize_attributes(attrs)
-
-                        new_tokens.extend([
-                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
-                            {'type': 'Characters', 'data': force_unicode(_text)},
-                            {'type': 'EndTag', 'name': 'a'},
-                        ])
-
-                        if suffix:
-                            new_tokens.append(
-                                {'type': 'Characters', 'data': suffix}
-                            )
-
-                    end = match.end()
-
-                if new_tokens:
-                    # Yield the adjusted set of tokens and then continue
-                    # through the loop
-                    if end < len(text):
-                        new_tokens.append({'type': 'Characters', 'data': text[end:]})
-
-                    for new_token in new_tokens:
-                        yield new_token
-
-                    continue
-
-            yield token
-
-    def handle_a_tag(self, token_buffer):
-        """Handle the "a" tag
-
-        This could adjust the link or drop it altogether depending on what the
-        callbacks return.
-
-        This yields the new set of tokens.
-
-        """
-        a_token = token_buffer[0]
-        if a_token['data']:
-            attrs = a_token['data']
-        else:
-            attrs = {}
-        text = self.extract_character_data(token_buffer)
-        attrs['_text'] = text
-
-        attrs = self.apply_callbacks(attrs, False)
-
-        if attrs is None:
-            # We're dropping the "a" tag and everything else and replacing
-            # it with character data. So emit that token.
-            yield {'type': 'Characters', 'data': text}
-
-        else:
-            new_text = attrs.pop('_text', '')
-            a_token['data'] = alphabetize_attributes(attrs)
-
-            if text == new_text:
-                # The callbacks didn't change the text, so we yield the new "a"
-                # token, then whatever else was there, then the end "a" token
-                yield a_token
-                for mem in token_buffer[1:]:
-                    yield mem
-
-            else:
-                # If the callbacks changed the text, then we're going to drop
-                # all the tokens between the start and end "a" tags and replace
-                # it with the new text
-                yield a_token
-                yield {'type': 'Characters', 'data': force_unicode(new_text)}
-                yield token_buffer[-1]
-
-    def __iter__(self):
-        in_a = False
-        in_skip_tag = None
-
-        token_buffer = []
-
-        for token in super(LinkifyFilter, self).__iter__():
-            if in_a:
-                # Handle the case where we're in an "a" tag--we want to buffer tokens
-                # until we hit an end "a" tag.
-                if token['type'] == 'EndTag' and token['name'] == 'a':
-                    # Add the end tag to the token buffer and then handle them
-                    # and yield anything returned
-                    token_buffer.append(token)
-                    for new_token in self.handle_a_tag(token_buffer):
-                        yield new_token
-
-                    # Clear "a" related state and continue since we've yielded all
-                    # the tokens we're going to yield
-                    in_a = False
-                    token_buffer = []
-                else:
-                    token_buffer.append(token)
-                continue
-
-            if token['type'] in ['StartTag', 'EmptyTag']:
-                if token['name'] in self.skip_tags:
-                    # Skip tags start a "special mode" where we don't linkify
-                    # anything until the end tag.
-                    in_skip_tag = token['name']
-
-                elif token['name'] == 'a':
-                    # The "a" tag is special--we switch to a slurp mode and
-                    # slurp all the tokens until the end "a" tag and then
-                    # figure out what to do with them there.
-                    in_a = True
-                    token_buffer.append(token)
-
-                    # We buffer the start tag, so we don't want to yield it yet
-                    continue
-
-            elif in_skip_tag and self.skip_tags:
-                # NOTE(willkg): We put this clause here since in_a and
-                # switching in and out of in_a takes precedence.
-                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
-                    in_skip_tag = None
-
-            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
-                new_stream = iter([token])
-                if self.parse_email:
-                    new_stream = self.handle_email_addresses(new_stream)
-
-                new_stream = self.handle_links(new_stream)
-
-                for token in new_stream:
-                    yield token
-
-                # We've already yielded this token, so continue
-                continue
-
-            yield token
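# Editor's note: a sketch of driving LinkifyFilter by hand (not part of the
# original module). It mirrors Linker.linkify above: parse a fragment, walk the
# tree, run the filter, and serialize the result.
from bleach import html5lib_shim
from bleach.linkifier import LinkifyFilter

parser = html5lib_shim.BleachHTMLParser(
    tags=html5lib_shim.HTML_TAGS,
    strip=False,
    consume_entities=True,
    namespaceHTMLElements=False,
)
walker = html5lib_shim.getTreeWalker('etree')
serializer = html5lib_shim.BleachHTMLSerializer(
    quote_attr_values='always',
    omit_optional_tags=False,
    sanitize=False,
    alphabetical_attributes=False,
)

dom = parser.parseFragment('see example.com')
filtered = LinkifyFilter(source=walker(dom), parse_email=True)
html = serializer.render(filtered)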