diff env/lib/python3.7/site-packages/lxml/html/clean.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/lxml/html/clean.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,754 +0,0 @@
-# cython: language_level=2
-
-"""A cleanup tool for HTML.
-
-Removes unwanted tags and content.  See the `Cleaner` class for
-details.
-"""
-
-from __future__ import absolute_import
-
-import re
-import copy
-try:
-    from urlparse import urlsplit
-    from urllib import unquote_plus
-except ImportError:
-    # Python 3
-    from urllib.parse import urlsplit, unquote_plus
-from lxml import etree
-from lxml.html import defs
-from lxml.html import fromstring, XHTML_NAMESPACE
-from lxml.html import xhtml_to_html, _transform_result
-
-try:
-    unichr
-except NameError:
-    # Python 3
-    unichr = chr
-try:
-    unicode
-except NameError:
-    # Python 3
-    unicode = str
-try:
-    basestring
-except NameError:
-    basestring = (str, bytes)
-
-
-__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
-           'word_break', 'word_break_html']
-
-# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
-# I have multiple kinds of schemes searched; but should schemes be
-#   whitelisted instead?
-# max height?
-# remove images?  Also in CSS?  background attribute?
-# Some way to whitelist object, iframe, etc (e.g., if you want to
-#   allow *just* embedded YouTube movies)
-# Log what was deleted and why?
-# style="behavior: ..." might be bad in IE?
-# Should we have something for just <meta http-equiv>?  That's the worst of the
-#   metas.
-# UTF-7 detections?  Example:
-#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
-#   you don't always have to have the charset set, if the page has no charset
-#   and there's UTF7-like code in it.
-# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
-
-
-# This is an IE-specific construct you can have in a stylesheet to
-# run some Javascript:
-_css_javascript_re = re.compile(
-    r'expression\s*\(.*?\)', re.S|re.I)
-
-# Do I have to worry about @\nimport?
-_css_import_re = re.compile(
-    r'@\s*import', re.I)
-
-# All kinds of schemes besides just javascript: that can cause
-# execution:
-_is_image_dataurl = re.compile(
-    r'^data:image/.+;base64', re.I).search
-_is_possibly_malicious_scheme = re.compile(
-    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
-    re.I).search
-def _is_javascript_scheme(s):
-    if _is_image_dataurl(s):
-        return None
-    return _is_possibly_malicious_scheme(s)
-
-_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
-# FIXME: should data: be blocked?
-
-# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
-_conditional_comment_re = re.compile(
-    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
-
-_find_styled_elements = etree.XPath(
-    "descendant-or-self::*[@style]")
-
-_find_external_links = etree.XPath(
-    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
-     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
-    namespaces={'x':XHTML_NAMESPACE})
-
-
-class Cleaner(object):
-    """
-    Instances cleans the document of each of the possible offending
-    elements.  The cleaning is controlled by attributes; you can
-    override attributes in a subclass, or set them in the constructor.
-
-    ``scripts``:
-        Removes any ``<script>`` tags.
-
-    ``javascript``:
-        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
-        as they could contain Javascript.
-
-    ``comments``:
-        Removes any comments.
-
-    ``style``:
-        Removes any style tags.
-
-    ``inline_style``
-        Removes any style attributes.  Defaults to the value of the ``style`` option.
-
-    ``links``:
-        Removes any ``<link>`` tags
-
-    ``meta``:
-        Removes any ``<meta>`` tags
-
-    ``page_structure``:
-        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
-
-    ``processing_instructions``:
-        Removes any processing instructions.
-
-    ``embedded``:
-        Removes any embedded objects (flash, iframes)
-
-    ``frames``:
-        Removes any frame-related tags
-
-    ``forms``:
-        Removes any form tags
-
-    ``annoying_tags``:
-        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``
-
-    ``remove_tags``:
-        A list of tags to remove.  Only the tags will be removed,
-        their content will get pulled up into the parent tag.
-
-    ``kill_tags``:
-        A list of tags to kill.  Killing also removes the tag's content,
-        i.e. the whole subtree, not just the tag itself.
-
-    ``allow_tags``:
-        A list of tags to include (default include all).
-
-    ``remove_unknown_tags``:
-        Remove any tags that aren't standard parts of HTML.
-
-    ``safe_attrs_only``:
-        If true, only include 'safe' attributes (specifically the list
-        from the feedparser HTML sanitisation web site).
-
-    ``safe_attrs``:
-        A set of attribute names to override the default list of attributes
-        considered 'safe' (when safe_attrs_only=True).
-
-    ``add_nofollow``:
-        If true, then any <a> tags will have ``rel="nofollow"`` added to them.
-
-    ``host_whitelist``:
-        A list or set of hosts that you can use for embedded content
-        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
-        You can also implement/override the method
-        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
-        implement more complex rules for what can be embedded.
-        Anything that passes this test will be shown, regardless of
-        the value of (for instance) ``embedded``.
-
-        Note that this parameter might not work as intended if you do not
-        make the links absolute before doing the cleaning.
-
-        Note that you may also need to set ``whitelist_tags``.
-
-    ``whitelist_tags``:
-        A set of tags that can be included with ``host_whitelist``.
-        The default is ``iframe`` and ``embed``; you may wish to
-        include other tags like ``script``, or you may want to
-        implement ``allow_embedded_url`` for more control.  Set to None to
-        include all tags.
-
-    This modifies the document *in place*.
-    """
-
-    scripts = True
-    javascript = True
-    comments = True
-    style = False
-    inline_style = None
-    links = True
-    meta = True
-    page_structure = True
-    processing_instructions = True
-    embedded = True
-    frames = True
-    forms = True
-    annoying_tags = True
-    remove_tags = None
-    allow_tags = None
-    kill_tags = None
-    remove_unknown_tags = True
-    safe_attrs_only = True
-    safe_attrs = defs.safe_attrs
-    add_nofollow = False
-    host_whitelist = ()
-    whitelist_tags = {'iframe', 'embed'}
-
-    def __init__(self, **kw):
-        for name, value in kw.items():
-            if not hasattr(self, name):
-                raise TypeError(
-                    "Unknown parameter: %s=%r" % (name, value))
-            setattr(self, name, value)
-        if self.inline_style is None and 'inline_style' not in kw:
-            self.inline_style = self.style
-
-    # Used to lookup the primary URL for a given tag that is up for
-    # removal:
-    _tag_link_attrs = dict(
-        script='src',
-        link='href',
-        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
-        # From what I can tell, both attributes can contain a link:
-        applet=['code', 'object'],
-        iframe='src',
-        embed='src',
-        layer='src',
-        # FIXME: there doesn't really seem like a general way to figure out what
-        # links an <object> tag uses; links often go in <param> tags with values
-        # that we don't really know.  You'd have to have knowledge about specific
-        # kinds of plugins (probably keyed off classid), and match against those.
-        ##object=?,
-        # FIXME: not looking at the action currently, because it is more complex
-        # than than -- if you keep the form, you should keep the form controls.
-        ##form='action',
-        a='href',
-        )
-
-    def __call__(self, doc):
-        """
-        Cleans the document.
-        """
-        if hasattr(doc, 'getroot'):
-            # ElementTree instance, instead of an element
-            doc = doc.getroot()
-        # convert XHTML to HTML
-        xhtml_to_html(doc)
-        # Normalize a case that IE treats <image> like <img>, and that
-        # can confuse either this step or later steps.
-        for el in doc.iter('image'):
-            el.tag = 'img'
-        if not self.comments:
-            # Of course, if we were going to kill comments anyway, we don't
-            # need to worry about this
-            self.kill_conditional_comments(doc)
-
-        kill_tags = set(self.kill_tags or ())
-        remove_tags = set(self.remove_tags or ())
-        allow_tags = set(self.allow_tags or ())
-
-        if self.scripts:
-            kill_tags.add('script')
-        if self.safe_attrs_only:
-            safe_attrs = set(self.safe_attrs)
-            for el in doc.iter(etree.Element):
-                attrib = el.attrib
-                for aname in attrib.keys():
-                    if aname not in safe_attrs:
-                        del attrib[aname]
-        if self.javascript:
-            if not (self.safe_attrs_only and
-                    self.safe_attrs == defs.safe_attrs):
-                # safe_attrs handles events attributes itself
-                for el in doc.iter(etree.Element):
-                    attrib = el.attrib
-                    for aname in attrib.keys():
-                        if aname.startswith('on'):
-                            del attrib[aname]
-            doc.rewrite_links(self._remove_javascript_link,
-                              resolve_base_href=False)
-            # If we're deleting style then we don't have to remove JS links
-            # from styles, otherwise...
-            if not self.inline_style:
-                for el in _find_styled_elements(doc):
-                    old = el.get('style')
-                    new = _css_javascript_re.sub('', old)
-                    new = _css_import_re.sub('', new)
-                    if self._has_sneaky_javascript(new):
-                        # Something tricky is going on...
-                        del el.attrib['style']
-                    elif new != old:
-                        el.set('style', new)
-            if not self.style:
-                for el in list(doc.iter('style')):
-                    if el.get('type', '').lower().strip() == 'text/javascript':
-                        el.drop_tree()
-                        continue
-                    old = el.text or ''
-                    new = _css_javascript_re.sub('', old)
-                    # The imported CSS can do anything; we just can't allow:
-                    new = _css_import_re.sub('', old)
-                    if self._has_sneaky_javascript(new):
-                        # Something tricky is going on...
-                        el.text = '/* deleted */'
-                    elif new != old:
-                        el.text = new
-        if self.comments or self.processing_instructions:
-            # FIXME: why either?  I feel like there's some obscure reason
-            # because you can put PIs in comments...?  But I've already
-            # forgotten it
-            kill_tags.add(etree.Comment)
-        if self.processing_instructions:
-            kill_tags.add(etree.ProcessingInstruction)
-        if self.style:
-            kill_tags.add('style')
-        if self.inline_style:
-            etree.strip_attributes(doc, 'style')
-        if self.links:
-            kill_tags.add('link')
-        elif self.style or self.javascript:
-            # We must get rid of included stylesheets if Javascript is not
-            # allowed, as you can put Javascript in them
-            for el in list(doc.iter('link')):
-                if 'stylesheet' in el.get('rel', '').lower():
-                    # Note this kills alternate stylesheets as well
-                    if not self.allow_element(el):
-                        el.drop_tree()
-        if self.meta:
-            kill_tags.add('meta')
-        if self.page_structure:
-            remove_tags.update(('head', 'html', 'title'))
-        if self.embedded:
-            # FIXME: is <layer> really embedded?
-            # We should get rid of any <param> tags not inside <applet>;
-            # These are not really valid anyway.
-            for el in list(doc.iter('param')):
-                found_parent = False
-                parent = el.getparent()
-                while parent is not None and parent.tag not in ('applet', 'object'):
-                    parent = parent.getparent()
-                if parent is None:
-                    el.drop_tree()
-            kill_tags.update(('applet',))
-            # The alternate contents that are in an iframe are a good fallback:
-            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
-        if self.frames:
-            # FIXME: ideally we should look at the frame links, but
-            # generally frames don't mix properly with an HTML
-            # fragment anyway.
-            kill_tags.update(defs.frame_tags)
-        if self.forms:
-            remove_tags.add('form')
-            kill_tags.update(('button', 'input', 'select', 'textarea'))
-        if self.annoying_tags:
-            remove_tags.update(('blink', 'marquee'))
-
-        _remove = []
-        _kill = []
-        for el in doc.iter():
-            if el.tag in kill_tags:
-                if self.allow_element(el):
-                    continue
-                _kill.append(el)
-            elif el.tag in remove_tags:
-                if self.allow_element(el):
-                    continue
-                _remove.append(el)
-
-        if _remove and _remove[0] == doc:
-            # We have to drop the parent-most tag, which we can't
-            # do.  Instead we'll rewrite it:
-            el = _remove.pop(0)
-            el.tag = 'div'
-            el.attrib.clear()
-        elif _kill and _kill[0] == doc:
-            # We have to drop the parent-most element, which we can't
-            # do.  Instead we'll clear it:
-            el = _kill.pop(0)
-            if el.tag != 'html':
-                el.tag = 'div'
-            el.clear()
-
-        _kill.reverse() # start with innermost tags
-        for el in _kill:
-            el.drop_tree()
-        for el in _remove:
-            el.drop_tag()
-
-        if self.remove_unknown_tags:
-            if allow_tags:
-                raise ValueError(
-                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
-            allow_tags = set(defs.tags)
-        if allow_tags:
-            bad = []
-            for el in doc.iter():
-                if el.tag not in allow_tags:
-                    bad.append(el)
-            if bad:
-                if bad[0] is doc:
-                    el = bad.pop(0)
-                    el.tag = 'div'
-                    el.attrib.clear()
-                for el in bad:
-                    el.drop_tag()
-        if self.add_nofollow:
-            for el in _find_external_links(doc):
-                if not self.allow_follow(el):
-                    rel = el.get('rel')
-                    if rel:
-                        if ('nofollow' in rel
-                                and ' nofollow ' in (' %s ' % rel)):
-                            continue
-                        rel = '%s nofollow' % rel
-                    else:
-                        rel = 'nofollow'
-                    el.set('rel', rel)
-
-    def allow_follow(self, anchor):
-        """
-        Override to suppress rel="nofollow" on some anchors.
-        """
-        return False
-
-    def allow_element(self, el):
-        """
-        Decide whether an element is configured to be accepted or rejected.
-
-        :param el: an element.
-        :return: true to accept the element or false to reject/discard it.
-        """
-        if el.tag not in self._tag_link_attrs:
-            return False
-        attr = self._tag_link_attrs[el.tag]
-        if isinstance(attr, (list, tuple)):
-            for one_attr in attr:
-                url = el.get(one_attr)
-                if not url:
-                    return False
-                if not self.allow_embedded_url(el, url):
-                    return False
-            return True
-        else:
-            url = el.get(attr)
-            if not url:
-                return False
-            return self.allow_embedded_url(el, url)
-
-    def allow_embedded_url(self, el, url):
-        """
-        Decide whether a URL that was found in an element's attributes or text
-        if configured to be accepted or rejected.
-
-        :param el: an element.
-        :param url: a URL found on the element.
-        :return: true to accept the URL and false to reject it.
-        """
-        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
-            return False
-        scheme, netloc, path, query, fragment = urlsplit(url)
-        netloc = netloc.lower().split(':', 1)[0]
-        if scheme not in ('http', 'https'):
-            return False
-        if netloc in self.host_whitelist:
-            return True
-        return False
-
-    def kill_conditional_comments(self, doc):
-        """
-        IE conditional comments basically embed HTML that the parser
-        doesn't normally see.  We can't allow anything like that, so
-        we'll kill any comments that could be conditional.
-        """
-        bad = []
-        self._kill_elements(
-            doc, lambda el: _conditional_comment_re.search(el.text),
-            etree.Comment)                
-
-    def _kill_elements(self, doc, condition, iterate=None):
-        bad = []
-        for el in doc.iter(iterate):
-            if condition(el):
-                bad.append(el)
-        for el in bad:
-            el.drop_tree()
-
-    def _remove_javascript_link(self, link):
-        # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', unquote_plus(link))
-        if _is_javascript_scheme(new):
-            # FIXME: should this be None to delete?
-            return ''
-        return link
-
-    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
-
-    def _has_sneaky_javascript(self, style):
-        """
-        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
-        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
-        checks for attempt to do stuff like this.
-
-        Typically the response will be to kill the entire style; if you
-        have just a bit of Javascript in the style another rule will catch
-        that and remove only the Javascript from the style; this catches
-        more sneaky attempts.
-        """
-        style = self._substitute_comments('', style)
-        style = style.replace('\\', '')
-        style = _substitute_whitespace('', style)
-        style = style.lower()
-        if 'javascript:' in style:
-            return True
-        if 'expression(' in style:
-            return True
-        return False
-
-    def clean_html(self, html):
-        result_type = type(html)
-        if isinstance(html, basestring):
-            doc = fromstring(html)
-        else:
-            doc = copy.deepcopy(html)
-        self(doc)
-        return _transform_result(result_type, doc)
-
-clean = Cleaner()
-clean_html = clean.clean_html
-
-############################################################
-## Autolinking
-############################################################
-
-_link_regexes = [
-    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
-    # This is conservative, but autolinking can be a bit conservative:
-    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
-    ]
-
-_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
-
-_avoid_hosts = [
-    re.compile(r'^localhost', re.I),
-    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
-    re.compile(r'^127\.0\.0\.1$'),
-    ]
-
-_avoid_classes = ['nolink']
-
-def autolink(el, link_regexes=_link_regexes,
-             avoid_elements=_avoid_elements,
-             avoid_hosts=_avoid_hosts,
-             avoid_classes=_avoid_classes):
-    """
-    Turn any URLs into links.
-
-    It will search for links identified by the given regular
-    expressions (by default mailto and http(s) links).
-
-    It won't link text in an element in avoid_elements, or an element
-    with a class in avoid_classes.  It won't link to anything with a
-    host that matches one of the regular expressions in avoid_hosts
-    (default localhost and 127.0.0.1).
-
-    If you pass in an element, the element's tail will not be
-    substituted, only the contents of the element.
-    """
-    if el.tag in avoid_elements:
-        return
-    class_name = el.get('class')
-    if class_name:
-        class_name = class_name.split()
-        for match_class in avoid_classes:
-            if match_class in class_name:
-                return
-    for child in list(el):
-        autolink(child, link_regexes=link_regexes,
-                 avoid_elements=avoid_elements,
-                 avoid_hosts=avoid_hosts,
-                 avoid_classes=avoid_classes)
-        if child.tail:
-            text, tail_children = _link_text(
-                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
-            if tail_children:
-                child.tail = text
-                index = el.index(child)
-                el[index+1:index+1] = tail_children
-    if el.text:
-        text, pre_children = _link_text(
-            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
-        if pre_children:
-            el.text = text
-            el[:0] = pre_children
-
-def _link_text(text, link_regexes, avoid_hosts, factory):
-    leading_text = ''
-    links = []
-    last_pos = 0
-    while 1:
-        best_match, best_pos = None, None
-        for regex in link_regexes:
-            regex_pos = last_pos
-            while 1:
-                match = regex.search(text, pos=regex_pos)
-                if match is None:
-                    break
-                host = match.group('host')
-                for host_regex in avoid_hosts:
-                    if host_regex.search(host):
-                        regex_pos = match.end()
-                        break
-                else:
-                    break
-            if match is None:
-                continue
-            if best_pos is None or match.start() < best_pos:
-                best_match = match
-                best_pos = match.start()
-        if best_match is None:
-            # No more matches
-            if links:
-                assert not links[-1].tail
-                links[-1].tail = text
-            else:
-                assert not leading_text
-                leading_text = text
-            break
-        link = best_match.group(0)
-        end = best_match.end()
-        if link.endswith('.') or link.endswith(','):
-            # These punctuation marks shouldn't end a link
-            end -= 1
-            link = link[:-1]
-        prev_text = text[:best_match.start()]
-        if links:
-            assert not links[-1].tail
-            links[-1].tail = prev_text
-        else:
-            assert not leading_text
-            leading_text = prev_text
-        anchor = factory('a')
-        anchor.set('href', link)
-        body = best_match.group('body')
-        if not body:
-            body = link
-        if body.endswith('.') or body.endswith(','):
-            body = body[:-1]
-        anchor.text = body
-        links.append(anchor)
-        text = text[end:]
-    return leading_text, links
-                
-def autolink_html(html, *args, **kw):
-    result_type = type(html)
-    if isinstance(html, basestring):
-        doc = fromstring(html)
-    else:
-        doc = copy.deepcopy(html)
-    autolink(doc, *args, **kw)
-    return _transform_result(result_type, doc)
-
-autolink_html.__doc__ = autolink.__doc__
-
-############################################################
-## Word wrapping
-############################################################
-
-_avoid_word_break_elements = ['pre', 'textarea', 'code']
-_avoid_word_break_classes = ['nobreak']
-
-def word_break(el, max_width=40,
-               avoid_elements=_avoid_word_break_elements,
-               avoid_classes=_avoid_word_break_classes,
-               break_character=unichr(0x200b)):
-    """
-    Breaks any long words found in the body of the text (not attributes).
-
-    Doesn't effect any of the tags in avoid_elements, by default
-    ``<textarea>`` and ``<pre>``
-
-    Breaks words by inserting &#8203;, which is a unicode character
-    for Zero Width Space character.  This generally takes up no space
-    in rendering, but does copy as a space, and in monospace contexts
-    usually takes up space.
-
-    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
-    """
-    # Character suggestion of &#8203 comes from:
-    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
-    if el.tag in _avoid_word_break_elements:
-        return
-    class_name = el.get('class')
-    if class_name:
-        dont_break = False
-        class_name = class_name.split()
-        for avoid in avoid_classes:
-            if avoid in class_name:
-                dont_break = True
-                break
-        if dont_break:
-            return
-    if el.text:
-        el.text = _break_text(el.text, max_width, break_character)
-    for child in el:
-        word_break(child, max_width=max_width,
-                   avoid_elements=avoid_elements,
-                   avoid_classes=avoid_classes,
-                   break_character=break_character)
-        if child.tail:
-            child.tail = _break_text(child.tail, max_width, break_character)
-
-def word_break_html(html, *args, **kw):
-    result_type = type(html)
-    doc = fromstring(html)
-    word_break(doc, *args, **kw)
-    return _transform_result(result_type, doc)
-
-def _break_text(text, max_width, break_character):
-    words = text.split()
-    for word in words:
-        if len(word) > max_width:
-            replacement = _insert_break(word, max_width, break_character)
-            text = text.replace(word, replacement)
-    return text
-
-_break_prefer_re = re.compile(r'[^a-z]', re.I)
-
-def _insert_break(word, width, break_character):
-    orig_word = word
-    result = ''
-    while len(word) > width:
-        start = word[:width]
-        breaks = list(_break_prefer_re.finditer(start))
-        if breaks:
-            last_break = breaks[-1]
-            # Only walk back up to 10 characters to find a nice break:
-            if last_break.end() > width-10:
-                # FIXME: should the break character be at the end of the
-                # chunk, or the beginning of the next chunk?
-                start = word[:last_break.end()]
-        result += start + break_character
-        word = word[len(start):]
-    result += word
-    return result
-