Mercurial > repos > shellac > guppy_basecaller
annotate env/lib/python3.7/site-packages/lxml/html/clean.py @ 3:758bc20232e8 draft
"planemo upload commit 2a0fe2cc28b09e101d37293e53e82f61762262ec"
| author | shellac | 
|---|---|
| date | Thu, 14 May 2020 16:20:52 -0400 | 
| parents | 26e78fe6e8c4 | 
| children | 
| rev | line source | 
|---|---|
| 0 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 1 # cython: language_level=2 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 2 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 3 """A cleanup tool for HTML. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 4 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 5 Removes unwanted tags and content. See the `Cleaner` class for | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 6 details. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 7 """ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 8 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 9 from __future__ import absolute_import | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 10 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 11 import re | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 12 import copy | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 13 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 14 from urlparse import urlsplit | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 15 from urllib import unquote_plus | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 16 except ImportError: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 17 # Python 3 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 18 from urllib.parse import urlsplit, unquote_plus | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 19 from lxml import etree | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 20 from lxml.html import defs | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 21 from lxml.html import fromstring, XHTML_NAMESPACE | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 22 from lxml.html import xhtml_to_html, _transform_result | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 23 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 24 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 25 unichr | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 26 except NameError: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 27 # Python 3 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 28 unichr = chr | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 29 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 30 unicode | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 31 except NameError: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 32 # Python 3 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 33 unicode = str | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 34 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 35 basestring | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 36 except NameError: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 37 basestring = (str, bytes) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 38 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 39 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 41 'word_break', 'word_break_html'] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 42 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 43 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 44 # Particularly the CSS cleaning; most of the tag cleaning is integrated now | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 45 # I have multiple kinds of schemes searched; but should schemes be | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 46 # whitelisted instead? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 47 # max height? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 48 # remove images? Also in CSS? background attribute? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 49 # Some way to whitelist object, iframe, etc (e.g., if you want to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 50 # allow *just* embedded YouTube movies) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 51 # Log what was deleted and why? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 52 # style="behavior: ..." might be bad in IE? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 53 # Should we have something for just <meta http-equiv>? That's the worst of the | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 54 # metas. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 55 # UTF-7 detections? Example: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 56 # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 57 # you don't always have to have the charset set, if the page has no charset | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 58 # and there's UTF7-like code in it. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 59 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 60 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 61 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 62 # This is an IE-specific construct you can have in a stylesheet to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 63 # run some Javascript: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 64 _css_javascript_re = re.compile( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 65 r'expression\s*\(.*?\)', re.S|re.I) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 66 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 67 # Do I have to worry about @\nimport? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 68 _css_import_re = re.compile( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 69 r'@\s*import', re.I) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 70 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 71 # All kinds of schemes besides just javascript: that can cause | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 72 # execution: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 73 _is_image_dataurl = re.compile( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 74 r'^data:image/.+;base64', re.I).search | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 75 _is_possibly_malicious_scheme = re.compile( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 76 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 77 re.I).search | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 78 def _is_javascript_scheme(s): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 79 if _is_image_dataurl(s): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 80 return None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 81 return _is_possibly_malicious_scheme(s) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 82 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 84 # FIXME: should data: be blocked? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 85 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 86 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 87 _conditional_comment_re = re.compile( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 88 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 89 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 90 _find_styled_elements = etree.XPath( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 91 "descendant-or-self::*[@style]") | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 92 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 93 _find_external_links = etree.XPath( | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 94 ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 95 "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 96 namespaces={'x':XHTML_NAMESPACE}) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 97 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 98 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 99 class Cleaner(object): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 100 """ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 101 Instances clean the document of each of the possible offending | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 102 elements. The cleaning is controlled by attributes; you can | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 103 override attributes in a subclass, or set them in the constructor. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 104 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 105 ``scripts``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 106 Removes any ``<script>`` tags. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 107 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 108 ``javascript``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 109 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 110 as they could contain Javascript. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 111 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 112 ``comments``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 113 Removes any comments. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 114 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 115 ``style``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 116 Removes any style tags. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 117 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 118 ``inline_style``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 119 Removes any style attributes. Defaults to the value of the ``style`` option. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 120 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 121 ``links``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 122 Removes any ``<link>`` tags | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 123 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 124 ``meta``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 125 Removes any ``<meta>`` tags | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 126 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 127 ``page_structure``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 128 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 129 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 130 ``processing_instructions``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 131 Removes any processing instructions. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 132 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 133 ``embedded``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 134 Removes any embedded objects (flash, iframes) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 135 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 136 ``frames``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 137 Removes any frame-related tags | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 138 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 139 ``forms``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 140 Removes any form tags | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 141 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 142 ``annoying_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 143 Tags that aren't *wrong*, but are annoying: ``<blink>`` and ``<marquee>``. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 144 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 145 ``remove_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 146 A list of tags to remove. Only the tags will be removed, | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 147 their content will get pulled up into the parent tag. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 148 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 149 ``kill_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 150 A list of tags to kill. Killing also removes the tag's content, | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 151 i.e. the whole subtree, not just the tag itself. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 152 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 153 ``allow_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 154 A list of tags to include (default include all). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 155 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 156 ``remove_unknown_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 157 Remove any tags that aren't standard parts of HTML. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 158 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 159 ``safe_attrs_only``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 160 If true, only include 'safe' attributes (specifically the list | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 161 from the feedparser HTML sanitisation web site). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 162 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 163 ``safe_attrs``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 164 A set of attribute names to override the default list of attributes | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 165 considered 'safe' (when safe_attrs_only=True). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 166 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 167 ``add_nofollow``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 168 If true, then any ``<a>`` tags will have ``rel="nofollow"`` added to them. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 169 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 170 ``host_whitelist``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 171 A list or set of hosts that you can use for embedded content | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 172 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 173 You can also implement/override the method | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 174 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 175 implement more complex rules for what can be embedded. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 176 Anything that passes this test will be shown, regardless of | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 177 the value of (for instance) ``embedded``. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 178 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 179 Note that this parameter might not work as intended if you do not | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 180 make the links absolute before doing the cleaning. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 181 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 182 Note that you may also need to set ``whitelist_tags``. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 183 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 184 ``whitelist_tags``: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 185 A set of tags that can be included with ``host_whitelist``. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 186 The default is ``iframe`` and ``embed``; you may wish to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 187 include other tags like ``script``, or you may want to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 188 implement ``allow_embedded_url`` for more control. Set to None to | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 189 include all tags. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 190 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 191 This modifies the document *in place*. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 192 """ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 193 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 194 scripts = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 195 javascript = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 196 comments = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 197 style = False | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 198 inline_style = None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 199 links = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 200 meta = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 201 page_structure = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 202 processing_instructions = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 203 embedded = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 204 frames = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 205 forms = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 206 annoying_tags = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 207 remove_tags = None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 208 allow_tags = None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 209 kill_tags = None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 210 remove_unknown_tags = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 211 safe_attrs_only = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 212 safe_attrs = defs.safe_attrs | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 213 add_nofollow = False | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 214 host_whitelist = () | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 215 whitelist_tags = {'iframe', 'embed'} | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 216 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def __init__(self, **kw):
    """
    Override the cleaning options with the given keyword arguments.

    Each keyword must name an attribute that already exists on the
    object; anything else raises ``TypeError``.  When ``inline_style``
    was not passed and is still ``None``, it takes the value of
    ``style``.
    """
    for option, setting in kw.items():
        if hasattr(self, option):
            setattr(self, option, setting)
            continue
        raise TypeError(
            "Unknown parameter: %s=%r" % (option, setting))
    inline_style_given = 'inline_style' in kw
    if self.inline_style is None and not inline_style_given:
        # No explicit choice was made: follow the ``style`` option.
        self.inline_style = self.style
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 225 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
# Lookup table giving, for a tag that is a candidate for removal, the
# attribute (or list of attributes) that holds its primary URL:
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # As far as can be told, either of these attributes may hold a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there does not seem to be a general way to work out which
    # links an <object> tag uses; they often live in <param> tags whose
    # values we do not know.  It would take knowledge about specific
    # kinds of plugins (probably keyed off classid) to match them.
    ## 'object': ?,
    # FIXME: the form action is not looked at currently, because it is
    # more complex than that -- if you keep the form, you should keep
    # the form controls.
    ## 'form': 'action',
    'a': 'href',
}
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 247 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element or an object with a ``getroot()`` method
    (an ElementTree), in which case its root element is cleaned.  Which
    parts are stripped is controlled by the option attributes of this
    object (``scripts``, ``javascript``, ``style``, ``links`` ...).

    Tags collected in ``kill_tags`` are dropped together with their
    content; tags collected in ``remove_tags`` are dropped but their
    children and text are pulled up into the parent.  If the root
    element itself would have to go, it is rewritten to (or cleared as)
    a ``<div>`` instead, because the root cannot be dropped.

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  Note that this check runs after
    the earlier cleaning steps have already modified the document.

    Nothing is returned.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # Work on copies so the configured options are never mutated.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow.
                # Chain on ``new`` (not ``old``) so the JavaScript
                # stripping done on the previous line is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>
        # or <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # Same root handling as above: rewrite instead of drop.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Cheap substring test first, then a whole-word
                    # match so e.g. "xnofollow" does not count.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 427 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def allow_follow(self, anchor):
    """
    Hook for exempting individual anchors from ``rel="nofollow"``.

    When ``add_nofollow`` is enabled, the cleaner calls this for each
    external link it finds and only adds ``nofollow`` when the result
    is false.  The base implementation exempts nothing.

    :param anchor: the link element under consideration.
    :return: true to leave the anchor alone; this default always
        answers ``False``.
    """
    return False
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 433 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def allow_element(self, el):
    """
    Tell whether an embedding element passes the configured URL checks.

    The element's tag is looked up in ``self._tag_link_attrs`` to find
    the attribute (or list/tuple of attributes) that carries its URL.
    Every such attribute must be present, non-empty and approved by
    ``allow_embedded_url``.

    :param el: an element.
    :return: true to accept the element or false to reject/discard it.
    """
    link_attrs = self._tag_link_attrs
    tag = el.tag
    if tag not in link_attrs:
        return False

    spec = link_attrs[tag]
    if not isinstance(spec, (list, tuple)):
        # Single attribute name: the verdict is whatever the URL check says.
        url = el.get(spec)
        return self.allow_embedded_url(el, url) if url else False

    # Several attribute names: all of them have to pass.
    for name in spec:
        url = el.get(name)
        if not url or not self.allow_embedded_url(el, url):
            return False
    return True
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 457 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def allow_embedded_url(self, el, url):
    """
    Decide whether a URL that was found in an element's attributes or text
    is configured to be accepted or rejected.

    A URL is accepted only when the element's tag is permitted by
    ``whitelist_tags`` (``None`` means no tag restriction), the scheme is
    ``http`` or ``https``, and the URL's host is in ``host_whitelist``.

    :param el: an element.
    :param url: a URL found on the element.
    :return: true to accept the URL and false to reject it.
    """
    if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
        return False
    try:
        parts = urlsplit(url)
        # ``hostname`` drops any ``user:password@`` prefix and the port and
        # lower-cases the result.  Splitting the raw netloc on ':' would
        # treat ``allowed.host:pw@evil.example`` as ``allowed.host`` and
        # let a hostile URL through the whitelist.
        host = parts.hostname
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): reject.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    return host in self.host_whitelist
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 476 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    :param doc: the tree to clean; matching comments are dropped from
        it in place via ``_kill_elements``.
    """
    # Only comment nodes are visited (``etree.Comment`` is the iteration
    # filter); one is removed when its text matches the
    # conditional-comment pattern.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 487 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 488 def _kill_elements(self, doc, condition, iterate=None): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 489 bad = [] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 490 for el in doc.iter(iterate): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 491 if condition(el): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 492 bad.append(el) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 493 for el in bad: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 494 el.drop_tree() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 495 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def _remove_javascript_link(self, link):
    """
    Blank out a link whose scheme turns out to be Javascript.

    :param link: the raw link value.
    :return: ``''`` when the normalized link has a Javascript scheme,
        otherwise ``link`` unchanged.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE, so
    # the scheme test runs on the URL-unquoted link with whitespace
    # stripped out.
    squeezed = _substitute_whitespace('', unquote_plus(link))
    # FIXME: should this be None to delete?
    return '' if _is_javascript_scheme(squeezed) else link
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 503 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 504 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 505 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def _has_sneaky_javascript(self, style):
    """
    Detect obfuscated script in CSS text.

    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  To see
    through that, the style is normalized first: ``/* */`` comments,
    backslashes and whitespace are removed and the rest is lower-cased.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    :param style: CSS text to inspect.
    :return: ``True`` if the normalized text contains ``javascript:``
        or ``expression(``, else ``False``.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 526 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def clean_html(self, html):
    """
    Clean ``html`` without modifying the caller's object.

    A string is parsed with ``fromstring``; anything else is
    deep-copied.  The cleaner is then applied to that private document
    and the result is passed through ``_transform_result`` together
    with the type of the original input.

    :param html: markup string or an already-parsed document.
    :return: whatever ``_transform_result`` builds for the input type.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(doc)
    return _transform_result(result_type, doc)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 535 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
# Module-level cleaner instance created with no arguments.
clean = Cleaner()
# Module-level shortcut bound to that instance's ``clean_html`` method.
clean_html = clean.clean_html
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 538 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 539 ############################################################ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 540 ## Autolinking | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 541 ############################################################ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 542 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 543 _link_regexes = [ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 544 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 545 # This is conservative, but autolinking can be a bit conservative: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 546 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 547 ] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 548 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 549 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 550 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 551 _avoid_hosts = [ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 552 re.compile(r'^localhost', re.I), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 553 re.compile(r'\bexample\.(?:com|org|net)$', re.I), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 554 re.compile(r'^127\.0\.0\.1$'), | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 555 ] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 556 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 557 _avoid_classes = ['nolink'] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 558 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text is searched with the given regular expressions (by default
    mailto and http(s) links) and the matches are replaced by link
    elements returned from ``_link_text``.

    Nothing is linked inside an element whose tag is in
    ``avoid_elements`` or that carries a class from ``avoid_classes``.
    Hosts matching a pattern in ``avoid_hosts`` are left as plain text
    (by default localhost, 127.0.0.1 and the example.com/org/net
    domains).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    classes = el.get('class')
    if classes:
        classes = classes.split()
        if any(skipped in classes for skipped in avoid_classes):
            return

    # Iterate over a snapshot because new link elements get spliced into
    # ``el`` while we go.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        tail = child.tail
        if not tail:
            continue
        text, tail_links = _link_text(
            tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if tail_links:
            child.tail = text
            # The new links go right after the child whose tail held them.
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = tail_links

    # The element's own leading text is handled last; links found there
    # become its first children.
    if el.text:
        text, head_links = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if head_links:
            el.text = text
            el[:0] = head_links
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 603 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 604 def _link_text(text, link_regexes, avoid_hosts, factory): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 605 leading_text = '' | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 606 links = [] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 607 last_pos = 0 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 608 while 1: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 609 best_match, best_pos = None, None | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 610 for regex in link_regexes: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 611 regex_pos = last_pos | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 612 while 1: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 613 match = regex.search(text, pos=regex_pos) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 614 if match is None: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 615 break | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 616 host = match.group('host') | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 617 for host_regex in avoid_hosts: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 618 if host_regex.search(host): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 619 regex_pos = match.end() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 620 break | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 621 else: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 622 break | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 623 if match is None: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 624 continue | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 625 if best_pos is None or match.start() < best_pos: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 626 best_match = match | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 627 best_pos = match.start() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 628 if best_match is None: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 629 # No more matches | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 630 if links: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 631 assert not links[-1].tail | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 632 links[-1].tail = text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 633 else: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 634 assert not leading_text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 635 leading_text = text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 636 break | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 637 link = best_match.group(0) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 638 end = best_match.end() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 639 if link.endswith('.') or link.endswith(','): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 640 # These punctuation marks shouldn't end a link | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 641 end -= 1 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 642 link = link[:-1] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 643 prev_text = text[:best_match.start()] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 644 if links: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 645 assert not links[-1].tail | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 646 links[-1].tail = prev_text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 647 else: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 648 assert not leading_text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 649 leading_text = prev_text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 650 anchor = factory('a') | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 651 anchor.set('href', link) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 652 body = best_match.group('body') | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 653 if not body: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 654 body = link | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 655 if body.endswith('.') or body.endswith(','): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 656 body = body[:-1] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 657 anchor.text = body | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 658 links.append(anchor) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 659 text = text[end:] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 660 return leading_text, links | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 661 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def autolink_html(html, *args, **kw):
    # No docstring here on purpose: it is copied over from autolink()
    # right after the definition.
    #
    # Markup strings are parsed into a fresh tree; any other input is
    # deep-copied, so autolink() below only ever touches our own copy.
    input_type = type(html)
    if not isinstance(html, basestring):
        tree = copy.deepcopy(html)
    else:
        tree = fromstring(html)
    autolink(tree, *args, **kw)
    # _transform_result() gets the original input type, presumably so the
    # caller receives the same kind of object that was passed in.
    return _transform_result(input_type, tree)


# The wrapper forwards all extra arguments to autolink(), so it reuses
# that function's documentation.
autolink_html.__doc__ = autolink.__doc__
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 672 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 673 ############################################################ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 674 ## Word wrapping | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 675 ############################################################ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 676 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
# Tags that word_break() leaves untouched by default.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# Class names that exempt an element from word_break() by default.
_avoid_word_break_classes = ['nobreak']


def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    The element is modified in place and nothing is returned.

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``, nor any element that
    carries one of the classes in avoid_classes (by default
    ``nobreak``).  The children of a skipped element are skipped too.

    Words longer than max_width characters are broken by inserting
    break_character.  The default is &#8203;, which is a unicode
    character for Zero Width Space character.  This generally takes up
    no space in rendering, but does copy as a space, and in monospace
    contexts usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203; comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Check the caller-supplied list, not the module-level default, so
    # that a custom avoid_elements is actually honoured.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail follows the child's closing tag, so it belongs to this
        # element's text flow and is broken even if the child was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 720 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def word_break_html(html, *args, **kw):
    """
    Like ``word_break()``, but works on a copy and returns the result.

    ``html`` may be a markup string, which is parsed with ``fromstring()``,
    or any other object, which is deep-copied before being processed so
    that the caller's object is not modified.  All further arguments are
    passed on to ``word_break()``.  The return value is built by
    ``_transform_result()`` from the type of the original input.
    """
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        # Same input handling as autolink_html(): accept an already
        # parsed tree instead of unconditionally re-parsing.
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 726 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 727 def _break_text(text, max_width, break_character): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 728 words = text.split() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 729 for word in words: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 730 if len(word) > max_width: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 731 replacement = _insert_break(word, max_width, break_character) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 732 text = text.replace(word, replacement) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 733 return text | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 734 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 735 _break_prefer_re = re.compile(r'[^a-z]', re.I) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 736 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 737 def _insert_break(word, width, break_character): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 738 orig_word = word | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 739 result = '' | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 740 while len(word) > width: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 741 start = word[:width] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 742 breaks = list(_break_prefer_re.finditer(start)) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 743 if breaks: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 744 last_break = breaks[-1] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 745 # Only walk back up to 10 characters to find a nice break: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 746 if last_break.end() > width-10: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 747 # FIXME: should the break character be at the end of the | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 748 # chunk, or the beginning of the next chunk? | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 749 start = word[:last_break.end()] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 750 result += start + break_character | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 751 word = word[len(start):] | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 752 result += word | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 753 return result | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 754 | 
