annotate planemo/lib/python3.7/site-packages/lxml/html/clean.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
1 # cython: language_level=3str
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
2
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
3 """A cleanup tool for HTML.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
4
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
5 Removes unwanted tags and content. See the `Cleaner` class for
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
6 details.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
7 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
8
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
9 from __future__ import absolute_import
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
10
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
11 import re
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
12 import copy
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
13 try:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
14 from urlparse import urlsplit
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
15 from urllib import unquote_plus
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
16 except ImportError:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
17 # Python 3
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
18 from urllib.parse import urlsplit, unquote_plus
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
19 from lxml import etree
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
20 from lxml.html import defs
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
21 from lxml.html import fromstring, XHTML_NAMESPACE
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
22 from lxml.html import xhtml_to_html, _transform_result
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
23
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
24 try:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
25 unichr
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
26 except NameError:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
27 # Python 3
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
28 unichr = chr
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
29 try:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
30 unicode
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
31 except NameError:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
32 # Python 3
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
33 unicode = str
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
34 try:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
35 basestring
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
36 except NameError:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
37 basestring = (str, bytes)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
38
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
39
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
41 'word_break', 'word_break_html']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
42
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
43 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
44 # Particularly the CSS cleaning; most of the tag cleaning is integrated now
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
45 # I have multiple kinds of schemes searched; but should schemes be
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
46 # whitelisted instead?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
47 # max height?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
48 # remove images? Also in CSS? background attribute?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
49 # Some way to whitelist object, iframe, etc (e.g., if you want to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
50 # allow *just* embedded YouTube movies)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
51 # Log what was deleted and why?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
52 # style="behavior: ..." might be bad in IE?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
53 # Should we have something for just <meta http-equiv>? That's the worst of the
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
54 # metas.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
55 # UTF-7 detections? Example:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
56 # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
57 # you don't always have to have the charset set, if the page has no charset
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
58 # and there's UTF7-like code in it.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
59 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
60
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
61
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
62 # This is an IE-specific construct you can have in a stylesheet to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
63 # run some Javascript:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
64 _css_javascript_re = re.compile(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
65 r'expression\s*\(.*?\)', re.S|re.I)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
66
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
67 # Do I have to worry about @\nimport?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
68 _css_import_re = re.compile(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
69 r'@\s*import', re.I)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
70
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
71 # All kinds of schemes besides just javascript: that can cause
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
72 # execution:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
73 _is_image_dataurl = re.compile(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
74 r'^data:image/.+;base64', re.I).search
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
75 _is_possibly_malicious_scheme = re.compile(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
76 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
77 re.I).search
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
78 def _is_javascript_scheme(s):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
79 if _is_image_dataurl(s):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
80 return None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
81 return _is_possibly_malicious_scheme(s)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
82
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
84 # FIXME: should data: be blocked?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
85
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
86 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
87 _conditional_comment_re = re.compile(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
88 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
89
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
90 _find_styled_elements = etree.XPath(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
91 "descendant-or-self::*[@style]")
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
92
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
93 _find_external_links = etree.XPath(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
94 ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
95 "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
96 namespaces={'x':XHTML_NAMESPACE})
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
97
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
98
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
99 class Cleaner(object):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
100 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
101 Instances cleans the document of each of the possible offending
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
102 elements. The cleaning is controlled by attributes; you can
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
103 override attributes in a subclass, or set them in the constructor.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
104
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
105 ``scripts``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
106 Removes any ``<script>`` tags.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
107
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
108 ``javascript``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
109 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
110 as they could contain Javascript.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
111
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
112 ``comments``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
113 Removes any comments.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
114
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
115 ``style``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
116 Removes any style tags.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
117
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
118 ``inline_style``
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
119 Removes any style attributes. Defaults to the value of the ``style`` option.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
120
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
121 ``links``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
122 Removes any ``<link>`` tags
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
123
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
124 ``meta``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
125 Removes any ``<meta>`` tags
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
126
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
127 ``page_structure``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
128 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
129
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
130 ``processing_instructions``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
131 Removes any processing instructions.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
132
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
133 ``embedded``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
134 Removes any embedded objects (flash, iframes)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
135
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
136 ``frames``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
137 Removes any frame-related tags
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
138
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
139 ``forms``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
140 Removes any form tags
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
141
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
142 ``annoying_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
143 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
144
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
145 ``remove_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
146 A list of tags to remove. Only the tags will be removed,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
147 their content will get pulled up into the parent tag.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
148
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
149 ``kill_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
150 A list of tags to kill. Killing also removes the tag's content,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
151 i.e. the whole subtree, not just the tag itself.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
152
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
153 ``allow_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
154 A list of tags to include (default include all).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
155
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
156 ``remove_unknown_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
157 Remove any tags that aren't standard parts of HTML.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
158
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
159 ``safe_attrs_only``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
160 If true, only include 'safe' attributes (specifically the list
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
161 from the feedparser HTML sanitisation web site).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
162
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
163 ``safe_attrs``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
164 A set of attribute names to override the default list of attributes
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
165 considered 'safe' (when safe_attrs_only=True).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
166
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
167 ``add_nofollow``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
168 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
169
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
170 ``host_whitelist``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
171 A list or set of hosts that you can use for embedded content
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
172 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
173 You can also implement/override the method
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
174 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
175 implement more complex rules for what can be embedded.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
176 Anything that passes this test will be shown, regardless of
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
177 the value of (for instance) ``embedded``.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
178
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
179 Note that this parameter might not work as intended if you do not
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
180 make the links absolute before doing the cleaning.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
181
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
182 Note that you may also need to set ``whitelist_tags``.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
183
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
184 ``whitelist_tags``:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
185 A set of tags that can be included with ``host_whitelist``.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
186 The default is ``iframe`` and ``embed``; you may wish to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
187 include other tags like ``script``, or you may want to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
188 implement ``allow_embedded_url`` for more control. Set to None to
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
189 include all tags.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
190
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
191 This modifies the document *in place*.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
192 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
193
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
194 scripts = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
195 javascript = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
196 comments = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
197 style = False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
198 inline_style = None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
199 links = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
200 meta = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
201 page_structure = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
202 processing_instructions = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
203 embedded = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
204 frames = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
205 forms = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
206 annoying_tags = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
207 remove_tags = None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
208 allow_tags = None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
209 kill_tags = None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
210 remove_unknown_tags = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
211 safe_attrs_only = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
212 safe_attrs = defs.safe_attrs
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
213 add_nofollow = False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
214 host_whitelist = ()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
215 whitelist_tags = {'iframe', 'embed'}
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
216
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
217 def __init__(self, **kw):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
218 not_an_attribute = object()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
219 for name, value in kw.items():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
220 default = getattr(self, name, not_an_attribute)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
221 if (default is not None and default is not True and default is not False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
222 and not isinstance(default, (frozenset, set, tuple, list))):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
223 raise TypeError(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
224 "Unknown parameter: %s=%r" % (name, value))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
225 setattr(self, name, value)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
226 if self.inline_style is None and 'inline_style' not in kw:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
227 self.inline_style = self.style
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
228
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
229 if kw.get("allow_tags"):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
230 if kw.get("remove_unknown_tags"):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
231 raise ValueError("It does not make sense to pass in both "
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
232 "allow_tags and remove_unknown_tags")
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
233 self.remove_unknown_tags = False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
234
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
235 # Used to lookup the primary URL for a given tag that is up for
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
236 # removal:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
237 _tag_link_attrs = dict(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
238 script='src',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
239 link='href',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
240 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
241 # From what I can tell, both attributes can contain a link:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
242 applet=['code', 'object'],
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
243 iframe='src',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
244 embed='src',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
245 layer='src',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
246 # FIXME: there doesn't really seem like a general way to figure out what
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
247 # links an <object> tag uses; links often go in <param> tags with values
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
248 # that we don't really know. You'd have to have knowledge about specific
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
249 # kinds of plugins (probably keyed off classid), and match against those.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
250 ##object=?,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
251 # FIXME: not looking at the action currently, because it is more complex
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
252 # than than -- if you keep the form, you should keep the form controls.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
253 ##form='action',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
254 a='href',
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
255 )
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
256
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
257 def __call__(self, doc):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
258 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
259 Cleans the document.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
260 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
261 try:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
262 getroot = doc.getroot
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
263 except AttributeError:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
264 pass # Element instance
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
265 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
266 doc = getroot() # ElementTree instance, instead of an element
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
267 # convert XHTML to HTML
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
268 xhtml_to_html(doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
269 # Normalize a case that IE treats <image> like <img>, and that
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
270 # can confuse either this step or later steps.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
271 for el in doc.iter('image'):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
272 el.tag = 'img'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
273 if not self.comments:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
274 # Of course, if we were going to kill comments anyway, we don't
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
275 # need to worry about this
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
276 self.kill_conditional_comments(doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
277
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
278 kill_tags = set(self.kill_tags or ())
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
279 remove_tags = set(self.remove_tags or ())
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
280 allow_tags = set(self.allow_tags or ())
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
281
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
282 if self.scripts:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
283 kill_tags.add('script')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
284 if self.safe_attrs_only:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
285 safe_attrs = set(self.safe_attrs)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
286 for el in doc.iter(etree.Element):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
287 attrib = el.attrib
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
288 for aname in attrib.keys():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
289 if aname not in safe_attrs:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
290 del attrib[aname]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
291 if self.javascript:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
292 if not (self.safe_attrs_only and
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
293 self.safe_attrs == defs.safe_attrs):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
294 # safe_attrs handles events attributes itself
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
295 for el in doc.iter(etree.Element):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
296 attrib = el.attrib
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
297 for aname in attrib.keys():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
298 if aname.startswith('on'):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
299 del attrib[aname]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
300 doc.rewrite_links(self._remove_javascript_link,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
301 resolve_base_href=False)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
302 # If we're deleting style then we don't have to remove JS links
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
303 # from styles, otherwise...
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
304 if not self.inline_style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
305 for el in _find_styled_elements(doc):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
306 old = el.get('style')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
307 new = _css_javascript_re.sub('', old)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
308 new = _css_import_re.sub('', new)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
309 if self._has_sneaky_javascript(new):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
310 # Something tricky is going on...
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
311 del el.attrib['style']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
312 elif new != old:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
313 el.set('style', new)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
314 if not self.style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
315 for el in list(doc.iter('style')):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
316 if el.get('type', '').lower().strip() == 'text/javascript':
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
317 el.drop_tree()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
318 continue
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
319 old = el.text or ''
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
320 new = _css_javascript_re.sub('', old)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
321 # The imported CSS can do anything; we just can't allow:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
322 new = _css_import_re.sub('', old)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
323 if self._has_sneaky_javascript(new):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
324 # Something tricky is going on...
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
325 el.text = '/* deleted */'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
326 elif new != old:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
327 el.text = new
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
328 if self.comments:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
329 kill_tags.add(etree.Comment)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
330 if self.processing_instructions:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
331 kill_tags.add(etree.ProcessingInstruction)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
332 if self.style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
333 kill_tags.add('style')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
334 if self.inline_style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
335 etree.strip_attributes(doc, 'style')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
336 if self.links:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
337 kill_tags.add('link')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
338 elif self.style or self.javascript:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
339 # We must get rid of included stylesheets if Javascript is not
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
340 # allowed, as you can put Javascript in them
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
341 for el in list(doc.iter('link')):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
342 if 'stylesheet' in el.get('rel', '').lower():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
343 # Note this kills alternate stylesheets as well
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
344 if not self.allow_element(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
345 el.drop_tree()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
346 if self.meta:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
347 kill_tags.add('meta')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
348 if self.page_structure:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
349 remove_tags.update(('head', 'html', 'title'))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
350 if self.embedded:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
351 # FIXME: is <layer> really embedded?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
352 # We should get rid of any <param> tags not inside <applet>;
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
353 # These are not really valid anyway.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
354 for el in list(doc.iter('param')):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
355 found_parent = False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
356 parent = el.getparent()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
357 while parent is not None and parent.tag not in ('applet', 'object'):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
358 parent = parent.getparent()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
359 if parent is None:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
360 el.drop_tree()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
361 kill_tags.update(('applet',))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
362 # The alternate contents that are in an iframe are a good fallback:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
363 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
364 if self.frames:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
365 # FIXME: ideally we should look at the frame links, but
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
366 # generally frames don't mix properly with an HTML
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
367 # fragment anyway.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
368 kill_tags.update(defs.frame_tags)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
369 if self.forms:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
370 remove_tags.add('form')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
371 kill_tags.update(('button', 'input', 'select', 'textarea'))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
372 if self.annoying_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
373 remove_tags.update(('blink', 'marquee'))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
374
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
375 _remove = []
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
376 _kill = []
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
377 for el in doc.iter():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
378 if el.tag in kill_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
379 if self.allow_element(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
380 continue
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
381 _kill.append(el)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
382 elif el.tag in remove_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
383 if self.allow_element(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
384 continue
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
385 _remove.append(el)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
386
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
387 if _remove and _remove[0] == doc:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
388 # We have to drop the parent-most tag, which we can't
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
389 # do. Instead we'll rewrite it:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
390 el = _remove.pop(0)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
391 el.tag = 'div'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
392 el.attrib.clear()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
393 elif _kill and _kill[0] == doc:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
394 # We have to drop the parent-most element, which we can't
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
395 # do. Instead we'll clear it:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
396 el = _kill.pop(0)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
397 if el.tag != 'html':
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
398 el.tag = 'div'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
399 el.clear()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
400
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
401 _kill.reverse() # start with innermost tags
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
402 for el in _kill:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
403 el.drop_tree()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
404 for el in _remove:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
405 el.drop_tag()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
406
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
407 if self.remove_unknown_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
408 if allow_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
409 raise ValueError(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
410 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
411 allow_tags = set(defs.tags)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
412 if allow_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
413 # make sure we do not remove comments/PIs if users want them (which is rare enough)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
414 if not self.comments:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
415 allow_tags.add(etree.Comment)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
416 if not self.processing_instructions:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
417 allow_tags.add(etree.ProcessingInstruction)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
418
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
419 bad = []
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
420 for el in doc.iter():
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
421 if el.tag not in allow_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
422 bad.append(el)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
423 if bad:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
424 if bad[0] is doc:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
425 el = bad.pop(0)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
426 el.tag = 'div'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
427 el.attrib.clear()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
428 for el in bad:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
429 el.drop_tag()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
430 if self.add_nofollow:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
431 for el in _find_external_links(doc):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
432 if not self.allow_follow(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
433 rel = el.get('rel')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
434 if rel:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
435 if ('nofollow' in rel
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
436 and ' nofollow ' in (' %s ' % rel)):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
437 continue
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
438 rel = '%s nofollow' % rel
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
439 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
440 rel = 'nofollow'
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
441 el.set('rel', rel)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
442
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
443 def allow_follow(self, anchor):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
444 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
445 Override to suppress rel="nofollow" on some anchors.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
446 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
447 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
448
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
449 def allow_element(self, el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
450 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
451 Decide whether an element is configured to be accepted or rejected.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
452
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
453 :param el: an element.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
454 :return: true to accept the element or false to reject/discard it.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
455 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
456 if el.tag not in self._tag_link_attrs:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
457 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
458 attr = self._tag_link_attrs[el.tag]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
459 if isinstance(attr, (list, tuple)):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
460 for one_attr in attr:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
461 url = el.get(one_attr)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
462 if not url:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
463 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
464 if not self.allow_embedded_url(el, url):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
465 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
466 return True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
467 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
468 url = el.get(attr)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
469 if not url:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
470 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
471 return self.allow_embedded_url(el, url)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
472
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
473 def allow_embedded_url(self, el, url):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
474 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
475 Decide whether a URL that was found in an element's attributes or text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
476 if configured to be accepted or rejected.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
477
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
478 :param el: an element.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
479 :param url: a URL found on the element.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
480 :return: true to accept the URL and false to reject it.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
481 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
482 if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
483 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
484 scheme, netloc, path, query, fragment = urlsplit(url)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
485 netloc = netloc.lower().split(':', 1)[0]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
486 if scheme not in ('http', 'https'):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
487 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
488 if netloc in self.host_whitelist:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
489 return True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
490 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
491
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
492 def kill_conditional_comments(self, doc):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
493 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
494 IE conditional comments basically embed HTML that the parser
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
495 doesn't normally see. We can't allow anything like that, so
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
496 we'll kill any comments that could be conditional.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
497 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
498 has_conditional_comment = _conditional_comment_re.search
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
499 self._kill_elements(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
500 doc, lambda el: has_conditional_comment(el.text),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
501 etree.Comment)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
502
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
503 def _kill_elements(self, doc, condition, iterate=None):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
504 bad = []
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
505 for el in doc.iter(iterate):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
506 if condition(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
507 bad.append(el)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
508 for el in bad:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
509 el.drop_tree()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
510
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
511 def _remove_javascript_link(self, link):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
512 # links like "j a v a s c r i p t:" might be interpreted in IE
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
513 new = _substitute_whitespace('', unquote_plus(link))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
514 if _is_javascript_scheme(new):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
515 # FIXME: should this be None to delete?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
516 return ''
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
517 return link
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
518
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
519 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
520
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
521 def _has_sneaky_javascript(self, style):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
522 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
523 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
524 can get interpreted, or ``expre/* stuff */ssion(...)``. This
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
525 checks for attempt to do stuff like this.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
526
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
527 Typically the response will be to kill the entire style; if you
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
528 have just a bit of Javascript in the style another rule will catch
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
529 that and remove only the Javascript from the style; this catches
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
530 more sneaky attempts.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
531 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
532 style = self._substitute_comments('', style)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
533 style = style.replace('\\', '')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
534 style = _substitute_whitespace('', style)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
535 style = style.lower()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
536 if 'javascript:' in style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
537 return True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
538 if 'expression(' in style:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
539 return True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
540 return False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
541
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
542 def clean_html(self, html):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
543 result_type = type(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
544 if isinstance(html, basestring):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
545 doc = fromstring(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
546 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
547 doc = copy.deepcopy(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
548 self(doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
549 return _transform_result(result_type, doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
550
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
551 clean = Cleaner()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
552 clean_html = clean.clean_html
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
553
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
554 ############################################################
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
555 ## Autolinking
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
556 ############################################################
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
557
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
558 _link_regexes = [
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
559 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
560 # This is conservative, but autolinking can be a bit conservative:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
561 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
562 ]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
563
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
564 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
565
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
566 _avoid_hosts = [
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
567 re.compile(r'^localhost', re.I),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
568 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
569 re.compile(r'^127\.0\.0\.1$'),
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
570 ]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
571
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
572 _avoid_classes = ['nolink']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
573
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
574 def autolink(el, link_regexes=_link_regexes,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
575 avoid_elements=_avoid_elements,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
576 avoid_hosts=_avoid_hosts,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
577 avoid_classes=_avoid_classes):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
578 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
579 Turn any URLs into links.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
580
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
581 It will search for links identified by the given regular
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
582 expressions (by default mailto and http(s) links).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
583
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
584 It won't link text in an element in avoid_elements, or an element
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
585 with a class in avoid_classes. It won't link to anything with a
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
586 host that matches one of the regular expressions in avoid_hosts
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
587 (default localhost and 127.0.0.1).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
588
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
589 If you pass in an element, the element's tail will not be
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
590 substituted, only the contents of the element.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
591 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
592 if el.tag in avoid_elements:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
593 return
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
594 class_name = el.get('class')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
595 if class_name:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
596 class_name = class_name.split()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
597 for match_class in avoid_classes:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
598 if match_class in class_name:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
599 return
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
600 for child in list(el):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
601 autolink(child, link_regexes=link_regexes,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
602 avoid_elements=avoid_elements,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
603 avoid_hosts=avoid_hosts,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
604 avoid_classes=avoid_classes)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
605 if child.tail:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
606 text, tail_children = _link_text(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
607 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
608 if tail_children:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
609 child.tail = text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
610 index = el.index(child)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
611 el[index+1:index+1] = tail_children
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
612 if el.text:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
613 text, pre_children = _link_text(
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
614 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
615 if pre_children:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
616 el.text = text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
617 el[:0] = pre_children
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
618
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
619 def _link_text(text, link_regexes, avoid_hosts, factory):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
620 leading_text = ''
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
621 links = []
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
622 last_pos = 0
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
623 while 1:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
624 best_match, best_pos = None, None
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
625 for regex in link_regexes:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
626 regex_pos = last_pos
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
627 while 1:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
628 match = regex.search(text, pos=regex_pos)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
629 if match is None:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
630 break
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
631 host = match.group('host')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
632 for host_regex in avoid_hosts:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
633 if host_regex.search(host):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
634 regex_pos = match.end()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
635 break
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
636 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
637 break
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
638 if match is None:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
639 continue
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
640 if best_pos is None or match.start() < best_pos:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
641 best_match = match
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
642 best_pos = match.start()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
643 if best_match is None:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
644 # No more matches
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
645 if links:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
646 assert not links[-1].tail
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
647 links[-1].tail = text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
648 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
649 assert not leading_text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
650 leading_text = text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
651 break
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
652 link = best_match.group(0)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
653 end = best_match.end()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
654 if link.endswith('.') or link.endswith(','):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
655 # These punctuation marks shouldn't end a link
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
656 end -= 1
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
657 link = link[:-1]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
658 prev_text = text[:best_match.start()]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
659 if links:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
660 assert not links[-1].tail
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
661 links[-1].tail = prev_text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
662 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
663 assert not leading_text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
664 leading_text = prev_text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
665 anchor = factory('a')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
666 anchor.set('href', link)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
667 body = best_match.group('body')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
668 if not body:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
669 body = link
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
670 if body.endswith('.') or body.endswith(','):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
671 body = body[:-1]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
672 anchor.text = body
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
673 links.append(anchor)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
674 text = text[end:]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
675 return leading_text, links
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
676
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
677 def autolink_html(html, *args, **kw):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
678 result_type = type(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
679 if isinstance(html, basestring):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
680 doc = fromstring(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
681 else:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
682 doc = copy.deepcopy(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
683 autolink(doc, *args, **kw)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
684 return _transform_result(result_type, doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
685
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
686 autolink_html.__doc__ = autolink.__doc__
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
687
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
688 ############################################################
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
689 ## Word wrapping
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
690 ############################################################
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
691
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
692 _avoid_word_break_elements = ['pre', 'textarea', 'code']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
693 _avoid_word_break_classes = ['nobreak']
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
694
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
695 def word_break(el, max_width=40,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
696 avoid_elements=_avoid_word_break_elements,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
697 avoid_classes=_avoid_word_break_classes,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
698 break_character=unichr(0x200b)):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
699 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
700 Breaks any long words found in the body of the text (not attributes).
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
701
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
702 Doesn't effect any of the tags in avoid_elements, by default
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
703 ``<textarea>`` and ``<pre>``
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
704
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
705 Breaks words by inserting &#8203;, which is a unicode character
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
706 for Zero Width Space character. This generally takes up no space
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
707 in rendering, but does copy as a space, and in monospace contexts
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
708 usually takes up space.
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
709
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
710 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
711 """
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
712 # Character suggestion of &#8203 comes from:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
713 # http://www.cs.tut.fi/~jkorpela/html/nobr.html
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
714 if el.tag in _avoid_word_break_elements:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
715 return
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
716 class_name = el.get('class')
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
717 if class_name:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
718 dont_break = False
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
719 class_name = class_name.split()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
720 for avoid in avoid_classes:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
721 if avoid in class_name:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
722 dont_break = True
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
723 break
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
724 if dont_break:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
725 return
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
726 if el.text:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
727 el.text = _break_text(el.text, max_width, break_character)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
728 for child in el:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
729 word_break(child, max_width=max_width,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
730 avoid_elements=avoid_elements,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
731 avoid_classes=avoid_classes,
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
732 break_character=break_character)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
733 if child.tail:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
734 child.tail = _break_text(child.tail, max_width, break_character)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
735
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
736 def word_break_html(html, *args, **kw):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
737 result_type = type(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
738 doc = fromstring(html)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
739 word_break(doc, *args, **kw)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
740 return _transform_result(result_type, doc)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
741
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
742 def _break_text(text, max_width, break_character):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
743 words = text.split()
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
744 for word in words:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
745 if len(word) > max_width:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
746 replacement = _insert_break(word, max_width, break_character)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
747 text = text.replace(word, replacement)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
748 return text
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
749
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
750 _break_prefer_re = re.compile(r'[^a-z]', re.I)
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
751
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
752 def _insert_break(word, width, break_character):
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
753 orig_word = word
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
754 result = ''
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
755 while len(word) > width:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
756 start = word[:width]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
757 breaks = list(_break_prefer_re.finditer(start))
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
758 if breaks:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
759 last_break = breaks[-1]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
760 # Only walk back up to 10 characters to find a nice break:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
761 if last_break.end() > width-10:
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
762 # FIXME: should the break character be at the end of the
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
763 # chunk, or the beginning of the next chunk?
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
764 start = word[:last_break.end()]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
765 result += start + break_character
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
766 word = word[len(start):]
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
767 result += word
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
768 return result
56ad4e20f292 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
769