Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/lxml/html/clean.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
rev | line source |
---|---|
1
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
1 # cython: language_level=3str |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
2 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
3 """A cleanup tool for HTML. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
4 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
5 Removes unwanted tags and content. See the `Cleaner` class for |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
6 details. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
7 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
8 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
9 from __future__ import absolute_import |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
10 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
11 import re |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
12 import copy |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
13 try: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
14 from urlparse import urlsplit |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
15 from urllib import unquote_plus |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
16 except ImportError: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
17 # Python 3 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
18 from urllib.parse import urlsplit, unquote_plus |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
19 from lxml import etree |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
20 from lxml.html import defs |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
21 from lxml.html import fromstring, XHTML_NAMESPACE |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
22 from lxml.html import xhtml_to_html, _transform_result |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
23 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
24 try: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
25 unichr |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
26 except NameError: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
27 # Python 3 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
28 unichr = chr |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
29 try: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
30 unicode |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
31 except NameError: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
32 # Python 3 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
33 unicode = str |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
34 try: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
35 basestring |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
36 except NameError: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
37 basestring = (str, bytes) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
38 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
39 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
41 'word_break', 'word_break_html'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
42 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
43 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
44 # Particularly the CSS cleaning; most of the tag cleaning is integrated now |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
45 # I have multiple kinds of schemes searched; but should schemes be |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
46 # whitelisted instead? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
47 # max height? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
48 # remove images? Also in CSS? background attribute? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
49 # Some way to whitelist object, iframe, etc (e.g., if you want to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
50 # allow *just* embedded YouTube movies) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
51 # Log what was deleted and why? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
52 # style="behavior: ..." might be bad in IE? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
53 # Should we have something for just <meta http-equiv>? That's the worst of the |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
54 # metas. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
55 # UTF-7 detections? Example: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
56 # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
57 # you don't always have to have the charset set, if the page has no charset |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
58 # and there's UTF7-like code in it. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
59 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
60 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
61 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
62 # This is an IE-specific construct you can have in a stylesheet to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
63 # run some Javascript: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
64 _css_javascript_re = re.compile( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
65 r'expression\s*\(.*?\)', re.S|re.I) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
66 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
67 # Do I have to worry about @\nimport? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
68 _css_import_re = re.compile( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
69 r'@\s*import', re.I) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
70 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
71 # All kinds of schemes besides just javascript: that can cause |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
72 # execution: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
73 _is_image_dataurl = re.compile( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
74 r'^data:image/.+;base64', re.I).search |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
75 _is_possibly_malicious_scheme = re.compile( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
76 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
77 re.I).search |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
78 def _is_javascript_scheme(s): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
79 if _is_image_dataurl(s): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
80 return None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
81 return _is_possibly_malicious_scheme(s) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
82 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
84 # FIXME: should data: be blocked? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
85 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
86 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
87 _conditional_comment_re = re.compile( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
88 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
89 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
90 _find_styled_elements = etree.XPath( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
91 "descendant-or-self::*[@style]") |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
92 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
93 _find_external_links = etree.XPath( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
94 ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
95 "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
96 namespaces={'x':XHTML_NAMESPACE}) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
97 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
98 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
99 class Cleaner(object): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
100 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
101 Instances cleans the document of each of the possible offending |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
102 elements. The cleaning is controlled by attributes; you can |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
103 override attributes in a subclass, or set them in the constructor. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
104 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
105 ``scripts``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
106 Removes any ``<script>`` tags. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
107 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
108 ``javascript``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
109 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
110 as they could contain Javascript. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
111 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
112 ``comments``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
113 Removes any comments. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
114 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
115 ``style``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
116 Removes any style tags. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
117 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
118 ``inline_style`` |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
119 Removes any style attributes. Defaults to the value of the ``style`` option. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
120 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
121 ``links``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
122 Removes any ``<link>`` tags |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
123 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
124 ``meta``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
125 Removes any ``<meta>`` tags |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
126 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
127 ``page_structure``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
128 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
129 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
130 ``processing_instructions``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
131 Removes any processing instructions. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
132 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
133 ``embedded``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
134 Removes any embedded objects (flash, iframes) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
135 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
136 ``frames``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
137 Removes any frame-related tags |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
138 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
139 ``forms``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
140 Removes any form tags |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
141 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
142 ``annoying_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
143 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
144 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
145 ``remove_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
146 A list of tags to remove. Only the tags will be removed, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
147 their content will get pulled up into the parent tag. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
148 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
149 ``kill_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
150 A list of tags to kill. Killing also removes the tag's content, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
151 i.e. the whole subtree, not just the tag itself. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
152 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
153 ``allow_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
154 A list of tags to include (default include all). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
155 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
156 ``remove_unknown_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
157 Remove any tags that aren't standard parts of HTML. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
158 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
159 ``safe_attrs_only``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
160 If true, only include 'safe' attributes (specifically the list |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
161 from the feedparser HTML sanitisation web site). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
162 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
163 ``safe_attrs``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
164 A set of attribute names to override the default list of attributes |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
165 considered 'safe' (when safe_attrs_only=True). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
166 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
167 ``add_nofollow``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
168 If true, then any <a> tags will have ``rel="nofollow"`` added to them. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
169 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
170 ``host_whitelist``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
171 A list or set of hosts that you can use for embedded content |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
172 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
173 You can also implement/override the method |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
174 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
175 implement more complex rules for what can be embedded. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
176 Anything that passes this test will be shown, regardless of |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
177 the value of (for instance) ``embedded``. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
178 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
179 Note that this parameter might not work as intended if you do not |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
180 make the links absolute before doing the cleaning. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
181 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
182 Note that you may also need to set ``whitelist_tags``. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
183 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
184 ``whitelist_tags``: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
185 A set of tags that can be included with ``host_whitelist``. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
186 The default is ``iframe`` and ``embed``; you may wish to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
187 include other tags like ``script``, or you may want to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
188 implement ``allow_embedded_url`` for more control. Set to None to |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
189 include all tags. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
190 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
191 This modifies the document *in place*. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
192 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
193 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
194 scripts = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
195 javascript = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
196 comments = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
197 style = False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
198 inline_style = None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
199 links = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
200 meta = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
201 page_structure = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
202 processing_instructions = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
203 embedded = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
204 frames = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
205 forms = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
206 annoying_tags = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
207 remove_tags = None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
208 allow_tags = None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
209 kill_tags = None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
210 remove_unknown_tags = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
211 safe_attrs_only = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
212 safe_attrs = defs.safe_attrs |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
213 add_nofollow = False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
214 host_whitelist = () |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
215 whitelist_tags = {'iframe', 'embed'} |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
216 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
217 def __init__(self, **kw): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
218 not_an_attribute = object() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
219 for name, value in kw.items(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
220 default = getattr(self, name, not_an_attribute) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
221 if (default is not None and default is not True and default is not False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
222 and not isinstance(default, (frozenset, set, tuple, list))): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
223 raise TypeError( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
224 "Unknown parameter: %s=%r" % (name, value)) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
225 setattr(self, name, value) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
226 if self.inline_style is None and 'inline_style' not in kw: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
227 self.inline_style = self.style |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
228 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
229 if kw.get("allow_tags"): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
230 if kw.get("remove_unknown_tags"): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
231 raise ValueError("It does not make sense to pass in both " |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
232 "allow_tags and remove_unknown_tags") |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
233 self.remove_unknown_tags = False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
234 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
235 # Used to lookup the primary URL for a given tag that is up for |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
236 # removal: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
237 _tag_link_attrs = dict( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
238 script='src', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
239 link='href', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
240 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
241 # From what I can tell, both attributes can contain a link: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
242 applet=['code', 'object'], |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
243 iframe='src', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
244 embed='src', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
245 layer='src', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
246 # FIXME: there doesn't really seem like a general way to figure out what |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
247 # links an <object> tag uses; links often go in <param> tags with values |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
248 # that we don't really know. You'd have to have knowledge about specific |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
249 # kinds of plugins (probably keyed off classid), and match against those. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
250 ##object=?, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
251 # FIXME: not looking at the action currently, because it is more complex |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
252 # than than -- if you keep the form, you should keep the form controls. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
253 ##form='action', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
254 a='href', |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
255 ) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
256 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
257 def __call__(self, doc): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
258 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
259 Cleans the document. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
260 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
261 try: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
262 getroot = doc.getroot |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
263 except AttributeError: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
264 pass # Element instance |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
265 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
266 doc = getroot() # ElementTree instance, instead of an element |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
267 # convert XHTML to HTML |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
268 xhtml_to_html(doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
269 # Normalize a case that IE treats <image> like <img>, and that |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
270 # can confuse either this step or later steps. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
271 for el in doc.iter('image'): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
272 el.tag = 'img' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
273 if not self.comments: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
274 # Of course, if we were going to kill comments anyway, we don't |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
275 # need to worry about this |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
276 self.kill_conditional_comments(doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
277 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
278 kill_tags = set(self.kill_tags or ()) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
279 remove_tags = set(self.remove_tags or ()) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
280 allow_tags = set(self.allow_tags or ()) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
281 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
282 if self.scripts: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
283 kill_tags.add('script') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
284 if self.safe_attrs_only: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
285 safe_attrs = set(self.safe_attrs) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
286 for el in doc.iter(etree.Element): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
287 attrib = el.attrib |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
288 for aname in attrib.keys(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
289 if aname not in safe_attrs: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
290 del attrib[aname] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
291 if self.javascript: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
292 if not (self.safe_attrs_only and |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
293 self.safe_attrs == defs.safe_attrs): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
294 # safe_attrs handles events attributes itself |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
295 for el in doc.iter(etree.Element): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
296 attrib = el.attrib |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
297 for aname in attrib.keys(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
298 if aname.startswith('on'): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
299 del attrib[aname] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
300 doc.rewrite_links(self._remove_javascript_link, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
301 resolve_base_href=False) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
302 # If we're deleting style then we don't have to remove JS links |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
303 # from styles, otherwise... |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
304 if not self.inline_style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
305 for el in _find_styled_elements(doc): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
306 old = el.get('style') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
307 new = _css_javascript_re.sub('', old) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
308 new = _css_import_re.sub('', new) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
309 if self._has_sneaky_javascript(new): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
310 # Something tricky is going on... |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
311 del el.attrib['style'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
312 elif new != old: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
313 el.set('style', new) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
314 if not self.style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
315 for el in list(doc.iter('style')): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
316 if el.get('type', '').lower().strip() == 'text/javascript': |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
317 el.drop_tree() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
318 continue |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
319 old = el.text or '' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
320 new = _css_javascript_re.sub('', old) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
321 # The imported CSS can do anything; we just can't allow: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
322 new = _css_import_re.sub('', old) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
323 if self._has_sneaky_javascript(new): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
324 # Something tricky is going on... |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
325 el.text = '/* deleted */' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
326 elif new != old: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
327 el.text = new |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
328 if self.comments: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
329 kill_tags.add(etree.Comment) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
330 if self.processing_instructions: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
331 kill_tags.add(etree.ProcessingInstruction) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
332 if self.style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
333 kill_tags.add('style') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
334 if self.inline_style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
335 etree.strip_attributes(doc, 'style') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
336 if self.links: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
337 kill_tags.add('link') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
338 elif self.style or self.javascript: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
339 # We must get rid of included stylesheets if Javascript is not |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
340 # allowed, as you can put Javascript in them |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
341 for el in list(doc.iter('link')): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
342 if 'stylesheet' in el.get('rel', '').lower(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
343 # Note this kills alternate stylesheets as well |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
344 if not self.allow_element(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
345 el.drop_tree() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
346 if self.meta: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
347 kill_tags.add('meta') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
348 if self.page_structure: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
349 remove_tags.update(('head', 'html', 'title')) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
350 if self.embedded: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
351 # FIXME: is <layer> really embedded? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
352 # We should get rid of any <param> tags not inside <applet>; |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
353 # These are not really valid anyway. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
354 for el in list(doc.iter('param')): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
355 found_parent = False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
356 parent = el.getparent() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
357 while parent is not None and parent.tag not in ('applet', 'object'): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
358 parent = parent.getparent() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
359 if parent is None: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
360 el.drop_tree() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
361 kill_tags.update(('applet',)) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
362 # The alternate contents that are in an iframe are a good fallback: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
363 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
364 if self.frames: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
365 # FIXME: ideally we should look at the frame links, but |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
366 # generally frames don't mix properly with an HTML |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
367 # fragment anyway. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
368 kill_tags.update(defs.frame_tags) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
369 if self.forms: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
370 remove_tags.add('form') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
371 kill_tags.update(('button', 'input', 'select', 'textarea')) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
372 if self.annoying_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
373 remove_tags.update(('blink', 'marquee')) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
374 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
375 _remove = [] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
376 _kill = [] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
377 for el in doc.iter(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
378 if el.tag in kill_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
379 if self.allow_element(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
380 continue |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
381 _kill.append(el) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
382 elif el.tag in remove_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
383 if self.allow_element(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
384 continue |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
385 _remove.append(el) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
386 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
387 if _remove and _remove[0] == doc: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
388 # We have to drop the parent-most tag, which we can't |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
389 # do. Instead we'll rewrite it: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
390 el = _remove.pop(0) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
391 el.tag = 'div' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
392 el.attrib.clear() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
393 elif _kill and _kill[0] == doc: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
394 # We have to drop the parent-most element, which we can't |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
395 # do. Instead we'll clear it: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
396 el = _kill.pop(0) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
397 if el.tag != 'html': |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
398 el.tag = 'div' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
399 el.clear() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
400 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
401 _kill.reverse() # start with innermost tags |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
402 for el in _kill: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
403 el.drop_tree() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
404 for el in _remove: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
405 el.drop_tag() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
406 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
407 if self.remove_unknown_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
408 if allow_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
409 raise ValueError( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
410 "It does not make sense to pass in both allow_tags and remove_unknown_tags") |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
411 allow_tags = set(defs.tags) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
412 if allow_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
413 # make sure we do not remove comments/PIs if users want them (which is rare enough) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
414 if not self.comments: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
415 allow_tags.add(etree.Comment) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
416 if not self.processing_instructions: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
417 allow_tags.add(etree.ProcessingInstruction) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
418 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
419 bad = [] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
420 for el in doc.iter(): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
421 if el.tag not in allow_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
422 bad.append(el) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
423 if bad: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
424 if bad[0] is doc: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
425 el = bad.pop(0) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
426 el.tag = 'div' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
427 el.attrib.clear() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
428 for el in bad: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
429 el.drop_tag() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
430 if self.add_nofollow: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
431 for el in _find_external_links(doc): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
432 if not self.allow_follow(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
433 rel = el.get('rel') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
434 if rel: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
435 if ('nofollow' in rel |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
436 and ' nofollow ' in (' %s ' % rel)): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
437 continue |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
438 rel = '%s nofollow' % rel |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
439 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
440 rel = 'nofollow' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
441 el.set('rel', rel) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
442 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
443 def allow_follow(self, anchor): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
444 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
445 Override to suppress rel="nofollow" on some anchors. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
446 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
447 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
448 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
449 def allow_element(self, el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
450 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
451 Decide whether an element is configured to be accepted or rejected. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
452 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
453 :param el: an element. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
454 :return: true to accept the element or false to reject/discard it. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
455 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
456 if el.tag not in self._tag_link_attrs: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
457 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
458 attr = self._tag_link_attrs[el.tag] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
459 if isinstance(attr, (list, tuple)): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
460 for one_attr in attr: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
461 url = el.get(one_attr) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
462 if not url: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
463 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
464 if not self.allow_embedded_url(el, url): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
465 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
466 return True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
467 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
468 url = el.get(attr) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
469 if not url: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
470 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
471 return self.allow_embedded_url(el, url) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
472 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
473 def allow_embedded_url(self, el, url): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
474 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
475 Decide whether a URL that was found in an element's attributes or text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
476 if configured to be accepted or rejected. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
477 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
478 :param el: an element. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
479 :param url: a URL found on the element. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
480 :return: true to accept the URL and false to reject it. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
481 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
482 if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
483 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
484 scheme, netloc, path, query, fragment = urlsplit(url) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
485 netloc = netloc.lower().split(':', 1)[0] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
486 if scheme not in ('http', 'https'): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
487 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
488 if netloc in self.host_whitelist: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
489 return True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
490 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
491 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
492 def kill_conditional_comments(self, doc): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
493 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
494 IE conditional comments basically embed HTML that the parser |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
495 doesn't normally see. We can't allow anything like that, so |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
496 we'll kill any comments that could be conditional. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
497 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
498 has_conditional_comment = _conditional_comment_re.search |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
499 self._kill_elements( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
500 doc, lambda el: has_conditional_comment(el.text), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
501 etree.Comment) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
502 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
503 def _kill_elements(self, doc, condition, iterate=None): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
504 bad = [] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
505 for el in doc.iter(iterate): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
506 if condition(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
507 bad.append(el) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
508 for el in bad: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
509 el.drop_tree() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
510 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
511 def _remove_javascript_link(self, link): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
512 # links like "j a v a s c r i p t:" might be interpreted in IE |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
513 new = _substitute_whitespace('', unquote_plus(link)) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
514 if _is_javascript_scheme(new): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
515 # FIXME: should this be None to delete? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
516 return '' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
517 return link |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
518 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
519 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
520 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
521 def _has_sneaky_javascript(self, style): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
522 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
523 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
524 can get interpreted, or ``expre/* stuff */ssion(...)``. This |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
525 checks for attempt to do stuff like this. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
526 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
527 Typically the response will be to kill the entire style; if you |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
528 have just a bit of Javascript in the style another rule will catch |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
529 that and remove only the Javascript from the style; this catches |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
530 more sneaky attempts. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
531 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
532 style = self._substitute_comments('', style) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
533 style = style.replace('\\', '') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
534 style = _substitute_whitespace('', style) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
535 style = style.lower() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
536 if 'javascript:' in style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
537 return True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
538 if 'expression(' in style: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
539 return True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
540 return False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
541 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
542 def clean_html(self, html): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
543 result_type = type(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
544 if isinstance(html, basestring): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
545 doc = fromstring(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
546 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
547 doc = copy.deepcopy(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
548 self(doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
549 return _transform_result(result_type, doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
550 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
551 clean = Cleaner() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
552 clean_html = clean.clean_html |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
553 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
554 ############################################################ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
555 ## Autolinking |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
556 ############################################################ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
557 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
558 _link_regexes = [ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
559 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
560 # This is conservative, but autolinking can be a bit conservative: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
561 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
562 ] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
563 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
564 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
565 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
566 _avoid_hosts = [ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
567 re.compile(r'^localhost', re.I), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
568 re.compile(r'\bexample\.(?:com|org|net)$', re.I), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
569 re.compile(r'^127\.0\.0\.1$'), |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
570 ] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
571 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
572 _avoid_classes = ['nolink'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
573 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
574 def autolink(el, link_regexes=_link_regexes, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
575 avoid_elements=_avoid_elements, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
576 avoid_hosts=_avoid_hosts, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
577 avoid_classes=_avoid_classes): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
578 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
579 Turn any URLs into links. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
580 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
581 It will search for links identified by the given regular |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
582 expressions (by default mailto and http(s) links). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
583 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
584 It won't link text in an element in avoid_elements, or an element |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
585 with a class in avoid_classes. It won't link to anything with a |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
586 host that matches one of the regular expressions in avoid_hosts |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
587 (default localhost and 127.0.0.1). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
588 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
589 If you pass in an element, the element's tail will not be |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
590 substituted, only the contents of the element. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
591 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
592 if el.tag in avoid_elements: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
593 return |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
594 class_name = el.get('class') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
595 if class_name: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
596 class_name = class_name.split() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
597 for match_class in avoid_classes: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
598 if match_class in class_name: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
599 return |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
600 for child in list(el): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
601 autolink(child, link_regexes=link_regexes, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
602 avoid_elements=avoid_elements, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
603 avoid_hosts=avoid_hosts, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
604 avoid_classes=avoid_classes) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
605 if child.tail: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
606 text, tail_children = _link_text( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
607 child.tail, link_regexes, avoid_hosts, factory=el.makeelement) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
608 if tail_children: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
609 child.tail = text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
610 index = el.index(child) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
611 el[index+1:index+1] = tail_children |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
612 if el.text: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
613 text, pre_children = _link_text( |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
614 el.text, link_regexes, avoid_hosts, factory=el.makeelement) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
615 if pre_children: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
616 el.text = text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
617 el[:0] = pre_children |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
618 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
619 def _link_text(text, link_regexes, avoid_hosts, factory): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
620 leading_text = '' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
621 links = [] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
622 last_pos = 0 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
623 while 1: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
624 best_match, best_pos = None, None |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
625 for regex in link_regexes: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
626 regex_pos = last_pos |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
627 while 1: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
628 match = regex.search(text, pos=regex_pos) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
629 if match is None: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
630 break |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
631 host = match.group('host') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
632 for host_regex in avoid_hosts: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
633 if host_regex.search(host): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
634 regex_pos = match.end() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
635 break |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
636 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
637 break |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
638 if match is None: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
639 continue |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
640 if best_pos is None or match.start() < best_pos: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
641 best_match = match |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
642 best_pos = match.start() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
643 if best_match is None: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
644 # No more matches |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
645 if links: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
646 assert not links[-1].tail |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
647 links[-1].tail = text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
648 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
649 assert not leading_text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
650 leading_text = text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
651 break |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
652 link = best_match.group(0) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
653 end = best_match.end() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
654 if link.endswith('.') or link.endswith(','): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
655 # These punctuation marks shouldn't end a link |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
656 end -= 1 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
657 link = link[:-1] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
658 prev_text = text[:best_match.start()] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
659 if links: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
660 assert not links[-1].tail |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
661 links[-1].tail = prev_text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
662 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
663 assert not leading_text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
664 leading_text = prev_text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
665 anchor = factory('a') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
666 anchor.set('href', link) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
667 body = best_match.group('body') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
668 if not body: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
669 body = link |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
670 if body.endswith('.') or body.endswith(','): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
671 body = body[:-1] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
672 anchor.text = body |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
673 links.append(anchor) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
674 text = text[end:] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
675 return leading_text, links |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
676 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
677 def autolink_html(html, *args, **kw): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
678 result_type = type(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
679 if isinstance(html, basestring): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
680 doc = fromstring(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
681 else: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
682 doc = copy.deepcopy(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
683 autolink(doc, *args, **kw) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
684 return _transform_result(result_type, doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
685 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
686 autolink_html.__doc__ = autolink.__doc__ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
687 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
688 ############################################################ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
689 ## Word wrapping |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
690 ############################################################ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
691 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
692 _avoid_word_break_elements = ['pre', 'textarea', 'code'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
693 _avoid_word_break_classes = ['nobreak'] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
694 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
695 def word_break(el, max_width=40, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
696 avoid_elements=_avoid_word_break_elements, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
697 avoid_classes=_avoid_word_break_classes, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
698 break_character=unichr(0x200b)): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
699 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
700 Breaks any long words found in the body of the text (not attributes). |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
701 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
702 Doesn't effect any of the tags in avoid_elements, by default |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
703 ``<textarea>`` and ``<pre>`` |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
704 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
705 Breaks words by inserting ​, which is a unicode character |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
706 for Zero Width Space character. This generally takes up no space |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
707 in rendering, but does copy as a space, and in monospace contexts |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
708 usually takes up space. |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
709 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
710 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
711 """ |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
712 # Character suggestion of ​ comes from: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
713 # http://www.cs.tut.fi/~jkorpela/html/nobr.html |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
714 if el.tag in _avoid_word_break_elements: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
715 return |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
716 class_name = el.get('class') |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
717 if class_name: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
718 dont_break = False |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
719 class_name = class_name.split() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
720 for avoid in avoid_classes: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
721 if avoid in class_name: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
722 dont_break = True |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
723 break |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
724 if dont_break: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
725 return |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
726 if el.text: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
727 el.text = _break_text(el.text, max_width, break_character) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
728 for child in el: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
729 word_break(child, max_width=max_width, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
730 avoid_elements=avoid_elements, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
731 avoid_classes=avoid_classes, |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
732 break_character=break_character) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
733 if child.tail: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
734 child.tail = _break_text(child.tail, max_width, break_character) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
735 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
736 def word_break_html(html, *args, **kw): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
737 result_type = type(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
738 doc = fromstring(html) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
739 word_break(doc, *args, **kw) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
740 return _transform_result(result_type, doc) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
741 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
742 def _break_text(text, max_width, break_character): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
743 words = text.split() |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
744 for word in words: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
745 if len(word) > max_width: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
746 replacement = _insert_break(word, max_width, break_character) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
747 text = text.replace(word, replacement) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
748 return text |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
749 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
750 _break_prefer_re = re.compile(r'[^a-z]', re.I) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
751 |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
752 def _insert_break(word, width, break_character): |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
753 orig_word = word |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
754 result = '' |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
755 while len(word) > width: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
756 start = word[:width] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
757 breaks = list(_break_prefer_re.finditer(start)) |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
758 if breaks: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
759 last_break = breaks[-1] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
760 # Only walk back up to 10 characters to find a nice break: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
761 if last_break.end() > width-10: |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
762 # FIXME: should the break character be at the end of the |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
763 # chunk, or the beginning of the next chunk? |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
764 start = word[:last_break.end()] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
765 result += start + break_character |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
766 word = word[len(start):] |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
767 result += word |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
768 return result |
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
769 |