Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/bs4/builder/_htmlparser.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | d30785e31577 |
children |
rev | line source |
---|---|
0
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
1 # encoding: utf-8 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
2 """Use the HTMLParser library to parse HTML files that aren't too bad.""" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
3 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
4 # Use of this source code is governed by the MIT license. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
5 __license__ = "MIT" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
6 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
7 __all__ = [ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
8 'HTMLParserTreeBuilder', |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
9 ] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
10 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
11 from html.parser import HTMLParser |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
12 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
13 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
14 from html.parser import HTMLParseError |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
15 except ImportError as e: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
16 # HTMLParseError is removed in Python 3.5. Since it can never be |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
17 # thrown in 3.5, we can just define our own class as a placeholder. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
18 class HTMLParseError(Exception): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
19 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
20 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
21 import sys |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
22 import warnings |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
23 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
24 # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
25 # argument, which we'd like to set to False. Unfortunately, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
26 # http://bugs.python.org/issue13273 makes strict=True a better bet |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
27 # before Python 3.2.3. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
28 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
29 # At the end of this file, we monkeypatch HTMLParser so that |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
30 # strict=True works well on Python 3.2.2. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
31 major, minor, release = sys.version_info[:3] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
32 CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
33 CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
34 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
35 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
36 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
37 from bs4.element import ( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
38 CData, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
39 Comment, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
40 Declaration, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
41 Doctype, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
42 ProcessingInstruction, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
43 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
44 from bs4.dammit import EntitySubstitution, UnicodeDammit |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
45 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
46 from bs4.builder import ( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
47 HTML, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
48 HTMLTreeBuilder, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
49 STRICT, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
50 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
51 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
52 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
53 HTMLPARSER = 'html.parser' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
54 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
55 class BeautifulSoupHTMLParser(HTMLParser): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
56 """A subclass of the Python standard library's HTMLParser class, which |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
57 listens for HTMLParser events and translates them into calls |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
58 to Beautiful Soup's tree construction API. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
59 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
60 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
61 # Strategies for handling duplicate attributes |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
62 IGNORE = 'ignore' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
63 REPLACE = 'replace' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
64 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
65 def __init__(self, *args, **kwargs): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
66 """Constructor. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
67 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
68 :param on_duplicate_attribute: A strategy for what to do if a |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
69 tag includes the same attribute more than once. Accepted |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
70 values are: REPLACE (replace earlier values with later |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
71 ones, the default), IGNORE (keep the earliest value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
72 encountered), or a callable. A callable must take three |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
73 arguments: the dictionary of attributes already processed, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
74 the name of the duplicate attribute, and the most recent value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
75 encountered. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
76 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
77 self.on_duplicate_attribute = kwargs.pop( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
78 'on_duplicate_attribute', self.REPLACE |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
79 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
80 HTMLParser.__init__(self, *args, **kwargs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
81 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
82 # Keep a list of empty-element tags that were encountered |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
83 # without an explicit closing tag. If we encounter a closing tag |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
84 # of this type, we'll associate it with one of those entries. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
85 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
86 # This isn't a stack because we don't care about the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
87 # order. It's a list of closing tags we've already handled and |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
88 # will ignore, assuming they ever show up. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
89 self.already_closed_empty_element = [] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
90 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
91 def error(self, msg): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
92 """In Python 3, HTMLParser subclasses must implement error(), although |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
93 this requirement doesn't appear to be documented. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
94 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
95 In Python 2, HTMLParser implements error() by raising an exception, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
96 which we don't want to do. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
97 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
98 In any event, this method is called only on very strange |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
99 markup and our best strategy is to pretend it didn't happen |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
100 and keep going. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
101 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
102 warnings.warn(msg) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
103 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
104 def handle_startendtag(self, name, attrs): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
105 """Handle an incoming empty-element tag. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
106 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
107 This is only called when the markup looks like <tag/>. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
108 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
109 :param name: Name of the tag. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
110 :param attrs: Dictionary of the tag's attributes. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
111 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
112 # is_startend() tells handle_starttag not to close the tag |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
113 # just because its name matches a known empty-element tag. We |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
114 # know that this is an empty-element tag and we want to call |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
115 # handle_endtag ourselves. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
116 tag = self.handle_starttag(name, attrs, handle_empty_element=False) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
117 self.handle_endtag(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
118 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
119 def handle_starttag(self, name, attrs, handle_empty_element=True): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
120 """Handle an opening tag, e.g. '<tag>' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
121 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
122 :param name: Name of the tag. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
123 :param attrs: Dictionary of the tag's attributes. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
124 :param handle_empty_element: True if this tag is known to be |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
125 an empty-element tag (i.e. there is not expected to be any |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
126 closing tag). |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
127 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
128 # XXX namespace |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
129 attr_dict = {} |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
130 for key, value in attrs: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
131 # Change None attribute values to the empty string |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
132 # for consistency with the other tree builders. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
133 if value is None: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
134 value = '' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
135 if key in attr_dict: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
136 # A single attribute shows up multiple times in this |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
137 # tag. How to handle it depends on the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
138 # on_duplicate_attribute setting. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
139 on_dupe = self.on_duplicate_attribute |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
140 if on_dupe == self.IGNORE: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
141 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
142 elif on_dupe in (None, self.REPLACE): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
143 attr_dict[key] = value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
144 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
145 on_dupe(attr_dict, key, value) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
146 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
147 attr_dict[key] = value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
148 attrvalue = '""' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
149 #print("START", name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
150 sourceline, sourcepos = self.getpos() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
151 tag = self.soup.handle_starttag( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
152 name, None, None, attr_dict, sourceline=sourceline, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
153 sourcepos=sourcepos |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
154 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
155 if tag and tag.is_empty_element and handle_empty_element: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
156 # Unlike other parsers, html.parser doesn't send separate end tag |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
157 # events for empty-element tags. (It's handled in |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
158 # handle_startendtag, but only if the original markup looked like |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
159 # <tag/>.) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
160 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
161 # So we need to call handle_endtag() ourselves. Since we |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
162 # know the start event is identical to the end event, we |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
163 # don't want handle_endtag() to cross off any previous end |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
164 # events for tags of this name. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
165 self.handle_endtag(name, check_already_closed=False) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
166 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
167 # But we might encounter an explicit closing tag for this tag |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
168 # later on. If so, we want to ignore it. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
169 self.already_closed_empty_element.append(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
170 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
171 def handle_endtag(self, name, check_already_closed=True): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
172 """Handle a closing tag, e.g. '</tag>' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
173 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
174 :param name: A tag name. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
175 :param check_already_closed: True if this tag is expected to |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
176 be the closing portion of an empty-element tag, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
177 e.g. '<tag></tag>'. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
178 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
179 #print("END", name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
180 if check_already_closed and name in self.already_closed_empty_element: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
181 # This is a redundant end tag for an empty-element tag. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
182 # We've already called handle_endtag() for it, so just |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
183 # check it off the list. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
184 # print("ALREADY CLOSED", name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
185 self.already_closed_empty_element.remove(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
186 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
187 self.soup.handle_endtag(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
188 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
189 def handle_data(self, data): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
190 """Handle some textual data that shows up between tags.""" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
191 self.soup.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
192 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
193 def handle_charref(self, name): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
194 """Handle a numeric character reference by converting it to the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
195 corresponding Unicode character and treating it as textual |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
196 data. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
197 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
198 :param name: Character number, possibly in hexadecimal. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
199 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
200 # XXX workaround for a bug in HTMLParser. Remove this once |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
201 # it's fixed in all supported versions. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
202 # http://bugs.python.org/issue13633 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
203 if name.startswith('x'): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
204 real_name = int(name.lstrip('x'), 16) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
205 elif name.startswith('X'): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
206 real_name = int(name.lstrip('X'), 16) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
207 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
208 real_name = int(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
209 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
210 data = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
211 if real_name < 256: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
212 # HTML numeric entities are supposed to reference Unicode |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
213 # code points, but sometimes they reference code points in |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
214 # some other encoding (ahem, Windows-1252). E.g. “ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
215 # instead of É for LEFT DOUBLE QUOTATION MARK. This |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
216 # code tries to detect this situation and compensate. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
217 for encoding in (self.soup.original_encoding, 'windows-1252'): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
218 if not encoding: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
219 continue |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
220 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
221 data = bytearray([real_name]).decode(encoding) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
222 except UnicodeDecodeError as e: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
223 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
224 if not data: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
225 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
226 data = chr(real_name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
227 except (ValueError, OverflowError) as e: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
228 pass |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
229 data = data or "\N{REPLACEMENT CHARACTER}" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
230 self.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
231 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
232 def handle_entityref(self, name): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
233 """Handle a named entity reference by converting it to the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
234 corresponding Unicode character and treating it as textual |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
235 data. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
236 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
237 :param name: Name of the entity reference. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
238 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
239 character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
240 if character is not None: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
241 data = character |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
242 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
243 # If this were XML, it would be ambiguous whether "&foo" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
244 # was an character entity reference with a missing |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
245 # semicolon or the literal string "&foo". Since this is |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
246 # HTML, we have a complete list of all character entity references, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
247 # and this one wasn't found, so assume it's the literal string "&foo". |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
248 data = "&%s" % name |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
249 self.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
250 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
251 def handle_comment(self, data): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
252 """Handle an HTML comment. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
253 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
254 :param data: The text of the comment. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
255 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
256 self.soup.endData() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
257 self.soup.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
258 self.soup.endData(Comment) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
259 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
260 def handle_decl(self, data): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
261 """Handle a DOCTYPE declaration. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
262 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
263 :param data: The text of the declaration. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
264 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
265 self.soup.endData() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
266 data = data[len("DOCTYPE "):] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
267 self.soup.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
268 self.soup.endData(Doctype) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
269 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
270 def unknown_decl(self, data): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
271 """Handle a declaration of unknown type -- probably a CDATA block. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
272 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
273 :param data: The text of the declaration. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
274 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
275 if data.upper().startswith('CDATA['): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
276 cls = CData |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
277 data = data[len('CDATA['):] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
278 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
279 cls = Declaration |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
280 self.soup.endData() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
281 self.soup.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
282 self.soup.endData(cls) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
283 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
284 def handle_pi(self, data): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
285 """Handle a processing instruction. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
286 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
287 :param data: The text of the instruction. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
288 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
289 self.soup.endData() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
290 self.soup.handle_data(data) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
291 self.soup.endData(ProcessingInstruction) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
292 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
293 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
294 class HTMLParserTreeBuilder(HTMLTreeBuilder): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
295 """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
296 found in the Python standard library. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
297 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
298 is_xml = False |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
299 picklable = True |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
300 NAME = HTMLPARSER |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
301 features = [NAME, HTML, STRICT] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
302 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
303 # The html.parser knows which line number and position in the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
304 # original file is the source of an element. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
305 TRACKS_LINE_NUMBERS = True |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
306 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
307 def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
308 """Constructor. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
309 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
310 :param parser_args: Positional arguments to pass into |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
311 the BeautifulSoupHTMLParser constructor, once it's |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
312 invoked. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
313 :param parser_kwargs: Keyword arguments to pass into |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
314 the BeautifulSoupHTMLParser constructor, once it's |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
315 invoked. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
316 :param kwargs: Keyword arguments for the superclass constructor. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
317 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
318 # Some keyword arguments will be pulled out of kwargs and placed |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
319 # into parser_kwargs. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
320 extra_parser_kwargs = dict() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
321 for arg in ('on_duplicate_attribute',): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
322 if arg in kwargs: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
323 value = kwargs.pop(arg) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
324 extra_parser_kwargs[arg] = value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
325 super(HTMLParserTreeBuilder, self).__init__(**kwargs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
326 parser_args = parser_args or [] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
327 parser_kwargs = parser_kwargs or {} |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
328 parser_kwargs.update(extra_parser_kwargs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
329 if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
330 parser_kwargs['strict'] = False |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
331 if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
332 parser_kwargs['convert_charrefs'] = False |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
333 self.parser_args = (parser_args, parser_kwargs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
334 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
335 def prepare_markup(self, markup, user_specified_encoding=None, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
336 document_declared_encoding=None, exclude_encodings=None): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
337 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
338 """Run any preliminary steps necessary to make incoming markup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
339 acceptable to the parser. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
340 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
341 :param markup: Some markup -- probably a bytestring. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
342 :param user_specified_encoding: The user asked to try this encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
343 :param document_declared_encoding: The markup itself claims to be |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
344 in this encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
345 :param exclude_encodings: The user asked _not_ to try any of |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
346 these encodings. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
347 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
348 :yield: A series of 4-tuples: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
349 (markup, encoding, declared encoding, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
350 has undergone character replacement) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
351 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
352 Each 4-tuple represents a strategy for converting the |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
353 document to Unicode and parsing it. Each strategy will be tried |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
354 in turn. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
355 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
356 if isinstance(markup, str): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
357 # Parse Unicode as-is. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
358 yield (markup, None, None, False) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
359 return |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
360 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
361 # Ask UnicodeDammit to sniff the most likely encoding. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
362 try_encodings = [user_specified_encoding, document_declared_encoding] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
363 dammit = UnicodeDammit(markup, try_encodings, is_html=True, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
364 exclude_encodings=exclude_encodings) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
365 yield (dammit.markup, dammit.original_encoding, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
366 dammit.declared_html_encoding, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
367 dammit.contains_replacement_characters) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
368 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
369 def feed(self, markup): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
370 """Run some incoming markup through some parsing process, |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
371 populating the `BeautifulSoup` object in self.soup. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
372 """ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
373 args, kwargs = self.parser_args |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
374 parser = BeautifulSoupHTMLParser(*args, **kwargs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
375 parser.soup = self.soup |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
376 try: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
377 parser.feed(markup) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
378 parser.close() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
379 except HTMLParseError as e: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
380 warnings.warn(RuntimeWarning( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
381 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
382 raise e |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
383 parser.already_closed_empty_element = [] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
384 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
385 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
386 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
387 # string. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
388 # |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
389 # XXX This code can be removed once most Python 3 users are on 3.2.3. |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
390 if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
391 import re |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
392 attrfind_tolerant = re.compile( |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
393 r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
394 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
395 HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
396 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
397 locatestarttagend = re.compile(r""" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
398 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
399 (?:\s+ # whitespace before attribute name |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
400 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
401 (?:\s*=\s* # value indicator |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
402 (?:'[^']*' # LITA-enclosed value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
403 |\"[^\"]*\" # LIT-enclosed value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
404 |[^'\">\s]+ # bare value |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
405 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
406 )? |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
407 ) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
408 )* |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
409 \s* # trailing whitespace |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
410 """, re.VERBOSE) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
411 BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
412 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
413 from html.parser import tagfind, attrfind |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
414 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
415 def parse_starttag(self, i): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
416 self.__starttag_text = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
417 endpos = self.check_for_whole_start_tag(i) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
418 if endpos < 0: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
419 return endpos |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
420 rawdata = self.rawdata |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
421 self.__starttag_text = rawdata[i:endpos] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
422 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
423 # Now parse the data between i+1 and j into a tag and attrs |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
424 attrs = [] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
425 match = tagfind.match(rawdata, i+1) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
426 assert match, 'unexpected call to parse_starttag()' |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
427 k = match.end() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
428 self.lasttag = tag = rawdata[i+1:k].lower() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
429 while k < endpos: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
430 if self.strict: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
431 m = attrfind.match(rawdata, k) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
432 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
433 m = attrfind_tolerant.match(rawdata, k) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
434 if not m: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
435 break |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
436 attrname, rest, attrvalue = m.group(1, 2, 3) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
437 if not rest: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
438 attrvalue = None |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
439 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
440 attrvalue[:1] == '"' == attrvalue[-1:]: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
441 attrvalue = attrvalue[1:-1] |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
442 if attrvalue: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
443 attrvalue = self.unescape(attrvalue) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
444 attrs.append((attrname.lower(), attrvalue)) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
445 k = m.end() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
446 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
447 end = rawdata[k:endpos].strip() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
448 if end not in (">", "/>"): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
449 lineno, offset = self.getpos() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
450 if "\n" in self.__starttag_text: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
451 lineno = lineno + self.__starttag_text.count("\n") |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
452 offset = len(self.__starttag_text) \ |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
453 - self.__starttag_text.rfind("\n") |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
454 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
455 offset = offset + len(self.__starttag_text) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
456 if self.strict: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
457 self.error("junk characters in start tag: %r" |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
458 % (rawdata[k:endpos][:20],)) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
459 self.handle_data(rawdata[i:endpos]) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
460 return endpos |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
461 if end.endswith('/>'): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
462 # XHTML-style empty tag: <span attr="value" /> |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
463 self.handle_startendtag(tag, attrs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
464 else: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
465 self.handle_starttag(tag, attrs) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
466 if tag in self.CDATA_CONTENT_ELEMENTS: |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
467 self.set_cdata_mode(tag) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
468 return endpos |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
469 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
470 def set_cdata_mode(self, elem): |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
471 self.cdata_elem = elem.lower() |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
472 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
473 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
474 BeautifulSoupHTMLParser.parse_starttag = parse_starttag |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
475 BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
476 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
477 CONSTRUCTOR_TAKES_STRICT = True |