comparison env/lib/python3.7/site-packages/bs4/builder/_htmlparser.py @ 2:6af9afd405e9 draft

"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
author shellac
date Thu, 14 May 2020 14:56:58 -0400
parents 26e78fe6e8c4
children
comparison
equal deleted inserted replaced
1:75ca89e9b81c 2:6af9afd405e9
1 # encoding: utf-8
2 """Use the HTMLParser library to parse HTML files that aren't too bad."""
3
4 # Use of this source code is governed by the MIT license.
5 __license__ = "MIT"
6
7 __all__ = [
8 'HTMLParserTreeBuilder',
9 ]
10
11 from html.parser import HTMLParser
12
try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder
    # so that `except HTMLParseError` clauses elsewhere in this module
    # remain valid.
    class HTMLParseError(Exception):
        """Placeholder for the exception class removed from html.parser."""
20
21 import sys
22 import warnings
23
# The HTMLParser constructor gained a 'strict' argument in Python 3.2.
# We would prefer to pass strict=False, but
# http://bugs.python.org/issue13273 makes strict=True the better bet
# on releases older than Python 3.2.3.
#
# The monkeypatch at the end of this file makes strict=True work well
# on Python 3.2.2.
major, minor, release = sys.version_info[:3]

# True only on 3.2.x with x >= 3.
CONSTRUCTOR_TAKES_STRICT = (major, minor) == (3, 2) and release >= 3
# True only on 3.3.x.
CONSTRUCTOR_STRICT_IS_DEPRECATED = (major, minor) == (3, 3)
# True on 3.4 and every later 3.x release.
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = (3, 4) <= (major, minor) < (4, 0)
35
36
37 from bs4.element import (
38 CData,
39 Comment,
40 Declaration,
41 Doctype,
42 ProcessingInstruction,
43 )
44 from bs4.dammit import EntitySubstitution, UnicodeDammit
45
46 from bs4.builder import (
47 HTML,
48 HTMLTreeBuilder,
49 STRICT,
50 )
51
52
# Feature name of this tree builder; used below as
# HTMLParserTreeBuilder.NAME and as the first entry of its `features` list.
HTMLPARSER = 'html.parser'
54
class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The object receiving those calls must be assigned to ``self.soup``
    before any markup is fed to the parser.
    """

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param args: Positional arguments for the HTMLParser constructor.
        :param kwargs: Keyword arguments for the HTMLParser constructor.
        """
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: Description of the problem; emitted as a warning.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: List of (attribute name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: List of (attribute name, value) pairs. If a name
            appears more than once, the last value wins.
        :param handle_empty_element: If True (the default) and the new
            tag turns out to be an empty-element tag, the matching
            end-tag event is emitted immediately. handle_startendtag
            passes False because it emits the end event itself.
        """
        # XXX namespace
        # Change None attribute values to the empty string
        # for consistency with the other tree builders.
        attr_dict = {
            key: ('' if value is None else value) for key, value in attrs
        }
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True (the default), a closing
            tag whose name is recorded in already_closed_empty_element
            is treated as the redundant second half of an empty-element
            tag (e.g. the '</br>' in '<br></br>'): it is crossed off
            that list and otherwise ignored.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal (in
            which case it starts with 'x' or 'X').
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        prefix = name[:1]
        if prefix in ('x', 'X'):
            real_name = int(name.lstrip(prefix), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # Every usable encoding is tried in order and a later
            # successful decode replaces an earlier one, so
            # windows-1252 takes precedence whenever it can decode
            # the byte.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; fall through to the
                # replacement character below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was an character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        # Flush pending text, then emit the comment as its own node.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    @staticmethod
    def _strip_doctype_keyword(data):
        """Remove a leading 'DOCTYPE' keyword from a declaration.

        The keyword is matched case-insensitively, and at most one
        whitespace character following it is removed as well. Text
        that does not start with the keyword is returned unchanged.

        :param data: The text of the declaration.
        :return: The declaration text without the keyword.
        """
        keyword = "DOCTYPE"
        if data[:len(keyword)].upper() != keyword:
            return data
        remainder = data[len(keyword):]
        if remainder[:1].isspace():
            remainder = remainder[1:]
        return remainder

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        # Strip the keyword itself rather than a fixed number of
        # characters, so that markup such as '<!DOCTYPEhtml>' does not
        # lose the first character of its payload.
        data = self._strip_doctype_keyword(data)
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
262
263
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked. The mapping is copied; the caller's object is
            never modified.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        # Work on a private copy: the version-specific keys added below
        # must not leak into a dictionary the caller may reuse.
        parser_kwargs = dict(parser_kwargs) if parser_kwargs else {}
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # Character references are delivered through
            # handle_charref/handle_entityref instead of being
            # converted by HTMLParser itself.
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser raises it.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback intact.
            raise
        parser.already_closed_empty_element = []
346
347 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
348 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
349 # string.
350 #
351 # XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    # This branch is taken only on Python 3.2 releases older than 3.2.3
    # (CONSTRUCTOR_TAKES_STRICT is already True on 3.2.3 and later 3.2.x).
    import re
    # Lenient attribute pattern, used by parse_starttag() below when the
    # parser is not in strict mode.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is attached to the tree builder class, while
    # parse_starttag() reads the local name `attrfind_tolerant` directly;
    # nothing in this file reads the attribute back -- confirm whether
    # BeautifulSoupHTMLParser was the intended target.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Replacement start-tag pattern installed on the parser class.
    locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at index `i` of self.rawdata.

        Installed below as BeautifulSoupHTMLParser.parse_starttag.

        :param i: Index into self.rawdata at which the tag starts.
        :return: The index just past the tag, or the negative value
            returned by check_for_whole_start_tag() when that call
            reports a negative end position.
        """
        # NOTE(review): this function is defined at module level, so
        # `self.__starttag_text` is presumably not name-mangled the way
        # it would be inside the HTMLParser class body; it may therefore
        # set a different attribute than the one HTMLParser itself
        # reads -- confirm.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        # Consume attributes one at a time until the pattern stops
        # matching or the end of the tag is reached.
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without any '=value' part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Drop the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            # lineno/offset are computed here but are not read again
            # within this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Pass the whole malformed tag through as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record `elem` (lowercased) as the current CDATA element and set
        self.interesting to a case-insensitive pattern matching its
        closing tag.

        Installed below as BeautifulSoupHTMLParser.set_cdata_mode.

        :param elem: Name of the element whose content is CDATA.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches in place, report that the constructor accepts
    # 'strict'.
    CONSTRUCTOR_TAKES_STRICT = True