comparison planemo/lib/python3.7/site-packages/future/backports/html/parser.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 """A parser for HTML and XHTML.
2
3 Backported for python-future from Python 3.3.
4 """
5
6 # This file is based on sgmllib.py, but the API is slightly different.
7
8 # XXX There should be a way to distinguish between PCDATA (parsed
9 # character data -- the normal case), RCDATA (replaceable character
10 # data -- only char and entity references and end tags are special)
11 # and CDATA (character data -- only end tags are special).
12
13 from __future__ import (absolute_import, division,
14 print_function, unicode_literals)
15 from future.builtins import *
16 from future.backports import _markupbase
17 import re
18 import warnings
19
# Regular expressions used for parsing.
#
# Every pattern that contains a regex escape such as ``\s`` is written as a
# raw string so that Python never interprets (or warns about) the backslash
# itself.

# Characters that interrupt plain character data: start of a reference or tag.
interesting_normal = re.compile('[&<]')
# An "&" that could still turn into an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named and numeric references.  Both patterns also consume the single
# character that follows the reference (normally ";").
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
80
81
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description passed to the constructor.
        lineno: first element of ``position``, or None when unknown.
        offset: second element of ``position`` (zero-based column), or
            None when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Forward the message to Exception so that ``args`` is populated;
        # copy/pickle re-create the exception from ``args`` and would
        # otherwise call the constructor without the required ``msg``.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the position, when it is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; report a one-based column.
            result = result + ", column %d" % (self.offset + 1)
        return result
98
99
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is passed through as raw data until the
    # matching end tag is seen.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as plain data."""
        self.cdata_elem = elem.lower()
        # While in CDATA mode the only "interesting" construct is the
        # matching end tag (case-insensitive).
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling all data as if followed by an
        EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # Inside <script>/<style>: wait for the end tag.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # At EOF: emit the unterminated construct as data, up
                    # to and including the next '>' or up to the next '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matches the terminating character; give
                    # it back unless it is the ';' that ends the reference.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Offsets are relative to the current position i,
                        # not to the start of the buffer.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Skip the '&'; the remaining characters are
                            # flushed as data after the loop.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    def parse_html_declaration(self, i):
        """Internal -- parse html declarations.

        Return the index just past the declaration, or -1 if it is not
        terminated.
        See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
        See also parse_declaration in _markupbase.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    def parse_bogus_comment(self, i, report=1):
        """Internal -- parse bogus comment.

        Return the index just past the closing '>', or -1 if it is not
        terminated.  The text is passed to handle_comment() only when
        *report* is true.
        See http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    def parse_pi(self, i):
        """Internal -- parse processing instr, return end or -1 if not
        terminated."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    def parse_starttag(self, i):
        """Internal -- handle starttag, return end or -1 if not terminated."""
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def check_for_whole_start_tag(self, i):
        """Internal -- check to see if we have a complete starttag.

        Return the index just past the tag, or -1 if it is incomplete.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # A '/' not followed by '>' is treated as a buffer
                # boundary: wait for more data.
                return -1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    def parse_endtag(self, i):
        """Internal -- parse endtag, return end or -1 if incomplete."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside <script>/<style> a malformed end tag is just data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for a different element does not close the
                # CDATA element; pass it through as data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    def unescape(self, s):
        """Internal -- helper to remove special character quoting.

        Replace numeric character references and named HTML5 entities in
        *s* with the characters they stand for.  References that cannot
        be resolved are left in the text.
        """
        if '&' not in s:
            return s

        def replace_entity(match):
            ref = match.group(1)
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        code = int(ref[1:].rstrip(';'), 16)
                    else:
                        code = int(ref.rstrip(';'))
                    return chr(code)
                except (ValueError, OverflowError):
                    # Not a number, or a code point chr() cannot represent
                    # (OverflowError for very long digit strings): keep
                    # the reference text.
                    return '&#' + ref
            # Imported lazily so that numeric references need no table.
            from future.backports.html.entities import html5
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # No trailing ';': try the shortest prefix that names an
            # entity and keep the remaining characters as text.
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replace_entity, s)