Mercurial > repos > guerler > hhblits
comparison lib/python3.8/site-packages/pip/_vendor/html5lib/html5parser.py @ 0:9e54283cc701 draft
"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author | guerler |
---|---|
date | Mon, 27 Jul 2020 03:47:31 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9e54283cc701 |
---|---|
1 from __future__ import absolute_import, division, unicode_literals | |
2 from pip._vendor.six import with_metaclass, viewkeys | |
3 | |
4 import types | |
5 from collections import OrderedDict | |
6 | |
7 from . import _inputstream | |
8 from . import _tokenizer | |
9 | |
10 from . import treebuilders | |
11 from .treebuilders.base import Marker | |
12 | |
13 from . import _utils | |
14 from .constants import ( | |
15 spaceCharacters, asciiUpper2Lower, | |
16 specialElements, headingElements, cdataElements, rcdataElements, | |
17 tokenTypes, tagTokenTypes, | |
18 namespaces, | |
19 htmlIntegrationPointElements, mathmlTextIntegrationPointElements, | |
20 adjustForeignAttributes as adjustForeignAttributesMap, | |
21 adjustMathMLAttributes, adjustSVGAttributes, | |
22 E, | |
23 _ReparseException | |
24 ) | |
25 | |
26 | |
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing; the value is
        handed to ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any additional keyword arguments are forwarded to ``HTMLParser.parse``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_cls = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_cls,
                        namespaceHTMLElements=namespaceHTMLElements)
    document = parser.parse(doc, **kwargs)
    return document
48 | |
49 | |
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing; the value is
        handed to ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any additional keyword arguments are forwarded to
    ``HTMLParser.parseFragment``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)
73 | |
74 | |
def method_decorator_metaclass(function):
    """Build a metaclass that applies *function* to every plain function
    found in the namespace of each class it creates.

    Non-function attributes are left untouched.  The namespace dict passed
    to the metaclass is updated in place before the class is created.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            decorated = {
                name: function(member)
                for name, member in classDict.items()
                if isinstance(member, types.FunctionType)
            }
            # Only existing keys are overwritten, so the namespace keeps
            # its original key order.
            classDict.update(decorated)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
85 | |
86 | |
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()  # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # strict parser with lxml builder

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        # ``tree`` is a class: it is instantiated here, once per parser.
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase object per insertion mode, keyed by phase name.
        self.phases = {name: cls(self, self.tree)
                       for name, cls in getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize *stream* and run the tree construction loop.

        Extra keyword arguments are passed to the tokenizer.  If the main
        loop raises ``_ReparseException`` the parser state is reset and
        the main loop is run a second time.
        """

        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Reset the tree builder and all per-parse state."""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether *element* is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute is (ASCII case-insensitively) ``text/html``
        or ``application/xhtml+xml``; any other element is looked up in
        ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether *element* is listed as a MathML text integration point."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token to the appropriate phase, then process EOF.

        A phase handler may return a token to have it reprocessed (possibly
        by a different phase); the inner loop runs until a handler returns
        ``None``.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Start tags that stay in foreign content even at a MathML text
        # integration point.  Built once instead of on every iteration.
        mathmlForeignStartTags = frozenset(["mglyph", "malignmark"])

        for token in self.normalizedTokens():
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                token_type = new_token["type"]

                if token_type == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The tag name is read from the token currently being
                    # processed (``new_token``), the same token whose type
                    # is being tested.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((token_type == StartTagToken and
                           new_token["name"] not in mathmlForeignStartTags) or
                          token_type in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         token_type == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         token_type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if token_type == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif token_type == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif token_type == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif token_type == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif token_type == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif token_type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # A self-closing start tag that no handler acknowledged is a
            # parse error.
            if (token_type == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Guard against bouncing between phases forever.
                assert self.phase not in phases

    def normalizedTokens(self):
        """Yield each tokenizer token after ``normalizeToken`` has been applied."""
        for token in self.tokenizer:
            yield self.normalizeToken(token)

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property; defaults to 'div' when omitted

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error; raise ``ParseError`` when in strict mode.

        Each entry appended to ``self.errors`` is a
        ``(stream position, errorcode, datavars)`` tuple.
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def normalizeToken(self, token):
        """Turn a start tag's attribute list into an ``OrderedDict``.

        When an attribute name occurs more than once the first occurrence
        wins.  Tokens of other types are returned unchanged.
        """
        # HTML5 specific normalizations to the token stream
        if token["type"] == tokenTypes["StartTag"]:
            raw = token["data"]
            token["data"] = OrderedDict(raw)
            if len(raw) > len(token["data"]):
                # we had some duplicated attribute, fix so first wins
                token["data"].update(raw[::-1])

        return token

    def adjustMathMLAttributes(self, token):
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so
        # this call looks like it would raise AttributeError if it were ever
        # reached.  Intended behaviour is unclear; left untouched.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose the current phase from the stack of open elements."""
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert the element for *token* and switch to the "text" phase.

        :arg contentType: either ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state used for the element's content
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        # Remember where to return once the text phase is finished.
        self.originalPhase = self.phase

        self.phase = self.phases["text"]
410 | |
411 | |
412 @_utils.memoize | |
413 def getPhases(debug): | |
def log(function):
    """Logger that records which phase processes each token

    Returns a wrapper around *function*.  For methods whose name starts
    with ``process`` and that receive at least one positional argument
    (the token), an entry describing the tokenizer state, the parser's
    current phase, the handling phase, the method name and the token is
    appended to ``self.parser.log`` before the call is delegated.
    """
    type_names = {value: key for key, value in tokenTypes.items()}

    def wrapped(self, *args, **kwargs):
        if function.__name__.startswith("process") and args:
            token = args[0]
            info = {"type": type_names[token['type']]}
            if token['type'] in tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)
    return wrapped
438 | |
def getMetaclass(use_metaclass, metaclass_func):
    """Return the metaclass to use for the phase classes.

    When *use_metaclass* is true the result is a metaclass that decorates
    every method with *metaclass_func*; otherwise it is plain ``type``.
    """
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)
444 | |
445 # pylint:disable=unused-argument | |
class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing

    A phase keeps a reference to the parser and to the tree builder.
    Start and end tags are dispatched through the ``startTagHandler`` and
    ``endTagHandler`` lookups, which this base class reads but does not
    create.
    """

    def __init__(self, parser, tree):
        self.tree = tree
        self.parser = parser

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current_node = self.tree.openElements[-1]
        self.tree.insertComment(token, current_node)

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        handler = self.startTagHandler[token["name"]]
        return handler(token)

    def startTagHtml(self, token):
        """Copy attributes of an ``html`` start tag onto the root element.

        Attributes the root element already has are left as they are.
        """
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr_name, attr_value in token["data"].items():
            if attr_name not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attr_name] = attr_value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        handler = self.endTagHandler[token["name"]]
        return handler(token)
486 | |
class InitialPhase(Phase):
    """Phase used before anything but whitespace and comments has been seen.

    A doctype decides the parser's compatibility mode; any other
    non-whitespace token switches to quirks mode and is handed back for
    reprocessing in the "beforeHtml" phase.
    """

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype and derive ``parser.compatMode`` from it."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        # Only ``<!DOCTYPE html>``, optionally with the legacy-compat system
        # identifier, passes without a parse error.
        is_plain_html_doctype = (
            name == "html" and publicId is None and
            (systemId is None or systemId == "about:legacy-compat"))
        if not is_plain_html_doctype:
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        # Public identifier prefixes that force quirks mode.
        quirks_prefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that force quirks mode on an exact match.
        quirks_public_ids = (
            "-//w3o//dtd w3 html strict 3.0//en//",
            "-/w3c/dtd html 4.0 transitional/en",
            "html",
        )
        # HTML 4.01 frameset/transitional: quirks without a system
        # identifier, limited quirks with one.
        html401_prefixes = (
            "-//w3c//dtd html 4.01 frameset//",
            "-//w3c//dtd html 4.01 transitional//",
        )
        limited_quirks_prefixes = (
            "-//w3c//dtd xhtml 1.0 frameset//",
            "-//w3c//dtd xhtml 1.0 transitional//",
        )
        quirks_system_id = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

        is_html401 = publicId.startswith(html401_prefixes)

        if (not correct or name != "html" or
                publicId.startswith(quirks_prefixes) or
                publicId in quirks_public_ids or
                (is_html401 and systemId is None) or
                (systemId and systemId.lower() == quirks_system_id)):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(limited_quirks_prefixes) or
                (is_html401 and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """Fall back to quirks mode and move on to the "beforeHtml" phase."""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               error_vars)
        self.anythingElse()
        return token

    def processEndTag(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               error_vars)
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
614 | |
class BeforeHtmlPhase(Phase):
    """Phase that creates the root ``html`` element.

    Whitespace is dropped and comments are attached to the document.
    Every other token inserts an implied ``html`` root and is returned for
    reprocessing, except end tags other than head/body/html/br, which are
    reported as parse errors and dropped.
    """

    # helper methods
    def insertHtmlElement(self):
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        # Any other end tag is an error and is not reprocessed.
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
649 | |
class BeforeHeadPhase(Phase):
    """Phase between the root element and the ``head`` element.

    Whitespace is dropped.  Tokens other than an explicit ``head`` or
    ``html`` start tag cause an implied ``head`` element to be inserted
    and are then reprocessed; end tags other than head/body/html/br are
    reported as parse errors.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            (("head", "body", "html", "br"), self.endTagImplyHead)
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Insert the ``head`` element, remember it, and enter "inHead"."""
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagImplyHead(self, token):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagOther(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("end-tag-after-implied-root", error_vars)
695 | |
class InHeadPhase(Phase):
    """Phase handling the content of the ``head`` element."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            (("noframes", "style"), self.startTagNoFramesStyle),
            ("noscript", self.startTagNoscript),
            ("script", self.startTagScript),
            (("base", "basefont", "bgsound", "command", "link"),
             self.startTagBaseLinkCommand),
            ("meta", self.startTagMeta),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("head", self.endTagHead),
            (("br", "html", "body"), self.endTagHtmlBodyBr)
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # Insert the element and pop it again straight away, acknowledging
        # a self-closing flag on the token.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert a ``meta`` element and act on an encoding declaration.

        The stream encoding is only changed while it is still marked as
        "tentative".  A ``charset`` attribute takes precedence over a
        ``content`` attribute paired with ``http-equiv="content-type"``.
        """
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
            return

        if ("content" in attributes and
                "http-equiv" in attributes and
                attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content_bytes = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content_bytes)
            codec = _inputstream.ContentAttrParser(data).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert ``script`` and read its content in the "text" phase."""
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Act as if a </head> end tag had been seen.
        self.endTagHead(impliedTagToken("head"))
798 | |
799 class InHeadNoscriptPhase(Phase): | |
800 def __init__(self, parser, tree): | |
801 Phase.__init__(self, parser, tree) | |
802 | |
803 self.startTagHandler = _utils.MethodDispatcher([ | |
804 ("html", self.startTagHtml), | |
805 (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), | |
806 (("head", "noscript"), self.startTagHeadNoscript), | |
807 ]) | |
808 self.startTagHandler.default = self.startTagOther | |
809 | |
810 self.endTagHandler = _utils.MethodDispatcher([ | |
811 ("noscript", self.endTagNoscript), | |
812 ("br", self.endTagBr), | |
813 ]) | |
814 self.endTagHandler.default = self.endTagOther | |
815 | |
816 def processEOF(self): | |
817 self.parser.parseError("eof-in-head-noscript") | |
818 self.anythingElse() | |
819 return True | |
820 | |
821 def processComment(self, token): | |
822 return self.parser.phases["inHead"].processComment(token) | |
823 | |
824 def processCharacters(self, token): | |
825 self.parser.parseError("char-in-head-noscript") | |
826 self.anythingElse() | |
827 return token | |
828 | |
829 def processSpaceCharacters(self, token): | |
830 return self.parser.phases["inHead"].processSpaceCharacters(token) | |
831 | |
832 def startTagHtml(self, token): | |
833 return self.parser.phases["inBody"].processStartTag(token) | |
834 | |
835 def startTagBaseLinkCommand(self, token): | |
836 return self.parser.phases["inHead"].processStartTag(token) | |
837 | |
838 def startTagHeadNoscript(self, token): | |
839 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) | |
840 | |
841 def startTagOther(self, token): | |
842 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) | |
843 self.anythingElse() | |
844 return token | |
845 | |
846 def endTagNoscript(self, token): | |
847 node = self.parser.tree.openElements.pop() | |
848 assert node.name == "noscript", "Expected noscript got %s" % node.name | |
849 self.parser.phase = self.parser.phases["inHead"] | |
850 | |
def endTagBr(self, token):
    """Handle a ``br`` end tag in this phase.

    Reports ``unexpected-inhead-noscript-tag``, runs :meth:`anythingElse`
    and returns the token unchanged.
    """
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-inhead-noscript-tag", details)
    self.anythingElse()
    return token
855 | |
def endTagOther(self, token):
    """Report the end tag as ``unexpected-end-tag``; nothing else is done."""
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-end-tag", details)
858 | |
def anythingElse(self):
    """Run :meth:`endTagNoscript` with an implied ``noscript`` token.

    The caller must raise its parse error before calling this.
    """
    impliedNoscript = impliedTagToken("noscript")
    self.endTagNoscript(impliedNoscript)
862 | |
class AfterHeadPhase(Phase):
    """Token handlers for the "after head" phase.

    ``body`` and ``frameset`` start tags insert the element and switch
    phase, head-only start tags are replayed through the "inHead" phase,
    and most other tokens first insert an implied ``body`` element via
    :meth:`anythingElse`.
    """

    def __init__(self, parser, tree):
        """Build the start/end tag dispatch tables."""
        Phase.__init__(self, parser, tree)

        fromHeadTags = ("base", "basefont", "bgsound", "link", "meta",
                        "noframes", "script", "style", "title")
        startTagTable = [
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (fromHeadTags, self.startTagFromHead),
            ("head", self.startTagHead),
        ]
        self.startTagHandler = _utils.MethodDispatcher(startTagTable)
        self.startTagHandler.default = self.startTagOther

        endTagTable = [(("body", "html", "br"), self.endTagHtmlBodyBr)]
        self.endTagHandler = _utils.MethodDispatcher(endTagTable)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """Run :meth:`anythingElse` and return ``True``."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Run :meth:`anythingElse` and return the token unchanged."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Forward an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBody(self, token):
        """Insert ``body``, clear ``framesetOK`` and switch to "inBody"."""
        parser = self.parser
        parser.framesetOK = False
        self.tree.insertElement(token)
        parser.phase = parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert ``frameset`` and switch to the "inFrameset" phase."""
        self.tree.insertElement(token)
        parser = self.parser
        parser.phase = parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only start tag that arrived after the head.

        The head pointer is pushed onto the open-element stack, the token
        is handled by the "inHead" phase, and the last ``head`` node on
        the stack is then removed again.
        """
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-out-of-my-head", details)
        self.tree.openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        # Walk a reversed snapshot so the removal cannot disturb iteration.
        for candidate in self.tree.openElements[::-1]:
            if candidate.name != "head":
                continue
            self.tree.openElements.remove(candidate)
            break

    def startTagHead(self, token):
        """Report a second ``head`` start tag as unexpected."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag", details)

    def startTagOther(self, token):
        """Run :meth:`anythingElse` and return the token unchanged."""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Run :meth:`anythingElse` and return the token unchanged."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report any other end tag as unexpected."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def anythingElse(self):
        """Insert an implied ``body``, switch to "inBody", set ``framesetOK``."""
        self.tree.insertElement(impliedTagToken("body", "StartTag"))
        parser = self.parser
        parser.phase = parser.phases["inBody"]
        parser.framesetOK = True
929 | |
930 class InBodyPhase(Phase): | |
931 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody | |
932 # the really-really-really-very crazy mode | |
def __init__(self, parser, tree):
    """Build the start/end tag dispatch tables for the "in body" phase."""
    Phase.__init__(self, parser, tree)

    # Start with the plain whitespace handler as the default.
    self.processSpaceCharacters = self.processSpaceCharactersNonPre

    inHeadTags = ("base", "basefont", "bgsound", "command", "link", "meta",
                  "script", "style", "title")
    closePTags = ("address", "article", "aside", "blockquote", "center",
                  "details", "dir", "div", "dl", "fieldset", "figcaption",
                  "figure", "footer", "header", "hgroup", "main", "menu",
                  "nav", "ol", "p", "section", "summary", "ul")
    formattingTags = ("b", "big", "code", "em", "font", "i", "s", "small",
                      "strike", "strong", "tt", "u")
    voidFormattingTags = ("area", "br", "embed", "img", "keygen", "wbr")
    misplacedTags = ("caption", "col", "colgroup", "frame", "head",
                     "tbody", "td", "tfoot", "th", "thead", "tr")

    startTagTable = [
        ("html", self.startTagHtml),
        (inHeadTags, self.startTagProcessInHead),
        ("body", self.startTagBody),
        ("frameset", self.startTagFrameset),
        (closePTags, self.startTagCloseP),
        (headingElements, self.startTagHeading),
        (("pre", "listing"), self.startTagPreListing),
        ("form", self.startTagForm),
        (("li", "dd", "dt"), self.startTagListItem),
        ("plaintext", self.startTagPlaintext),
        ("a", self.startTagA),
        (formattingTags, self.startTagFormatting),
        ("nobr", self.startTagNobr),
        ("button", self.startTagButton),
        (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
        ("xmp", self.startTagXmp),
        ("table", self.startTagTable),
        (voidFormattingTags, self.startTagVoidFormatting),
        (("param", "source", "track"), self.startTagParamSource),
        ("input", self.startTagInput),
        ("hr", self.startTagHr),
        ("image", self.startTagImage),
        ("isindex", self.startTagIsIndex),
        ("textarea", self.startTagTextarea),
        ("iframe", self.startTagIFrame),
        ("noscript", self.startTagNoscript),
        (("noembed", "noframes"), self.startTagRawtext),
        ("select", self.startTagSelect),
        (("rp", "rt"), self.startTagRpRt),
        (("option", "optgroup"), self.startTagOpt),
        ("math", self.startTagMath),
        ("svg", self.startTagSvg),
        (misplacedTags, self.startTagMisplaced),
    ]
    self.startTagHandler = _utils.MethodDispatcher(startTagTable)
    self.startTagHandler.default = self.startTagOther

    blockEndTags = ("address", "article", "aside", "blockquote", "button",
                    "center", "details", "dialog", "dir", "div", "dl",
                    "fieldset", "figcaption", "figure", "footer", "header",
                    "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
                    "section", "summary", "ul")
    formattingEndTags = ("a", "b", "big", "code", "em", "font", "i", "nobr",
                         "s", "small", "strike", "strong", "tt", "u")

    endTagTable = [
        ("body", self.endTagBody),
        ("html", self.endTagHtml),
        (blockEndTags, self.endTagBlock),
        ("form", self.endTagForm),
        ("p", self.endTagP),
        (("dd", "dt", "li"), self.endTagListItem),
        (headingElements, self.endTagHeading),
        (formattingEndTags, self.endTagFormatting),
        (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
        ("br", self.endTagBr),
    ]
    self.endTagHandler = _utils.MethodDispatcher(endTagTable)
    self.endTagHandler.default = self.endTagOther
1003 | |
def isMatchingFormattingElement(self, node1, node2):
    """Return whether both nodes have equal name, namespace and attributes."""
    if not node1.name == node2.name:
        return False
    if not node1.namespace == node2.namespace:
        return False
    return node1.attributes == node2.attributes
1008 | |
1009 # helper | |
def addFormattingElement(self, token):
    """Insert the element and record it as an active formatting element.

    Entries after the last ``Marker`` that match the new element (see
    :meth:`isMatchingFormattingElement`) are collected newest-first; when
    there are three of them the oldest is removed before the new element
    is appended.
    """
    tree = self.tree
    tree.insertElement(token)
    inserted = tree.openElements[-1]

    duplicates = []
    for candidate in reversed(tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, inserted):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        # The last one collected is the earliest entry in the list.
        tree.activeFormattingElements.remove(duplicates[-1])
    tree.activeFormattingElements.append(inserted)
1025 | |
1026 # the real deal | |
def processEOF(self):
    """Report an error at EOF if any open element still needs a closing tag.

    The open-element stack is scanned from the current node upwards and a
    single ``expected-closing-tag-but-got-eof`` error is raised for the
    first element whose name is not in the allowed set.  Nothing is
    returned, which stops parsing.
    """
    mayStayOpen = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                             "tfoot", "th", "thead", "tr", "body",
                             "html"))
    unclosed = any(node.name not in mayStayOpen
                   for node in self.tree.openElements[::-1])
    if unclosed:
        self.parser.parseError("expected-closing-tag-but-got-eof")
1036 | |
def processSpaceCharactersDropNewline(self, token):
    """Insert whitespace, dropping one leading newline where required.

    Used at the start of ``pre``, ``listing`` and ``textarea`` blocks.  The
    whitespace handler is reset to the non-pre variant straight away; the
    newline is dropped only when the current node is one of those
    elements and has no content yet.
    """
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if (current.name in ("pre", "listing", "textarea") and
                not self.tree.openElements[-1].hasContent()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)
1049 | |
def processCharacters(self, token):
    """Insert character data into the tree.

    A token whose data is exactly U+0000 is dropped.  Otherwise the active
    formatting elements are reconstructed, the text is inserted, and
    ``parser.framesetOK`` is cleared once the text contains any character
    outside ``spaceCharacters``.

    :arg token: character token; its ``"data"`` entry holds the text

    :returns: ``None``
    """
    data = token["data"]
    if data == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(data)
    # Scan the text only while framesetOK can still change.  The generator
    # lets any() stop at the first non-space character instead of building
    # a complete list of booleans first.
    if (self.parser.framesetOK and
            any(char not in spaceCharacters for char in data)):
        self.parser.framesetOK = False
1061 | |
def processSpaceCharactersNonPre(self, token):
    """Reconstruct the active formatting elements, then insert the whitespace."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])
1065 | |
def startTagProcessInHead(self, token):
    """Forward the start tag to the "inHead" phase and return its result."""
    inHead = self.parser.phases["inHead"]
    return inHead.processStartTag(token)
1068 | |
def startTagBody(self, token):
    """Merge the attributes of a stray ``body`` start tag into the body element.

    Always reports ``unexpected-start-tag``.  When the second open element
    is ``body``, ``framesetOK`` is cleared and attributes it does not have
    yet are copied from the token; otherwise the parser is asserted to be
    in innerHTML mode and nothing else happens.
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    self.parser.framesetOK = False
    for attrName, attrValue in token["data"].items():
        bodyNode = self.tree.openElements[1]
        if attrName not in bodyNode.attributes:
            bodyNode.attributes[attrName] = attrValue
1079 | |
def startTagFrameset(self, token):
    """Replace the body with a ``frameset`` when that is still allowed.

    Always reports ``unexpected-start-tag``.  If the second open element
    is ``body`` and ``framesetOK`` is set, the body is detached from its
    parent, the stack is popped back to ``html``, the frameset is
    inserted and the parser switches to the "inFrameset" phase.
    """
    self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    if not self.parser.framesetOK:
        return
    bodyNode = self.tree.openElements[1]
    if bodyNode.parent:
        bodyNode.parent.removeChild(bodyNode)
    while self.tree.openElements[-1].name != "html":
        self.tree.openElements.pop()
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inFrameset"]
1093 | |
def startTagCloseP(self, token):
    """Close a ``p`` that is open in button scope, then insert the element."""
    tree = self.tree
    pIsOpen = tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
1098 | |
def startTagPreListing(self, token):
    """Open a ``pre``/``listing`` element.

    Closes a ``p`` open in button scope, inserts the element, clears
    ``framesetOK`` and arms the newline-dropping whitespace handler.
    """
    tree = self.tree
    pIsOpen = tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    self.parser.framesetOK = False
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1105 | |
def startTagForm(self, token):
    """Open a ``form`` element unless a form pointer is already set.

    With a form pointer set the tag is only reported as unexpected.
    Otherwise a ``p`` open in button scope is closed, the form is inserted
    and the form pointer is set to the new current node.
    """
    tree = self.tree
    if tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.formPointer = tree.openElements[-1]
1114 | |
def startTagListItem(self, token):
    """Open an ``li``/``dd``/``dt`` element, implicitly closing an open sibling.

    The open-element stack is walked from the current node upwards.  The
    first node whose name is a stop name for this tag gets an implied end
    tag; the walk also stops at any special element other than
    ``address``, ``div`` and ``p``.  A ``p`` open in button scope is then
    closed and the new element inserted.
    """
    self.parser.framesetOK = False

    siblingsFor = {"li": ["li"],
                   "dt": ["dt", "dd"],
                   "dd": ["dt", "dd"]}
    closable = siblingsFor[token["name"]]
    for openNode in reversed(self.tree.openElements):
        if openNode.name in closable:
            impliedEnd = impliedTagToken(openNode.name, "EndTag")
            self.parser.phase.processEndTag(impliedEnd)
            break
        isBarrier = (openNode.nameTuple in specialElements and
                     openNode.name not in ("address", "div", "p"))
        if isBarrier:
            break

    if self.tree.elementInScope("p", variant="button"):
        impliedEnd = impliedTagToken("p", "EndTag")
        self.parser.phase.processEndTag(impliedEnd)

    self.tree.insertElement(token)
1136 | |
def startTagPlaintext(self, token):
    """Open ``plaintext`` and put the tokenizer into its plaintext state."""
    tree = self.tree
    pIsOpen = tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState
1142 | |
def startTagHeading(self, token):
    """Open a heading element.

    Closes a ``p`` open in button scope.  If the current node is itself a
    heading, that is reported as ``unexpected-start-tag`` and the node is
    popped before the new heading is inserted.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    currentIsHeading = tree.openElements[-1].name in headingElements
    if currentIsHeading:
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        tree.openElements.pop()
    tree.insertElement(token)
1150 | |
def startTagA(self, token):
    """Open an ``a`` element, first closing any ``a`` still active.

    If an ``a`` is found among the active formatting elements, an error is
    reported, an implied ``</a>`` is run through :meth:`endTagFormatting`,
    and whatever is left of the old element is removed from both the
    open-element stack and the active formatting list.
    """
    tree = self.tree
    previousA = tree.elementInActiveFormattingElements("a")
    if previousA:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        for container in (tree.openElements, tree.activeFormattingElements):
            if previousA in container:
                container.remove(previousA)
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1163 | |
def startTagFormatting(self, token):
    """Reconstruct the active formatting elements, then add this element to them."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1167 | |
def startTagNobr(self, token):
    """Open a ``nobr`` element, closing one that is already in scope.

    When a ``nobr`` is in scope an error is reported, an implied end tag
    is processed and the active formatting elements are reconstructed a
    second time before the new element is added.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    alreadyOpen = tree.elementInScope("nobr")
    if alreadyOpen:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1177 | |
def startTagButton(self, token):
    """Open a ``button`` element, or close an open one and hand the token back.

    :returns: the token when a ``button`` was already in scope (after
        reporting the error and processing an implied end tag), otherwise
        ``None`` once the element has been inserted
    """
    tree = self.tree
    if not tree.elementInScope("button"):
        tree.reconstructActiveFormattingElements()
        tree.insertElement(token)
        self.parser.framesetOK = False
        return None
    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token
1188 | |
def startTagAppletMarqueeObject(self, token):
    """Insert the element, push a ``Marker`` on the active formatting list and clear ``framesetOK``."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False
1194 | |
def startTagXmp(self, token):
    """Open an ``xmp`` element and parse its content as RAWTEXT.

    Closes a ``p`` open in button scope, reconstructs the active
    formatting elements and clears ``framesetOK`` first.
    """
    tree = self.tree
    pIsOpen = tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")
1201 | |
def startTagTable(self, token):
    """Open a ``table`` element and switch to the "inTable" phase.

    Outside quirks mode, a ``p`` open in button scope is closed first.
    """
    parser = self.parser
    mustCloseP = (parser.compatMode != "quirks" and
                  self.tree.elementInScope("p", variant="button"))
    if mustCloseP:
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    parser.framesetOK = False
    parser.phase = parser.phases["inTable"]
1209 | |
def startTagVoidFormatting(self, token):
    """Insert a void element and pop it again immediately.

    The token's self-closing flag is acknowledged and ``framesetOK`` is
    cleared.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1216 | |
def startTagInput(self, token):
    """Insert an ``input`` element like any void formatting element.

    An input whose ``type`` is ``hidden`` (ASCII case-insensitively) does
    not change ``framesetOK``, so the previous value is restored for it.
    """
    previousFramesetOK = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attrs = token["data"]
    isHidden = ("type" in attrs and
                attrs["type"].translate(asciiUpper2Lower) == "hidden")
    if isHidden:
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = previousFramesetOK
1224 | |
def startTagParamSource(self, token):
    """Insert the element, pop it again and acknowledge the self-closing flag."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1229 | |
def startTagHr(self, token):
    """Insert an ``hr`` element and pop it again immediately.

    Closes a ``p`` open in button scope first; acknowledges the
    self-closing flag and clears ``framesetOK``.
    """
    tree = self.tree
    pIsOpen = tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1237 | |
def startTagImage(self, token):
    """Treat an ``image`` start tag as ``img``.

    Reports the substitution, then processes an implied ``img`` start tag
    carrying the original attributes and self-closing flag.
    """
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    imgToken = impliedTagToken("img", "StartTag",
                               attributes=token["data"],
                               selfClosing=token["selfClosing"])
    self.processStartTag(imgToken)
1245 | |
def startTagIsIndex(self, token):
    """Expand the deprecated ``isindex`` tag into a small form.

    Always reports ``deprecated-tag``.  Nothing more happens when a form
    pointer is set.  Otherwise implied tokens build
    ``form > hr, label > (prompt text, input), hr``.  The form takes the
    ``action`` attribute; the input takes every other attribute except
    ``prompt``, plus ``name="isindex"``.
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return

    sourceAttrs = token["data"]
    formAttrs = {}
    if "action" in sourceAttrs:
        formAttrs["action"] = sourceAttrs["action"]
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=formAttrs))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    prompt = (sourceAttrs["prompt"] if "prompt" in sourceAttrs
              else "This is a searchable index. Enter search keywords: ")
    self.processCharacters(
        {"type": tokenTypes["Characters"], "data": prompt})

    inputAttrs = sourceAttrs.copy()
    for consumed in ("action", "prompt"):
        if consumed in inputAttrs:
            del inputAttrs[consumed]
    inputAttrs["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=inputAttrs,
                                         selfClosing=token["selfClosing"]))
    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))
1276 | |
def startTagTextarea(self, token):
    """Open a ``textarea`` element.

    Switches the tokenizer to its RCDATA state, arms the newline-dropping
    whitespace handler and clears ``framesetOK``.
    """
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    self.parser.framesetOK = False
1282 | |
def startTagIFrame(self, token):
    """Clear ``framesetOK`` and handle the element as raw text."""
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)
1286 | |
def startTagNoscript(self, token):
    """Handle ``noscript`` as raw text when scripting is on, else as any other tag."""
    handler = self.startTagRawtext if self.parser.scripting else self.startTagOther
    handler(token)
1292 | |
def startTagRawtext(self, token):
    """Parse the element's content as RAWTEXT.

    Used for iframe, noembed, noframes, and noscript (if scripting is
    enabled).
    """
    parser = self.parser
    parser.parseRCDataRawtext(token, "RAWTEXT")
1296 | |
def startTagOpt(self, token):
    """Open an ``option``/``optgroup``, closing an ``option`` that is the current node."""
    currentNode = self.tree.openElements[-1]
    if currentNode.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)
1302 | |
def startTagSelect(self, token):
    """Open a ``select`` element and pick the matching select phase.

    The parser moves to "inSelectInTable" when its current phase is one of
    the table-related phases, and to "inSelect" otherwise.
    """
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)
    parser = self.parser
    parser.framesetOK = False
    phases = parser.phases
    tableRelated = (phases["inTable"],
                    phases["inCaption"],
                    phases["inColumnGroup"],
                    phases["inTableBody"],
                    phases["inRow"],
                    phases["inCell"])
    nextPhase = "inSelectInTable" if parser.phase in tableRelated else "inSelect"
    parser.phase = phases[nextPhase]
1316 | |
def startTagRpRt(self, token):
    """Open an ``rp``/``rt`` element.

    When a ``ruby`` is in scope, implied end tags are generated and a
    parse error (without an error code) is reported if the current node
    is then not ``ruby``.
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        currentNode = tree.openElements[-1]
        if currentNode.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)
1323 | |
def startTagMath(self, token):
    """Insert a ``math`` element in the MathML namespace.

    Attributes are adjusted for MathML and for foreign content first.  A
    self-closing tag is popped again straight away and its flag
    acknowledged.
    """
    tree = self.tree
    parser = self.parser
    tree.reconstructActiveFormattingElements()
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1335 | |
def startTagSvg(self, token):
    """Insert an ``svg`` element in the SVG namespace.

    Attributes are adjusted for SVG and for foreign content first.  A
    self-closing tag is popped again straight away and its flag
    acknowledged.
    """
    tree = self.tree
    parser = self.parser
    tree.reconstructActiveFormattingElements()
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1347 | |
def startTagMisplaced(self, token):
    """Ignore a start tag that belongs to a different insertion mode.

    The dispatch table of this phase routes these tags here: "caption",
    "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th",
    "thead" and "tr".  The token is dropped; only an
    ``unexpected-start-tag-ignored`` parse error is reported.

    :arg token: the start tag token; only its ``"name"`` is used

    :returns: ``None``
    """
    self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1356 | |
def startTagOther(self, token):
    """Reconstruct the active formatting elements, then insert the element."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
1360 | |
def endTagP(self, token):
    """Close the nearest ``p`` element in button scope.

    When no ``p`` is in scope, an implied ``<p>`` is opened, the error is
    reported, and this handler is run again with an implied ``</p>``.
    Otherwise implied end tags are generated (except for ``p``), a
    mismatch with the current node is reported, and the stack is popped
    up to and including the ``p``.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        tree.generateImpliedEndTags("p")
        if tree.openElements[-1].name != "p":
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
        while tree.openElements.pop().name != "p":
            pass
        return
    self.startTagCloseP(impliedTagToken("p", "StartTag"))
    self.parser.parseError("unexpected-end-tag", {"name": "p"})
    self.endTagP(impliedTagToken("p", "EndTag"))
1373 | |
def endTagBody(self, token):
    """Handle ``</body>`` by switching to the "afterBody" phase.

    If no ``body`` is in scope, a parse error (without an error code) is
    reported and the token is ignored.  Otherwise, when the current node
    is not ``body``, the open elements from index 2 onwards are checked
    and one ``expected-one-end-tag-but-got-another`` error is reported for
    the first element that may not be left open.

    :arg token: the end tag token (not inspected)

    :returns: ``None``
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return
    if self.tree.openElements[-1].name != "body":
        # Built once per call rather than once per loop iteration.
        mayStayOpen = frozenset(("dd", "dt", "li", "optgroup",
                                 "option", "p", "rp", "rt",
                                 "tbody", "td", "tfoot",
                                 "th", "thead", "tr", "body",
                                 "html"))
        for node in self.tree.openElements[2:]:
            if node.name not in mayStayOpen:
                # Not sure this is the correct name for the parse error
                self.parser.parseError(
                    "expected-one-end-tag-but-got-another",
                    {"gotName": "body", "expectedName": node.name})
                break
    self.parser.phase = self.parser.phases["afterBody"]
1391 | |
def endTagHtml(self, token):
    """Handle ``</html>`` by first closing the body.

    The in-scope test that :meth:`endTagBody` performs is repeated here:
    without a ``body`` in scope the token is ignored and ``None`` is
    returned; otherwise an implied ``</body>`` is handled and the token
    is returned.
    """
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token
1397 | |
def endTagBlock(self, token):
    """Close a block-level element.

    A ``pre`` end tag restores the normal whitespace handler.  If the
    element is in scope, implied end tags are generated first and the
    stack is popped up to and including it afterwards.  A current node
    with a different name is reported as ``end-tag-too-early`` either way.
    """
    tagName = token["name"]
    # Put us back in the right whitespace handling mode
    if tagName == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    tree = self.tree
    isOpen = tree.elementInScope(tagName)
    if isOpen:
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tagName:
        self.parser.parseError("end-tag-too-early", {"name": tagName})
    if isOpen:
        while tree.openElements.pop().name != tagName:
            pass
1411 | |
def endTagForm(self, token):
    """Close the form that the form pointer refers to.

    The pointer is always cleared.  If it was unset, or the form is not in
    scope, the tag is reported as unexpected.  Otherwise implied end tags
    are generated, a current node other than the form is reported, and
    the form is removed from the open-element stack.
    """
    tree = self.tree
    formNode = tree.formPointer
    tree.formPointer = None
    if formNode is not None and tree.elementInScope(formNode):
        tree.generateImpliedEndTags()
        if tree.openElements[-1] != formNode:
            self.parser.parseError("end-tag-too-early-ignored",
                                   {"name": "form"})
        tree.openElements.remove(formNode)
    else:
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
1424 | |
def endTagListItem(self, token):
    """Close an ``li``/``dd``/``dt`` element.

    ``li`` is looked up in list scope, the others in the default scope.
    An element that is not in scope is reported as unexpected.  Otherwise
    implied end tags are generated (except for this tag), a mismatch with
    the current node is reported, and the stack is popped up to and
    including the element.
    """
    tagName = token["name"]
    scopeVariant = "list" if tagName == "li" else None
    tree = self.tree
    if not tree.elementInScope(tagName, variant=scopeVariant):
        self.parser.parseError("unexpected-end-tag", {"name": tagName})
        return
    tree.generateImpliedEndTags(exclude=tagName)
    if tree.openElements[-1].name != tagName:
        self.parser.parseError(
            "end-tag-too-early",
            {"name": tagName})
    while tree.openElements.pop().name != tagName:
        pass
1441 | |
def endTagHeading(self, token):
    """Close the nearest open heading, whatever its level.

    If any heading element is in scope, implied end tags are generated.
    A current node whose name differs from the token is reported as
    ``end-tag-too-early``.  If a heading is (still) in scope, the stack is
    popped up to and including the first heading element found.
    """
    tree = self.tree
    if any(tree.elementInScope(name) for name in headingElements):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if any(tree.elementInScope(name) for name in headingElements):
        while tree.openElements.pop().name not in headingElements:
            pass
1456 | |
def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element that may be mis-nested
    with block-level ("special") elements.  The outer loop runs at most 8
    times and the inner loop at most 3 times per outer iteration.

    :arg token: the end tag token; ``token["name"]`` selects the
        formatting element to close

    :returns: ``None``
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
                (formattingElement in self.tree.openElements and
                 not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first condition above already catches an
        # element that is open but not in scope, so this branch looks
        # unreachable as written -- confirm before relying on it.
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        # The slice starts at the formatting element itself.
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The common ancestor is the element just above the formatting
        # element in the stack of open elements.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            if node not in self.tree.activeFormattingElements:
                # Removing the node shifts later entries down by one, and
                # the decrement at the top of the loop then moves on to
                # the entry before it.
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        furthestBlock.reparentChildren(clone)

        # Step 13
        furthestBlock.appendChild(clone)

        # Step 14
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)
1619 | |
def endTagAppletMarqueeObject(self, token):
    """Close an ``applet``/``marquee``/``object`` element.

    If the element is in scope, implied end tags are generated.  A current
    node with a different name is reported as ``end-tag-too-early``.  If
    the element is (still) in scope, the stack is popped up to and
    including it and the active formatting elements are cleared.
    """
    tagName = token["name"]
    tree = self.tree
    if tree.elementInScope(tagName):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tagName:
        self.parser.parseError("end-tag-too-early", {"name": tagName})

    if tree.elementInScope(tagName):
        while tree.openElements.pop().name != tagName:
            pass
        tree.clearActiveFormattingElements()
1631 | |
def endTagBr(self, token):
    """Treat a ``</br>`` end tag as if it were a ``<br>`` element.

    Reports a parse error, reconstructs the active formatting elements,
    inserts an implied ``br`` start tag and immediately pops it off the
    stack of open elements again.

    :arg token: the end tag token (not inspected)
    """
    errorDetails = {"originalName": "br", "newName": "br element"}
    self.parser.parseError("unexpected-end-tag-treated-as", errorDetails)

    tree = self.tree
    tree.reconstructActiveFormattingElements()
    impliedBr = impliedTagToken("br", "StartTag")
    tree.insertElement(impliedBr)
    # The inserted element is not left open.
    tree.openElements.pop()
1638 | |
def endTagOther(self, token):
    """Handle an end tag that has no dedicated handler.

    Walks a snapshot of the open elements from the current node outwards.
    On the first element whose name matches the token, implied end tags
    (other than the token's own name) are generated, a parse error is
    reported if the current node is then not the named element, and the
    stack is popped up to and including the matched node.  If an element
    listed in ``specialElements`` is met first, a parse error is reported
    and the tag is ignored.

    :arg token: the end tag token; only ``token["name"]`` is read
    """
    tagName = token["name"]
    tree = self.tree

    # Iterate over a reversed copy: the live stack is mutated below.
    for candidate in list(reversed(tree.openElements)):
        if candidate.name != tagName:
            if candidate.nameTuple in specialElements:
                self.parser.parseError("unexpected-end-tag", {"name": tagName})
                return
            continue

        tree.generateImpliedEndTags(exclude=tagName)
        if tree.openElements[-1].name != tagName:
            self.parser.parseError("unexpected-end-tag", {"name": tagName})
        popped = tree.openElements.pop()
        while popped != candidate:
            popped = tree.openElements.pop()
        return
1652 | |
class TextPhase(Phase):
    """Phase used while collecting the text content of an element.

    Characters are inserted as text; an end tag (or EOF) pops the current
    node and switches the parser back to ``parser.originalPhase``.  The
    assertion message in ``startTagOther`` refers to this as RCDATA/RAWTEXT
    mode.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # No start tag has a dedicated handler in this phase.
        startTags = _utils.MethodDispatcher([])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([("script", self.endTagScript)])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def processCharacters(self, token):
        """Insert the token's character data as text."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed current node, pop it and restore the phase.

        :returns: ``True`` (presumably asking the caller to reprocess EOF
            in the restored phase -- confirm against the main loop)
        """
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase; always asserts."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the current node (asserted to be ``script``) and restore the
        original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and restore the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
1685 | |
class InTablePhase(Phase):
    """Token handlers for the "in table" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-table
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("caption", self.startTagCaption),
            ("colgroup", self.startTagColgroup),
            ("col", self.startTagCol),
            (("tbody", "tfoot", "thead"), self.startTagRowGroup),
            (("td", "th", "tr"), self.startTagImplyTbody),
            ("table", self.startTagTable),
            (("style", "script"), self.startTagStyleScript),
            ("input", self.startTagInput),
            ("form", self.startTagForm)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), self.endTagIgnore)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is ``table`` or ``html``.

        ("clear the stack back to a table context")
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("table", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            stack.pop()
        # When the current node is <html> it's an innerHTML case

    def _enterTableText(self):
        """Make "inTableText" the current phase, store the phase it replaces
        on it as ``originalPhase``, and return it."""
        parser = self.parser
        resumePhase = parser.phase
        textPhase = parser.phases["inTableText"]
        parser.phase = textPhase
        textPhase.originalPhase = resumePhase
        return textPhase

    # processing methods
    def processEOF(self):
        """Report EOF inside a table unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-table")
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Switch to the "inTableText" phase and let it handle the token."""
        self._enterTableText().processSpaceCharacters(token)

    def processCharacters(self, token):
        """Switch to the "inTableText" phase and let it handle the token."""
        self._enterTableText().processCharacters(token)

    def insertText(self, token):
        """Process a character token with the "inBody" rules while
        ``tree.insertFromTable`` is set."""
        # If we get here there must be at least one non-whitespace character
        # Do the table magic!
        tree = self.tree
        tree.insertFromTable = True
        self.parser.phases["inBody"].processCharacters(token)
        tree.insertFromTable = False

    def startTagCaption(self, token):
        """Open a caption: clear to table context, push a formatting marker,
        insert the element and switch to "inCaption"."""
        self.clearStackToTableContext()
        tree = self.tree
        tree.activeFormattingElements.append(Marker)
        tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert a colgroup and switch to "inColumnGroup"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Open an implied colgroup, then return the token unchanged."""
        impliedColgroup = impliedTagToken("colgroup", "StartTag")
        self.startTagColgroup(impliedColgroup)
        return token

    def startTagRowGroup(self, token):
        """Insert a tbody/tfoot/thead and switch to "inTableBody"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Open an implied tbody, then return the token unchanged."""
        impliedTbody = impliedTagToken("tbody", "StartTag")
        self.startTagRowGroup(impliedTbody)
        return token

    def startTagTable(self, token):
        """A nested ``<table>`` start tag: report it and feed an implied
        ``</table>`` to the parser's current phase.

        :returns: the token, unless ``parser.innerHTML`` is truthy
        """
        parser = self.parser
        parser.parseError("unexpected-start-tag-implies-end-tag",
                          {"startName": "table", "endName": "table"})
        parser.phase.processEndTag(impliedTagToken("table"))
        if parser.innerHTML:
            return None
        return token

    def startTagStyleScript(self, token):
        """Delegate style/script start tags to the "inHead" phase."""
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagInput(self, token):
        """Insert ``<input type=hidden>`` in place (with a parse error);
        any other input goes through ``startTagOther``."""
        attributes = token["data"]
        isHidden = ("type" in attributes and
                    attributes["type"].translate(asciiUpper2Lower) == "hidden")
        if not isHidden:
            self.startTagOther(token)
            return

        self.parser.parseError("unexpected-hidden-input-in-table")
        self.tree.insertElement(token)
        # XXX associate with form
        self.tree.openElements.pop()

    def startTagForm(self, token):
        """Report a form in a table; if no form pointer is set, insert the
        form, record it as the form pointer and pop it again."""
        self.parser.parseError("unexpected-form-in-table")
        tree = self.tree
        if tree.formPointer is not None:
            return
        tree.insertElement(token)
        tree.formPointer = tree.openElements[-1]
        tree.openElements.pop()

    def startTagOther(self, token):
        """Report the tag, then process it with the "inBody" rules while
        ``tree.insertFromTable`` is set."""
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        tree = self.tree
        tree.insertFromTable = True
        self.parser.phases["inBody"].processStartTag(token)
        tree.insertFromTable = False

    def endTagTable(self, token):
        """Close the table: pop up to and including ``table`` and reset the
        insertion mode.  Without a table in table scope the tag is reported
        and ignored."""
        tree = self.tree
        if not tree.elementInScope("table", variant="table"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "table":
            self.parser.parseError("end-tag-too-early-named",
                                   {"gotName": "table",
                                    "expectedName": current.name})
        # Pop until the table element itself has been removed.
        while tree.openElements.pop().name != "table":
            pass
        self.parser.resetInsertionMode()

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Report the tag, then process it with the "inBody" rules while
        ``tree.insertFromTable`` is set."""
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        tree = self.tree
        tree.insertFromTable = True
        self.parser.phases["inBody"].processEndTag(token)
        tree.insertFromTable = False
1830 | |
class InTableTextPhase(Phase):
    """Phase that buffers character tokens and flushes them when any other
    kind of token (or EOF) arrives.

    ``originalPhase`` is the phase to switch back to; it is assigned by
    whoever activates this phase.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.characterTokens = []
        self.originalPhase = None

    def flushCharacters(self):
        """Emit the buffered characters and empty the buffer.

        Text containing any non-space character is sent to the "inTable"
        phase's ``insertText``; non-empty pure whitespace is inserted into
        the tree directly.
        """
        text = "".join(buffered["data"] for buffered in self.characterTokens)
        if any(char not in spaceCharacters for char in text):
            combined = {"type": tokenTypes["Characters"], "data": text}
            self.parser.phases["inTable"].insertText(combined)
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def _leave(self):
        """Flush the buffer and hand control back to ``originalPhase``."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def processComment(self, token):
        """Flush, restore the original phase and return the token."""
        self._leave()
        return token

    def processEOF(self):
        """Flush, restore the original phase and return ``True``."""
        self._leave()
        return True

    def processCharacters(self, token):
        """Buffer the token, dropping a lone U+0000 character."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Flush, restore the original phase and return the token."""
        self._leave()
        return token

    def processEndTag(self, token):
        """Flush, restore the original phase and return the token."""
        self._leave()
        return token
1875 | |
class InCaptionPhase(Phase):
    """Token handlers for the "in caption" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableElement)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            ("caption", self.endTagCaption),
            ("table", self.endTagTable),
            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.endTagIgnore)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def ignoreEndTagCaption(self):
        """Return True when no ``caption`` is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def _impliedCaptionEnd(self):
        """Feed an implied ``</caption>`` to the parser's current phase.

        :returns: True when the end tag was not going to be ignored
        """
        # XXX Have to duplicate logic here to find out if the tag is ignored
        wasIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return not wasIgnored

    def processEOF(self):
        """Delegate EOF to the "inBody" phase (its result is discarded)."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """A table-structure start tag inside a caption: report it, close
        the caption implicitly and return the token unless that implied end
        tag was ignored."""
        self.parser.parseError()
        if self._impliedCaptionEnd():
            return token

    def startTagOther(self, token):
        """Delegate other start tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption: pop up to and including ``caption``, clear the
        active formatting elements and switch to "inTable"."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": current.name})
        # Pop until the caption element itself has been removed.
        while tree.openElements.pop().name != "caption":
            pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """``</table>`` inside a caption: report it, close the caption
        implicitly and return the token unless that end tag was ignored."""
        self.parser.parseError()
        if self._impliedCaptionEnd():
            return token

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)
1946 | |
class InColumnGroupPhase(Phase):
    """Token handlers for the "in column group" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-column
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("col", self.startTagCol)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            ("colgroup", self.endTagColgroup),
            ("col", self.endTagCol)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def ignoreEndTagColgroup(self):
        """Return True when the current node is ``html``."""
        currentName = self.tree.openElements[-1].name
        return currentName == "html"

    def _impliedColgroupEnd(self):
        """Act as if ``</colgroup>`` had been seen.

        :returns: True when that end tag was not going to be ignored
        """
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return not wasIgnored

    def processEOF(self):
        """At EOF, close the colgroup implicitly and return ``True``; when
        the current node is ``html`` nothing is done and ``None`` is
        returned."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        if self._impliedColgroupEnd():
            return True

    def processCharacters(self, token):
        """Close the colgroup implicitly; return the token unless the
        implied end tag was ignored."""
        if self._impliedColgroupEnd():
            return token

    def startTagCol(self, token):
        """Insert a ``col`` element, pop it straight away and mark the
        token's self-closing flag as acknowledged."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Close the colgroup implicitly; return the token unless the
        implied end tag was ignored."""
        if self._impliedColgroupEnd():
            return token

    def endTagColgroup(self, token):
        """Pop the colgroup and switch to "inTable", or report a parse error
        when the end tag must be ignored."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        """``</col>`` is always a parse error."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Close the colgroup implicitly; return the token unless the
        implied end tag was ignored."""
        if self._impliedColgroupEnd():
            return token
2012 | |
class InTableBodyPhase(Phase):
    """Token handlers for the "in table body" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             self.startTagTableOther)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until the current node is a row group
        (``tbody``/``tfoot``/``thead``) or ``html``."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.innerHTML

    def _rowGroupInTableScope(self):
        """Return True when a tbody, thead or tfoot is in table scope."""
        return any(self.tree.elementInScope(groupName, variant="table")
                   for groupName in ("tbody", "thead", "tfoot"))

    def _closeRowGroupThenReprocess(self, token):
        """Close the open row group and return ``token``; with no row group
        in table scope, report a parse error and return ``None``."""
        # XXX AT Any ideas on how to share this with endTagTable?
        if not self._rowGroupInTableScope():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None

        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    # the rest
    def processEOF(self):
        """Delegate EOF to the "inTable" phase (its result is discarded)."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate characters to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        """Insert a row and switch to "inRow"."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """A cell directly in a row group: report it, open an implied
        ``tr`` and return the token unchanged."""
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        impliedRow = impliedTagToken("tr", "StartTag")
        self.startTagTr(impliedRow)
        return token

    def startTagTableOther(self, token):
        """Close the current row group, then return the token."""
        return self._closeRowGroupThenReprocess(token)

    def startTagOther(self, token):
        """Delegate other start tags to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group and switch to "inTable"; report a parse
        error when it is not in table scope."""
        groupName = token["name"]
        if not self.tree.elementInScope(groupName, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": groupName})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the current row group, then return the token."""
        return self._closeRowGroupThenReprocess(token)

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag-in-table-body",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)
2110 | |
class InRowPhase(Phase):
    """Token handlers for the "in row" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-row
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
             self.endTagIgnore)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until the current node is ``tr`` or ``html``,
        reporting a parse error for each element removed."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": stack[-1].name})
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no ``tr`` is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    def _impliedTrEnd(self):
        """Act as if ``</tr>`` had been seen.

        :returns: True when that end tag was not going to be ignored
        """
        wasIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        return not wasIgnored

    # the rest
    def processEOF(self):
        """Delegate EOF to the "inTable" phase (its result is discarded)."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate characters to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Insert a cell, switch to "inCell" and push a formatting marker."""
        self.clearStackToTableRowContext()
        tree = self.tree
        tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Close the row implicitly; return the token unless the implied
        ``</tr>`` was ignored."""
        # XXX how are we sure it's always ignored in the innerHTML case?
        if self._impliedTrEnd():
            return token

    def startTagOther(self, token):
        """Delegate other start tags to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTr(self, token):
        """Close the row and switch to "inTableBody"; report a parse error
        when no row is in table scope."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Close the row implicitly; return the token unless the implied
        ``</tr>`` was ignored."""
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if self._impliedTrEnd():
            return token

    def endTagTableRowGroup(self, token):
        """If the named row group is in table scope, close the row and
        return the token; otherwise report a parse error."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag-in-table-row",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate other end tags to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)
2199 | |
class InCellPhase(Phase):
    """Token handlers for the "in cell" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    # helper
    def closeCell(self):
        """Close whichever cell is in table scope, preferring ``td`` over
        ``th``; do nothing when neither is."""
        for cellName in ("td", "th"):
            if self.tree.elementInScope(cellName, variant="table"):
                self.endTagTableCell(impliedTagToken(cellName))
                break

    # the rest
    def processEOF(self):
        """Delegate EOF to the "inBody" phase (its result is discarded)."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate characters to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """A table-structure start tag inside a cell: close the cell and
        return the token; with no cell in table scope, report a parse
        error."""
        tree = self.tree
        cellOpen = (tree.elementInScope("td", variant="table") or
                    tree.elementInScope("th", variant="table"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate other start tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell: pop up to and including it, clear the
        active formatting elements and switch to "inRow".  A cell that is
        not in table scope is reported and ignored."""
        cellName = token["name"]
        tree = self.tree
        if not tree.elementInScope(cellName, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": cellName})
            return

        tree.generateImpliedEndTags(cellName)
        if tree.openElements[-1].name != cellName:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cellName})
        # Pop until the cell itself has been removed (a single pop when the
        # cell is already the current node).
        while tree.openElements.pop().name != cellName:
            pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and otherwise ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagImply(self, token):
        """An end tag for an enclosing table structure: if that element is
        in table scope, close the cell and return the token; otherwise
        report a parse error."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate other end tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)
2275 | |
class InSelectPhase(Phase):
    """Token handlers for the "in select" insertion mode.

    http://www.whatwg.org/specs/web-apps/current-work/#in-select
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect),
            (("input", "keygen", "textarea"), self.startTagInput),
            ("script", self.startTagScript)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def _popCurrentIfNamed(self, name):
        """Pop the current node when it is called ``name``.

        :returns: True when a node was popped, False otherwise
        """
        stack = self.tree.openElements
        if stack[-1].name != name:
            return False
        stack.pop()
        return True

    def processEOF(self):
        """Report EOF inside a select unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert character data as text, dropping a lone U+0000."""
        text = token["data"]
        if text != "\u0000":
            self.tree.insertText(text)

    def startTagOption(self, token):
        """Insert an option, first closing an option that is the current
        node."""
        # We need to imply </option> if <option> is the current node.
        self._popCurrentIfNamed("option")
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an optgroup, first closing a current option and then a
        current optgroup."""
        self._popCurrentIfNamed("option")
        self._popCurrentIfNamed("optgroup")
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """A nested ``<select>`` is reported and treated as ``</select>``."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """input/keygen/textarea inside a select: report it, close the
        select if it is in select scope and return the token."""
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate script start tags to the "inHead" phase."""
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        """Any other start tag is reported and ignored."""
        self.parser.parseError("unexpected-start-tag-in-select",
                               {"name": token["name"]})

    def endTagOption(self, token):
        """Close the current option, or report a parse error when the
        current node is not an option."""
        if not self._popCurrentIfNamed("option"):
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})

    def endTagOptgroup(self, token):
        """Close the current optgroup (and an option directly inside it), or
        report a parse error when the current node is not an optgroup."""
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>, but nothing else
        if not self._popCurrentIfNamed("optgroup"):
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop up to and including ``select`` and reset the insertion mode;
        report a parse error when no select is in select scope."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        stack = self.tree.openElements
        while stack.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Any other end tag is reported and ignored."""
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": token["name"]})
2375 | |
class InSelectInTablePhase(Phase):
    """Variant of the "inSelect" phase with extra handling for
    table-structure tags; everything else is delegated to "inSelect"."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = _utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.startTagTable)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = _utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.endTagTable)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def processEOF(self):
        """Delegate EOF to the "inSelect" phase (its result is discarded)."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate characters to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """A table-structure start tag: report it, process an implied
        ``</select>`` and return the token."""
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
        impliedSelectEnd = impliedTagToken("select")
        self.endTagOther(impliedSelectEnd)
        return token

    def startTagOther(self, token):
        """Delegate other start tags to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """A table-structure end tag: always reported; when the named
        element is in table scope, process an implied ``</select>`` and
        return the token."""
        tagName = token["name"]
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": tagName})
        if not self.tree.elementInScope(tagName, variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate other end tags to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)
2414 | |
class InForeignContentPhase(Phase):
    """Token handlers for the "inForeignContent" phase.

    The visible logic special-cases elements whose namespace is MathML or
    SVG and breaks back out to the regular phases when one of the
    ``breakoutElements`` start tags is seen.
    """

    # Start tag names that end foreign-content handling (see processStartTag).
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-cased SVG tag names mapped to their mixed-case spelling. Kept at
    # class level so the table is built once rather than on every call to
    # adjustSVGTagNames. Treated as read-only.
    svgTagNameReplacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

    def adjustSVGTagNames(self, token):
        """Restore the mixed-case spelling of a known SVG tag name in place.

        :arg token: tag token; ``token["name"]`` is rewritten only when it
            is a key of ``svgTagNameReplacements``
        """
        fixedName = self.svgTagNameReplacements.get(token["name"])
        if fixedName is not None:
            token["name"] = fixedName

    def processCharacters(self, token):
        """Process a character token, then defer to ``Phase.processCharacters``.

        A token whose data is exactly U+0000 has its data replaced with
        U+FFFD. Otherwise, any character outside ``spaceCharacters`` clears
        the parser's ``framesetOK`` flag.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Process a start tag while in foreign content.

        :arg token: the start tag token

        :returns: ``token`` for a breakout tag, otherwise None (the element
            has been inserted)
        """
        currentNode = self.tree.openElements[-1]
        name = token["name"]
        # <font> only breaks out when it carries one of these attributes;
        # any() stops at the first hit instead of building two sets.
        isBreakout = (name in self.breakoutElements or
                      (name == "font" and
                       any(attr in token["data"]
                           for attr in ("color", "face", "size"))))
        if isBreakout:
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": name})
            # Pop until the current node is in the default namespace or is
            # an HTML / MathML-text integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        if currentNode.namespace == namespaces["mathml"]:
            self.parser.adjustMathMLAttributes(token)
        elif currentNode.namespace == namespaces["svg"]:
            self.adjustSVGTagNames(token)
            self.parser.adjustSVGAttributes(token)
        self.parser.adjustForeignAttributes(token)
        # The new element takes the namespace of the node it is inserted under.
        token["namespace"] = currentNode.namespace
        self.tree.insertElement(token)
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Process an end tag while in foreign content.

        Walks the stack of open elements from the top downwards. If an
        element whose lower-cased name equals the tag name is found, the
        stack is popped up to and including it and None is returned. If a
        default-namespace element is reached first, the tag is delegated to
        ``self.parser.phase.processEndTag`` and its result is returned.

        :arg token: the end tag token

        :returns: None, or the delegate phase's return value
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                # Pop everything above the match, then the match itself.
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
2529 | |
class AfterBodyPhase(Phase):
    """Token handlers for the "afterBody" phase."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # "html" has dedicated handlers; every other tag name uses the
        # *Other fallbacks.
        startDispatch = _utils.MethodDispatcher([("html", self.startTagHtml)])
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

        endDispatch = _utils.MethodDispatcher([("html", self.endTagHtml)])
        endDispatch.default = self.endTagOther
        self.endTagHandler = endDispatch

    def processEOF(self):
        """Do nothing at end of file (stop parsing)."""

    def processComment(self, token):
        """Insert the comment under the first entry of the open-element stack."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        firstOpen = self.tree.openElements[0]
        self.tree.insertComment(token, firstOpen)

    def processCharacters(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("unexpected-char-after-body")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("unexpected-start-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Handle an ``html`` end tag.

        In innerHTML mode this is only a parse error; otherwise the parser
        moves to the "afterAfterBody" phase. ``name`` is not used.
        """
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
            return
        parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("unexpected-end-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token
2576 | |
class InFramesetPhase(Phase):
    """Token handlers for the "inFrameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startEntries = [
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes),
        ]
        startDispatch = _utils.MethodDispatcher(startEntries)
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

        endDispatch = _utils.MethodDispatcher([("frameset", self.endTagFrameset)])
        endDispatch.default = self.endTagOther
        self.endTagHandler = endDispatch

    def processEOF(self):
        """Report a parse error unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            # Only expected to happen when parsing a fragment.
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        """Report a parse error; the token is otherwise dropped."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert the ``frameset`` element and leave it open."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert the ``frame`` element and pop it again straight away."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()

    def startTagNoframes(self, token):
        """Delegate a ``noframes`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error naming the unexpected start tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", errorData)

    def endTagFrameset(self, token):
        """Close the current ``frameset`` and possibly leave this phase.

        When the current node is ``html`` only a parse error is reported;
        otherwise the current node is popped. Afterwards, outside innerHTML
        mode, the parser moves to "afterFrameset" once the current node is
        no longer a ``frameset``.
        """
        if self.tree.openElements[-1].name != "html":
            self.tree.openElements.pop()
        else:
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")

        if self.parser.innerHTML:
            return
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        if self.tree.openElements[-1].name != "frameset":
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report a parse error naming the unexpected end tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", errorData)
2633 | |
class AfterFramesetPhase(Phase):
    """Token handlers for the "afterFrameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startEntries = [
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes),
        ]
        startDispatch = _utils.MethodDispatcher(startEntries)
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

        endDispatch = _utils.MethodDispatcher([("html", self.endTagHtml)])
        endDispatch.default = self.endTagOther
        self.endTagHandler = endDispatch

    def processEOF(self):
        """Do nothing at end of file (stop parsing)."""

    def processCharacters(self, token):
        """Report a parse error; the token is otherwise dropped."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate a ``noframes`` start tag to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error naming the unexpected start tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", errorData)

    def endTagHtml(self, token):
        """Move the parser to the "afterAfterFrameset" phase."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report a parse error naming the unexpected end tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", errorData)
2670 | |
class AfterAfterBodyPhase(Phase):
    """Token handlers for the "afterAfterBody" phase.

    Only a start tag dispatch table is installed here; end tags are handled
    by overriding ``processEndTag`` directly.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startDispatch = _utils.MethodDispatcher([("html", self.startTagHtml)])
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

    def processEOF(self):
        """Do nothing at end of file."""

    def processComment(self, token):
        """Insert the comment directly under the document node."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-start-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report a parse error, switch to "inBody" and hand the token back."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-end-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token
2708 | |
class AfterAfterFramesetPhase(Phase):
    """Token handlers for the "afterAfterFrameset" phase.

    Only a start tag dispatch table is installed here; end tags are handled
    by overriding ``processEndTag`` directly.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startEntries = [
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoFrames),
        ]
        startDispatch = _utils.MethodDispatcher(startEntries)
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

    def processEOF(self):
        """Do nothing at end of file."""

    def processComment(self, token):
        """Insert the comment directly under the document node."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report a parse error; the token is otherwise dropped."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate a ``noframes`` start tag to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report a parse error naming the unexpected start tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", errorData)

    def processEndTag(self, token):
        """Report a parse error naming the unexpected end tag."""
        errorData = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", errorData)
2744 # pylint:enable=unused-argument | |
2745 | |
2746 return { | |
2747 "initial": InitialPhase, | |
2748 "beforeHtml": BeforeHtmlPhase, | |
2749 "beforeHead": BeforeHeadPhase, | |
2750 "inHead": InHeadPhase, | |
2751 "inHeadNoscript": InHeadNoscriptPhase, | |
2752 "afterHead": AfterHeadPhase, | |
2753 "inBody": InBodyPhase, | |
2754 "text": TextPhase, | |
2755 "inTable": InTablePhase, | |
2756 "inTableText": InTableTextPhase, | |
2757 "inCaption": InCaptionPhase, | |
2758 "inColumnGroup": InColumnGroupPhase, | |
2759 "inTableBody": InTableBodyPhase, | |
2760 "inRow": InRowPhase, | |
2761 "inCell": InCellPhase, | |
2762 "inSelect": InSelectPhase, | |
2763 "inSelectInTable": InSelectInTablePhase, | |
2764 "inForeignContent": InForeignContentPhase, | |
2765 "afterBody": AfterBodyPhase, | |
2766 "inFrameset": InFramesetPhase, | |
2767 "afterFrameset": AfterFramesetPhase, | |
2768 "afterAfterBody": AfterAfterBodyPhase, | |
2769 "afterAfterFrameset": AfterAfterFramesetPhase, | |
2770 # XXX after after frameset | |
2771 } | |
2772 | |
2773 | |
def adjust_attributes(token, replacements):
    """Rename the attributes of ``token`` in place using ``replacements``.

    :arg token: token dict whose ``'data'`` entry maps attribute names to
        values

    :arg replacements: mapping from an attribute name to the key that
        should replace it

    :returns: None. ``token['data']`` is rebuilt as an ``OrderedDict`` with
        the same item order and renamed keys only when at least one
        attribute name appears in ``replacements``; otherwise it is left
        untouched (the very same object).
    """
    attributes = token['data']
    # Only the existence of an overlap matters, so stop at the first hit
    # rather than materialising the whole key intersection.
    if any(name in replacements for name in attributes):
        token['data'] = OrderedDict((replacements.get(name, name), value)
                                    for name, value in attributes.items())
2779 | |
2780 | |
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict from its parts.

    :arg name: tag name stored under ``"name"``

    :arg type: key looked up in ``tokenTypes``; the result is stored under
        ``"type"``

    :arg attributes: stored under ``"data"``; a fresh empty dict is used
        when None is given

    :arg selfClosing: stored under ``"selfClosing"``

    :returns: dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``
    """
    data = {} if attributes is None else attributes
    token = {"type": tokenTypes[type], "name": name}
    token["data"] = data
    token["selfClosing"] = selfClosing
    return token
2787 | |
2788 | |
class ParseError(Exception):
    """Exception type representing an error in a parsed document."""