comparison lib/python3.8/site-packages/pip/_vendor/html5lib/html5parser.py @ 0:9e54283cc701 draft

"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author guerler
date Mon, 27 Jul 2020 03:47:31 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9e54283cc701
1 from __future__ import absolute_import, division, unicode_literals
2 from pip._vendor.six import with_metaclass, viewkeys
3
4 import types
5 from collections import OrderedDict
6
7 from . import _inputstream
8 from . import _tokenizer
9
10 from . import treebuilders
11 from .treebuilders.base import Marker
12
13 from . import _utils
14 from .constants import (
15 spaceCharacters, asciiUpper2Lower,
16 specialElements, headingElements, cdataElements, rcdataElements,
17 tokenTypes, tagTokenTypes,
18 namespaces,
19 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
20 adjustForeignAttributes as adjustForeignAttributesMap,
21 adjustMathMLAttributes, adjustSVGAttributes,
22 E,
23 _ReparseException
24 )
25
26
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse a complete HTML document and return the resulting tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are forwarded unchanged to
    ``HTMLParser.parse``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
48
49
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are forwarded unchanged to
    ``HTMLParser.parseFragment``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, **kwargs)
73
74
def method_decorator_metaclass(function):
    """Return a metaclass that applies ``function`` to every plain function
    found in the body of a class created with it.

    Only attributes that are instances of ``types.FunctionType`` are wrapped;
    every other class attribute is left exactly as it was declared.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Iterate over a snapshot so rebinding entries cannot interfere
            # with the iteration itself.
            for attr_name, member in list(classDict.items()):
                if isinstance(member, types.FunctionType):
                    classDict[attr_name] = function(member)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
85
86
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # lxml builder, strict mode

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        # ``tree`` is a class (not a name): it is instantiated here.
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase object per insertion mode, keyed by phase name.
        self.phases = {name: cls(self, self.tree)
                       for name, cls in getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Run the tokenizer and tree construction over ``stream``.

        :arg stream: a file-like object or string containing the HTML

        :arg innerHTML: True when parsing a fragment rather than a document

        :arg container: name of the context element for fragment parsing;
            ``None`` falls back to ``"div"`` in fragment mode

        :arg scripting: treat noscript elements as if JavaScript was turned on

        Remaining keyword arguments are handed to the tokenizer.
        """
        # parseFragment documents that container=None means 'div'; without
        # this fallback reset() would fail on ``None.lower()``.
        if innerHTML and container is None:
            container = "div"

        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            # Start over from a clean parser state and run the loop once more.
            self.reset()
            self.mainLoop()

    def reset(self):
        """Restore the parser (and its tree) to the state needed before
        tokens are processed."""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # NOTE(review): by name the pairing below looks inverted
            # (cdataElements -> rcdataState, rcdataElements -> rawtextState);
            # presumably the constant names are historical -- confirm against
            # the constants module before changing anything here.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute is (case-insensitively) ``text/html`` or
        ``application/xhtml+xml``; anything else is looked up in
        ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is listed in
        ``mathmlTextIntegrationPointElements``."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token to the appropriate phase, then process EOF.

        A phase handler may return a token to request reprocessing; the inner
        loop keeps dispatching until a handler returns ``None``.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Hoisted: start tags that do not use the current phase at a MathML
        # text integration point.
        mathmlTextExceptions = frozenset(["mglyph", "malignmark"])

        for token in self.normalizedTokens():
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                token_type = new_token["type"]

                if token_type == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The name checks read the token currently being
                    # processed (new_token), the same one token_type was
                    # taken from.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((token_type == StartTagToken and
                           new_token["name"] not in mathmlTextExceptions) or
                          token_type in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         token_type == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         token_type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if token_type == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif token_type == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif token_type == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif token_type == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif token_type == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif token_type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

                    if (token_type == StartTagToken and prev_token["selfClosing"] and
                            not prev_token["selfClosingAcknowledged"]):
                        self.parseError("non-void-element-with-trailing-solidus",
                                        {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Guards against bouncing between phases forever at EOF.
                assert self.phase not in phases

    def normalizedTokens(self):
        """Yield each tokenizer token after passing it through
        ``normalizeToken``."""
        for token in self.tokenizer:
            yield self.normalizeToken(token)

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error and, in strict mode, raise ``ParseError``.

        :arg errorcode: key identifying the error message in ``E``

        :arg datavars: mapping interpolated into the message; defaults to an
            empty dict
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def normalizeToken(self, token):
        """Turn a start tag's attribute list into an ``OrderedDict`` in which
        the first occurrence of a duplicated attribute wins; other tokens are
        returned untouched."""
        # HTML5 specific normalizations to the token stream
        if token["type"] == tokenTypes["StartTag"]:
            raw = token["data"]
            token["data"] = OrderedDict(raw)
            if len(raw) > len(token["data"]):
                # we had some duplicated attribute, fix so first wins
                token["data"].update(raw[::-1])

        return token

    def adjustMathMLAttributes(self, token):
        """Apply the ``adjustMathMLAttributes`` replacement table to ``token``."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the ``adjustSVGAttributes`` replacement table to ``token``."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign-attribute replacement table to ``token``."""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # call looks like it would raise AttributeError; presumably dead code
        # -- confirm there are no callers before removing or repairing it.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` by walking the stack of open elements from
        the most recently opened one downwards."""
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                # Bottom of the stack: only reachable for fragments, where the
                # context element name stands in for the node name.
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert the element for ``token`` and switch to the text phase.

        :arg token: the start tag token to insert

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state used for the element's content
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        # Remember where to return to once the text phase is done.
        self.originalPhase = self.phase

        self.phase = self.phases["text"]
410
411
412 @_utils.memoize
413 def getPhases(debug):
def log(function):
    """Logger that records which phase processes each token

    Calls to methods whose name starts with ``process`` and that receive at
    least one positional argument append an entry to ``self.parser.log``
    before the wrapped function runs; all other calls are passed straight
    through.
    """
    # Reverse lookup: numeric token type -> readable name.
    type_names = {value: key for key, value in tokenTypes.items()}

    def wrapped(self, *args, **kwargs):
        if function.__name__.startswith("process") and len(args) > 0:
            token = args[0]
            info = {"type": type_names[token['type']]}
            if token['type'] in tagTokenTypes:
                info["name"] = token['name']

            entry = (self.parser.tokenizer.state.__name__,
                     self.parser.phase.__class__.__name__,
                     self.__class__.__name__,
                     function.__name__,
                     info)
            self.parser.log.append(entry)
        return function(self, *args, **kwargs)
    return wrapped
438
def getMetaclass(use_metaclass, metaclass_func):
    """Return the decorating metaclass built from ``metaclass_func`` when
    ``use_metaclass`` is true, and plain ``type`` otherwise."""
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)
444
445 # pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing

    Start and end tags are dispatched by tag name through the
    ``startTagHandler`` / ``endTagHandler`` attributes, which this base class
    reads but does not define.
    """

    def __init__(self, parser, tree):
        self.tree = tree
        self.parser = parser

    def processEOF(self):
        """Handle end of input; the base class provides no implementation."""
        raise NotImplementedError

    def processComment(self, token):
        """Insert the comment under the current (last opened) element."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current = self.tree.openElements[-1]
        self.tree.insertComment(token, current)

    def processDoctype(self, token):
        """Report the doctype as unexpected."""
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        """Insert the token's character data as text."""
        text = token["data"]
        self.tree.insertText(text)

    def processSpaceCharacters(self, token):
        """Insert the token's whitespace as text."""
        text = token["data"]
        self.tree.insertText(text)

    def processStartTag(self, token):
        """Dispatch a start tag to the handler registered for its name."""
        handler = self.startTagHandler[token["name"]]
        return handler(token)

    def startTagHtml(self, token):
        """Copy attributes from an ``html`` start tag onto the root element
        without overwriting attributes the root already has."""
        if token["name"] == "html" and not self.parser.firstStartTag:
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in token["data"].items():
            if attr not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        """Dispatch an end tag to the handler registered for its name."""
        handler = self.endTagHandler[token["name"]]
        return handler(token)
486
class InitialPhase(Phase):
    """Phase active before anything but whitespace or comments has been
    seen; it consumes the doctype and selects the compatibility mode."""

    def processSpaceCharacters(self, token):
        """Whitespace is dropped in this phase."""
        pass

    def processComment(self, token):
        """Attach the comment directly to the document."""
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype, set ``parser.compatMode`` from its public and
        system identifiers, and move on to the ``beforeHtml`` phase."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        unexpectedSystemId = (systemId is not None and
                              systemId != "about:legacy-compat")
        if name != "html" or publicId is not None or unexpectedSystemId:
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        # Public identifiers are compared case-insensitively below.
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        # Public identifier prefixes that always select quirks mode.
        quirksPrefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that select quirks mode on an exact match.
        quirksExact = ("-//w3o//dtd w3 html strict 3.0//en//",
                       "-/w3c/dtd html 4.0 transitional/en",
                       "html")
        # HTML 4.01 prefixes: quirks without a system identifier, limited
        # quirks with one.
        html401Prefixes = ("-//w3c//dtd html 4.01 frameset//",
                           "-//w3c//dtd html 4.01 transitional//")
        xhtml10Prefixes = ("-//w3c//dtd xhtml 1.0 frameset//",
                           "-//w3c//dtd xhtml 1.0 transitional//")

        if (not correct or token["name"] != "html" or
                publicId.startswith(quirksPrefixes) or
                publicId in quirksExact or
                (publicId.startswith(html401Prefixes) and
                 systemId is None) or
                (systemId and
                 systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(xhtml10Prefixes) or
              (publicId.startswith(html401Prefixes) and
               systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """No doctype was seen: switch to quirks mode and advance to the
        ``beforeHtml`` phase."""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        """Report the missing doctype and hand the token back for
        reprocessing."""
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        """Report the missing doctype and hand the token back for
        reprocessing."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-start-tag", details)
        self.anythingElse()
        return token

    def processEndTag(self, token):
        """Report the missing doctype and hand the token back for
        reprocessing."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-doctype-but-got-end-tag", details)
        self.anythingElse()
        return token

    def processEOF(self):
        """Report the missing doctype and ask for EOF to be reprocessed."""
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
614
class BeforeHtmlPhase(Phase):
    """Phase that creates the root ``html`` element when the document does
    not begin with one."""

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied ``html`` root and switch to ``beforeHead``."""
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        """Create the root, then ask for EOF to be reprocessed."""
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        """Attach the comment directly to the document."""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Whitespace is dropped in this phase."""
        pass

    def processCharacters(self, token):
        """Create the root and hand the token back for reprocessing."""
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        """Create the root and hand the token back for reprocessing; an
        ``html`` tag is remembered as the first start tag."""
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        """Only ``head``, ``body``, ``html`` and ``br`` end tags create the
        root and get reprocessed; any other end tag is a parse error and is
        dropped."""
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
649
class BeforeHeadPhase(Phase):
    """Phase that waits for (or implies) the ``head`` element."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            (("head", "body", "html", "br"), self.endTagImplyHead)
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def _implyHead(self):
        # Shared by every path that has to open a head element implicitly.
        self.startTagHead(impliedTagToken("head", "StartTag"))

    def processEOF(self):
        """Imply a ``head`` element, then ask for EOF to be reprocessed."""
        self._implyHead()
        return True

    def processSpaceCharacters(self, token):
        """Whitespace is dropped in this phase."""
        pass

    def processCharacters(self, token):
        """Imply a ``head`` element and hand the token back."""
        self._implyHead()
        return token

    def startTagHtml(self, token):
        """Delegate ``html`` start tags to the ``inBody`` phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Insert ``head``, remember it as the head pointer and switch to
        ``inHead``."""
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        """Imply a ``head`` element and hand the token back."""
        self._implyHead()
        return token

    def endTagImplyHead(self, token):
        """Imply a ``head`` element and hand the token back."""
        self._implyHead()
        return token

    def endTagOther(self, token):
        """Any other end tag is a parse error and is dropped."""
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})
695
class InHeadPhase(Phase):
    """Phase that handles tokens while the ``head`` element is open."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            (("noframes", "style"), self.startTagNoFramesStyle),
            ("noscript", self.startTagNoscript),
            ("script", self.startTagScript),
            (("base", "basefont", "bgsound", "command", "link"),
             self.startTagBaseLinkCommand),
            ("meta", self.startTagMeta),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("head", self.endTagHead),
            (("br", "html", "body"), self.endTagHtmlBodyBr)
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # the real thing
    def processEOF(self):
        """Close ``head`` implicitly, then ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Close ``head`` implicitly and hand the token back."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate ``html`` start tags to the ``inBody`` phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """A second ``head`` start tag is a parse error and is ignored."""
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        """Insert the element, pop it again straight away and acknowledge
        the self-closing flag."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert a ``meta`` element and, while the stream encoding is still
        tentative, try to switch encoding based on its attributes."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content_bytes = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content_bytes)
            codec = _inputstream.ContentAttrParser(data).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        """Parse the element's content as RCDATA."""
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        """Parse the element's content as RAWTEXT."""
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        """With scripting on, treat ``noscript`` content as RAWTEXT;
        otherwise insert it and switch to ``inHeadNoscript``."""
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert ``script``, put the tokenizer in script-data state and
        switch to the text phase, remembering the phase to return to."""
        self.tree.insertElement(token)
        tokenizer = self.parser.tokenizer
        tokenizer.state = tokenizer.scriptDataState
        self.parser.originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["text"]

    def startTagOther(self, token):
        """Close ``head`` implicitly and hand the token back."""
        self.anythingElse()
        return token

    def endTagHead(self, token):
        """Pop ``head`` off the stack and switch to ``afterHead``."""
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        """Close ``head`` implicitly and hand the token back."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Any other end tag is a parse error and is dropped."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``head`` end tag had been seen."""
        self.endTagHead(impliedTagToken("head"))
798
class InHeadNoscriptPhase(Phase):
    """Phase that handles tokens inside a ``noscript`` element in ``head``."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
            (("head", "noscript"), self.startTagHeadNoscript),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("noscript", self.endTagNoscript),
            ("br", self.endTagBr),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """Report the error, close ``noscript`` and ask for EOF to be
        reprocessed."""
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        """Delegate comments to the ``inHead`` phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        """Report the error, close ``noscript`` and hand the token back."""
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the ``inHead`` phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        """Delegate ``html`` start tags to the ``inBody`` phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Delegate the start tag to the ``inHead`` phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        """``head`` and nested ``noscript`` start tags are parse errors and
        are dropped."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Report the error, close ``noscript`` and hand the token back."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        """Pop ``noscript`` off the stack and return to ``inHead``."""
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "noscript", "Expected noscript got %s" % popped.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """Report the error, close ``noscript`` and hand the token back."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Any other end tag is a parse error and is dropped."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``noscript`` end tag had been seen."""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))
862
class AfterHeadPhase(Phase):
    """Phase between the end of ``head`` and the start of ``body`` or
    ``frameset``."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
              "style", "title"),
             self.startTagFromHead),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [(("body", "html", "br"), self.endTagHtmlBodyBr)]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """Imply ``body``, then ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Imply ``body`` and hand the token back."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate ``html`` start tags to the ``inBody`` phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        """Insert ``body``, clear ``framesetOK`` and switch to ``inBody``."""
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert ``frameset`` and switch to ``inFrameset``."""
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only element that turned up after ``head``: the
        head element is temporarily pushed back on the stack, the token is
        handled by ``inHead``, and a ``head`` node is removed again."""
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        self.tree.openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        # Search from the top of the stack for a head node to drop.
        head_node = next((node for node in self.tree.openElements[::-1]
                          if node.name == "head"), None)
        if head_node is not None:
            self.tree.openElements.remove(head_node)

    def startTagHead(self, token):
        """A ``head`` start tag here is a parse error and is dropped."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Imply ``body`` and hand the token back."""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Imply ``body`` and hand the token back."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Any other end tag is a parse error and is dropped."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied ``body``, switch to ``inBody`` and leave
        ``framesetOK`` set."""
        body_token = impliedTagToken("body", "StartTag")
        self.tree.insertElement(body_token)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True
929
930 class InBodyPhase(Phase):
931 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
932 # the really-really-really-very crazy mode
def __init__(self, parser, tree):
    """Set up the start-tag and end-tag dispatch tables for this phase.

    :arg parser: the parser that owns this phase
    :arg tree: the tree builder this phase inserts into
    """
    Phase.__init__(self, parser, tree)

    # Default whitespace handler; the pre/listing/textarea start tag
    # handlers temporarily swap in the newline-dropping variant.
    self.processSpaceCharacters = self.processSpaceCharactersNonPre

    # Start tag name(s) -> handler.
    startDispatch = [
        ("html", self.startTagHtml),
        (("base", "basefont", "bgsound", "command", "link", "meta",
          "script", "style", "title"),
         self.startTagProcessInHead),
        ("body", self.startTagBody),
        ("frameset", self.startTagFrameset),
        (("address", "article", "aside", "blockquote", "center", "details",
          "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
          "section", "summary", "ul"),
         self.startTagCloseP),
        (headingElements, self.startTagHeading),
        (("pre", "listing"), self.startTagPreListing),
        ("form", self.startTagForm),
        (("li", "dd", "dt"), self.startTagListItem),
        ("plaintext", self.startTagPlaintext),
        ("a", self.startTagA),
        (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
          "strong", "tt", "u"),
         self.startTagFormatting),
        ("nobr", self.startTagNobr),
        ("button", self.startTagButton),
        (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
        ("xmp", self.startTagXmp),
        ("table", self.startTagTable),
        (("area", "br", "embed", "img", "keygen", "wbr"),
         self.startTagVoidFormatting),
        (("param", "source", "track"), self.startTagParamSource),
        ("input", self.startTagInput),
        ("hr", self.startTagHr),
        ("image", self.startTagImage),
        ("isindex", self.startTagIsIndex),
        ("textarea", self.startTagTextarea),
        ("iframe", self.startTagIFrame),
        ("noscript", self.startTagNoscript),
        (("noembed", "noframes"), self.startTagRawtext),
        ("select", self.startTagSelect),
        (("rp", "rt"), self.startTagRpRt),
        (("option", "optgroup"), self.startTagOpt),
        ("math", self.startTagMath),
        ("svg", self.startTagSvg),
        (("caption", "col", "colgroup", "frame", "head",
          "tbody", "td", "tfoot", "th", "thead",
          "tr"),
         self.startTagMisplaced),
    ]
    self.startTagHandler = _utils.MethodDispatcher(startDispatch)
    self.startTagHandler.default = self.startTagOther

    # End tag name(s) -> handler.
    endDispatch = [
        ("body", self.endTagBody),
        ("html", self.endTagHtml),
        (("address", "article", "aside", "blockquote", "button", "center",
          "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
          "section", "summary", "ul"),
         self.endTagBlock),
        ("form", self.endTagForm),
        ("p", self.endTagP),
        (("dd", "dt", "li"), self.endTagListItem),
        (headingElements, self.endTagHeading),
        (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
          "strike", "strong", "tt", "u"),
         self.endTagFormatting),
        (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
        ("br", self.endTagBr),
    ]
    self.endTagHandler = _utils.MethodDispatcher(endDispatch)
    self.endTagHandler.default = self.endTagOther
1003
def isMatchingFormattingElement(self, node1, node2):
    """Tell whether two nodes have the same name, namespace and attributes.

    :arg node1: first node to compare
    :arg node2: second node to compare
    :returns: True when all three properties compare equal
    """
    if node1.name != node2.name:
        return False
    if node1.namespace != node2.namespace:
        return False
    return node1.attributes == node2.attributes
1008
1009 # helper
def addFormattingElement(self, token):
    """Insert a formatting element and record it as active.

    The element created for ``token`` is appended to the list of active
    formatting elements.  If three matching entries already exist after
    the last Marker, the earliest of those three is removed first.

    :arg token: the start tag token of the formatting element
    """
    self.tree.insertElement(token)
    newElement = self.tree.openElements[-1]
    activeList = self.tree.activeFormattingElements

    # Collect matching entries, newest first, back to the last Marker.
    duplicates = []
    for entry in reversed(activeList):
        if entry is Marker:
            break
        if self.isMatchingFormattingElement(entry, newElement):
            duplicates.append(entry)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        # duplicates[-1] is the oldest of the three matches.
        activeList.remove(duplicates[-1])
    activeList.append(newElement)
1025
1026 # the real deal
def processEOF(self):
    """Handle end of file while in the body.

    Reports a single parse error if any open element is one that may not
    be left open at end of file.
    """
    mayStayOpen = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                             "tfoot", "th", "thead", "tr", "body",
                             "html"))
    openStack = self.tree.openElements
    # Scan from the current node outwards; one offender is enough.
    if any(node.name not in mayStayOpen for node in reversed(openStack)):
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing
1036
def processSpaceCharactersDropNewline(self, token):
    """Insert whitespace, dropping one leading newline where required.

    Used right after a pre, listing or textarea start tag.  The normal
    whitespace handler is restored first, so this runs for one token only.

    :arg token: the space-characters token being processed
    """
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre

    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        # Only the very first newline of a still-empty element is dropped.
        if (current.name in ("pre", "listing", "textarea") and
                not current.hasContent()):
            text = text[1:]

    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)
1049
def processCharacters(self, token):
    """Insert character data into the current node.

    A token consisting of a single NULL character is ignored.  If the
    data contains any non-space character, the parser's ``framesetOK``
    flag is cleared.

    :arg token: the characters token being processed
    """
    data = token["data"]
    if data == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(data)
    # A generator lets any() stop at the first non-space character
    # instead of building a list covering the whole string first.
    if (self.parser.framesetOK and
            any(char not in spaceCharacters for char in data)):
        self.parser.framesetOK = False
1061
def processSpaceCharactersNonPre(self, token):
    """Insert whitespace verbatim (the default whitespace handler).

    :arg token: the space-characters token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])
1065
def startTagProcessInHead(self, token):
    """Delegate the start tag to the "inHead" phase.

    :arg token: the start tag token being processed
    :returns: whatever the "inHead" phase's ``processStartTag`` returns
    """
    headPhase = self.parser.phases["inHead"]
    return headPhase.processStartTag(token)
1068
def startTagBody(self, token):
    """Handle a second body start tag.

    Always a parse error.  When the second open element is a body
    element, attributes from the token that the element does not already
    have are copied onto it and ``framesetOK`` is cleared.  Otherwise
    only the innerHTML assertion is made.

    :arg token: the start tag token being processed
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    if len(stack) != 1 and stack[1].name == "body":
        self.parser.framesetOK = False
        bodyElement = stack[1]
        for attr, value in token["data"].items():
            # Existing attributes win; only missing ones are added.
            if attr not in bodyElement.attributes:
                bodyElement.attributes[attr] = value
    else:
        assert self.parser.innerHTML
1079
def startTagFrameset(self, token):
    """Handle a frameset start tag inside the body.

    Always a parse error.  If the second open element is a body element
    and ``framesetOK`` is still set, that body is detached from its
    parent, the stack is popped back to the html element, the frameset
    is inserted and the parser switches to the "inFrameset" phase.

    :arg token: the start tag token being processed
    """
    self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    if not self.parser.framesetOK:
        # Token is ignored.
        return

    bodyElement = stack[1]
    if bodyElement.parent:
        bodyElement.parent.removeChild(bodyElement)
    while stack[-1].name != "html":
        stack.pop()
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inFrameset"]
1093
def startTagCloseP(self, token):
    """Insert a block-level element, first closing any p in button scope.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
1098
def startTagPreListing(self, token):
    """Insert a pre or listing element.

    Closes any p in button scope, clears ``framesetOK`` and arms the
    newline-dropping whitespace handler for the next whitespace token.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    self.parser.framesetOK = False
    # A newline directly after the start tag must be ignored.
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1105
def startTagForm(self, token):
    """Insert a form element unless a form pointer is already set.

    A form start tag while the form pointer is set is a parse error and
    is ignored.  Otherwise any p in button scope is closed, the form is
    inserted and becomes the form pointer.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.formPointer = tree.openElements[-1]
1114
def startTagListItem(self, token):
    """Insert an li, dd or dt element, closing an open sibling first.

    Walks the stack of open elements from the current node outwards.  The
    first li (for li) or dt/dd (for dt and dd) found is closed via an
    implied end tag; the walk stops early at any special element other
    than address, div or p.  Any p in button scope is then closed before
    the new element is inserted.

    :arg token: the start tag token being processed
    """
    self.parser.framesetOK = False

    closesWhich = {"li": ["li"],
                   "dt": ["dt", "dd"],
                   "dd": ["dt", "dd"]}
    siblingNames = closesWhich[token["name"]]

    # Find the open sibling to close, if any, before acting on it.
    toClose = None
    for candidate in reversed(self.tree.openElements):
        if candidate.name in siblingNames:
            toClose = candidate
            break
        if (candidate.nameTuple in specialElements and
                candidate.name not in ("address", "div", "p")):
            break
    if toClose is not None:
        self.parser.phase.processEndTag(
            impliedTagToken(toClose.name, "EndTag"))

    if self.tree.elementInScope("p", variant="button"):
        self.parser.phase.processEndTag(
            impliedTagToken("p", "EndTag"))

    self.tree.insertElement(token)
1136
def startTagPlaintext(self, token):
    """Insert a plaintext element and switch the tokenizer to plaintext.

    :arg token: the start tag token being processed
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState
1142
def startTagHeading(self, token):
    """Insert a heading element.

    Closes any p in button scope.  If the current node is itself a
    heading, that is a parse error and the node is popped first.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    stack = tree.openElements
    if stack[-1].name in headingElements:
        # Headings do not nest.
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        stack.pop()
    tree.insertElement(token)
1150
def startTagA(self, token):
    """Insert an a element, first closing any still-active a element.

    If an a element is in the list of active formatting elements, this is
    a parse error: an implied end tag is run through ``endTagFormatting``
    and the stale element is removed from both the stack of open elements
    and the active formatting list if it is still there.

    :arg token: the start tag token being processed
    """
    staleAnchor = self.tree.elementInActiveFormattingElements("a")
    if staleAnchor:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        # The adoption agency may have left the element in either list.
        for collection in (self.tree.openElements,
                           self.tree.activeFormattingElements):
            if staleAnchor in collection:
                collection.remove(staleAnchor)
    self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1163
def startTagFormatting(self, token):
    """Insert a formatting element (b, i, em, ...) and mark it active.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1167
def startTagNobr(self, token):
    """Insert a nobr element, closing an in-scope nobr first.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    if tree.elementInScope("nobr"):
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1177
def startTagButton(self, token):
    """Insert a button element, or close an in-scope button first.

    :arg token: the start tag token being processed
    :returns: the token when an open button had to be closed first,
        otherwise None
    """
    if not self.tree.elementInScope("button"):
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        return None

    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token
1188
def startTagAppletMarqueeObject(self, token):
    """Insert an applet, marquee or object element.

    A Marker is appended to the list of active formatting elements and
    ``framesetOK`` is cleared.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False
1194
def startTagXmp(self, token):
    """Handle an xmp start tag: its content is parsed as RAWTEXT.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")
1201
def startTagTable(self, token):
    """Insert a table element and switch to the "inTable" phase.

    Outside quirks mode, a p in button scope is closed first.

    :arg token: the start tag token being processed
    """
    notQuirks = self.parser.compatMode != "quirks"
    if notQuirks and self.tree.elementInScope("p", variant="button"):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.parser.framesetOK = False
    self.parser.phase = self.parser.phases["inTable"]
1209
def startTagVoidFormatting(self, token):
    """Insert a void element (area, br, embed, img, keygen, wbr).

    The element is popped straight off the stack again, the self-closing
    flag is acknowledged and ``framesetOK`` is cleared.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    # Void elements never stay open.
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1216
def startTagInput(self, token):
    """Insert an input element.

    Handled like any other void element, except that an input whose
    type is "hidden" (ASCII case-insensitively) leaves ``framesetOK`` as
    it was.

    :arg token: the start tag token being processed
    """
    previousFramesetOK = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attrs = token["data"]
    if "type" not in attrs:
        return
    if attrs["type"].translate(asciiUpper2Lower) == "hidden":
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = previousFramesetOK
1224
def startTagParamSource(self, token):
    """Insert a param, source or track element and pop it again.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1229
def startTagHr(self, token):
    """Insert an hr element, closing any p in button scope first.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    # hr is void: it never stays on the stack.
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1237
def startTagImage(self, token):
    """Treat an image start tag as img (with a parse error).

    :arg token: the start tag token being processed
    """
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    imgToken = impliedTagToken("img", "StartTag",
                               attributes=token["data"],
                               selfClosing=token["selfClosing"])
    self.processStartTag(imgToken)
1245
def startTagIsIndex(self, token):
    """Expand the deprecated isindex tag into form/hr/label/input markup.

    Always a parse error.  Nothing further happens when a form pointer is
    already set.  Otherwise the token is replayed as: form start (carrying
    the "action" attribute, if any), hr, label start, the prompt text,
    an input named "isindex" with the remaining attributes, label end,
    hr, form end.

    :arg token: the start tag token being processed
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return

    attrs = token["data"]

    formAttrs = {}
    if "action" in attrs:
        formAttrs["action"] = attrs["action"]
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=formAttrs))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    if "prompt" in attrs:
        promptText = attrs["prompt"]
    else:
        promptText = "This is a searchable index. Enter search keywords: "
    self.processCharacters(
        {"type": tokenTypes["Characters"], "data": promptText})

    # "action" and "prompt" were consumed above; everything else goes on
    # the input element, whose name is forced to "isindex".
    inputAttrs = attrs.copy()
    for consumed in ("action", "prompt"):
        if consumed in inputAttrs:
            del inputAttrs[consumed]
    inputAttrs["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=inputAttrs,
                                         selfClosing=token["selfClosing"]))

    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))
1276
def startTagTextarea(self, token):
    """Insert a textarea element and switch the tokenizer to RCDATA.

    Also arms the newline-dropping whitespace handler and clears
    ``framesetOK``.

    :arg token: the start tag token being processed
    """
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.rcdataState
    # A newline directly after the start tag must be ignored.
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    self.parser.framesetOK = False
1282
def startTagIFrame(self, token):
    """Handle an iframe start tag: clear ``framesetOK``, parse as RAWTEXT.

    :arg token: the start tag token being processed
    """
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)
1286
def startTagNoscript(self, token):
    """Handle a noscript start tag.

    With scripting enabled the content is parsed as RAWTEXT; otherwise
    the tag is handled like any ordinary element.

    :arg token: the start tag token being processed
    """
    handler = self.startTagRawtext if self.parser.scripting else self.startTagOther
    handler(token)
1292
def startTagRawtext(self, token):
    """Parse the element's content as RAWTEXT.

    Used for iframe, noembed, noframes, and noscript (if scripting is
    enabled).

    :arg token: the start tag token being processed
    """
    contentModel = "RAWTEXT"
    self.parser.parseRCDataRawtext(token, contentModel)
1296
def startTagOpt(self, token):
    """Insert an option or optgroup element, closing an open option first.

    :arg token: the start tag token being processed
    """
    currentNode = self.tree.openElements[-1]
    if currentNode.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)
1302
def startTagSelect(self, token):
    """Insert a select element and pick the matching select phase.

    The parser switches to "inSelectInTable" when its current phase is
    one of the table-related phases, and to "inSelect" otherwise.

    :arg token: the start tag token being processed
    """
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)

    parser = self.parser
    parser.framesetOK = False
    tablePhases = tuple(parser.phases[phaseName] for phaseName in
                        ("inTable", "inCaption", "inColumnGroup",
                         "inTableBody", "inRow", "inCell"))
    if parser.phase in tablePhases:
        nextPhase = "inSelectInTable"
    else:
        nextPhase = "inSelect"
    parser.phase = parser.phases[nextPhase]
1316
def startTagRpRt(self, token):
    """Insert an rp or rt element.

    When a ruby element is in scope, implied end tags are generated
    first; it is a parse error if the current node is then not ruby.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        currentNode = tree.openElements[-1]
        if currentNode.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)
1323
def startTagMath(self, token):
    """Insert a math element in the MathML namespace.

    :arg token: the start tag token being processed
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1335
def startTagSvg(self, token):
    """Insert an svg element in the SVG namespace.

    :arg token: the start tag token being processed
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1347
def startTagMisplaced(self, token):
    """Ignore a start tag that only makes sense in another insertion mode.

    Registered in this phase for "caption", "col", "colgroup", "frame",
    "head", "tbody", "td", "tfoot", "th", "thead" and "tr".  The token is
    reported as a parse error and otherwise dropped.

    :arg token: the start tag token being processed
    """
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-ignored", details)
1356
def startTagOther(self, token):
    """Insert any other element after reconstructing active formatting.

    :arg token: the start tag token being processed
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
1360
def endTagP(self, token):
    """Close the nearest p element in button scope.

    When no p is in button scope, this is a parse error: an implied p
    start tag is processed and the end tag is then handled again, which
    yields an empty p element.

    :arg token: the end tag token being processed
    """
    if self.tree.elementInScope("p", variant="button"):
        self.tree.generateImpliedEndTags("p")
        stack = self.tree.openElements
        if stack[-1].name != "p":
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
        # Pop up to and including the p element.
        while stack.pop().name != "p":
            pass
        return

    self.startTagCloseP(impliedTagToken("p", "StartTag"))
    self.parser.parseError("unexpected-end-tag", {"name": "p"})
    self.endTagP(impliedTagToken("p", "EndTag"))
1373
def endTagBody(self, token):
    """Handle a body end tag and switch to the "afterBody" phase.

    A parse error with no phase change if no body element is in scope.
    Otherwise, if the current node is not body, the first open element
    (from the third stack entry on) that may not be left open is reported
    as a parse error.

    :arg token: the end tag token being processed
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return

    stack = self.tree.openElements
    if stack[-1].name != "body":
        mayStayOpen = frozenset(("dd", "dt", "li", "optgroup",
                                 "option", "p", "rp", "rt",
                                 "tbody", "td", "tfoot",
                                 "th", "thead", "tr", "body",
                                 "html"))
        offender = next((node for node in stack[2:]
                         if node.name not in mayStayOpen), None)
        if offender is not None:
            # Not sure this is the correct name for the parse error
            self.parser.parseError(
                "expected-one-end-tag-but-got-another",
                {"gotName": "body", "expectedName": offender.name})
    self.parser.phase = self.parser.phases["afterBody"]
1391
def endTagHtml(self, token):
    """Handle an html end tag by acting as if a body end tag was seen.

    :arg token: the end tag token being processed
    :returns: the token when a body element was in scope, otherwise None
    """
    # We repeat the test for the body end tag token being ignored here
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token
1397
def endTagBlock(self, token):
    """Close a block-level element.

    If the element is in scope, implied end tags are generated and the
    stack is popped up to and including it.  A parse error is reported
    whenever the current node does not have the token's name.

    :arg token: the end tag token being processed
    """
    tagName = token["name"]
    # Put us back in the right whitespace handling mode
    if tagName == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre

    tree = self.tree
    found = tree.elementInScope(tagName)
    if found:
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tagName:
        self.parser.parseError("end-tag-too-early", {"name": tagName})
    if found:
        stack = tree.openElements
        while stack.pop().name != tagName:
            pass
1411
def endTagForm(self, token):
    """Close the form referenced by the form pointer.

    The form pointer is always cleared.  It is a parse error if there was
    no form pointer or the form is not in scope; otherwise implied end
    tags are generated and the form is removed from the stack of open
    elements (with a parse error if it was not the current node).

    :arg token: the end tag token being processed
    """
    tree = self.tree
    formNode = tree.formPointer
    tree.formPointer = None

    if formNode is None or not tree.elementInScope(formNode):
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
        return

    tree.generateImpliedEndTags()
    if tree.openElements[-1] != formNode:
        self.parser.parseError("end-tag-too-early-ignored",
                               {"name": "form"})
    # Only the form itself leaves the stack; nested elements stay open.
    tree.openElements.remove(formNode)
1424
def endTagListItem(self, token):
    """Close an li, dd or dt element.

    li is looked up in "list" scope, dd and dt in the default scope.  A
    parse error is reported if the element is not in scope; otherwise the
    stack is popped up to and including it.

    :arg token: the end tag token being processed
    """
    tagName = token["name"]
    variant = "list" if tagName == "li" else None

    tree = self.tree
    if not tree.elementInScope(tagName, variant=variant):
        self.parser.parseError("unexpected-end-tag", {"name": tagName})
        return

    tree.generateImpliedEndTags(exclude=tagName)
    stack = tree.openElements
    if stack[-1].name != tagName:
        self.parser.parseError(
            "end-tag-too-early",
            {"name": tagName})
    while stack.pop().name != tagName:
        pass
1441
def endTagHeading(self, token):
    """Close the nearest open heading element.

    If any heading is in scope, implied end tags are generated.  A parse
    error is reported when the current node's name differs from the
    token's.  Then, if any heading is in scope, the stack is popped up to
    and including the nearest heading element of any level.

    :arg token: the end tag token being processed
    """
    tree = self.tree

    if any(tree.elementInScope(level) for level in headingElements):
        tree.generateImpliedEndTags()

    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if any(tree.elementInScope(level) for level in headingElements):
        stack = tree.openElements
        # Any heading level terminates the pop, not just the token's own.
        while stack.pop().name not in headingElements:
            pass
1456
def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element (a, b, i, ...) when it
    may be mis-nested with block-level content.  The outer loop runs at
    most eight times; each pass either returns early or re-parents the
    "furthest block" under a clone of the formatting element.

    :arg token: the end tag token being processed
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
            (formattingElement in self.tree.openElements and
             not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            # (This branch is also taken when the element is open but
            # not in scope.)
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first branch above already handles "open but
        # not in scope", so this branch looks unreachable as long as
        # elementInScope gives the same answer on both calls -- confirm.
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The common ancestor is the stack entry just before the
        # formatting element.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        # Walk up the stack from the furthest block, at most three steps.
        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            if node not in self.tree.activeFormattingElements:
                # Not an active formatting element: drop it from the
                # stack and move on to the next entry up.
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        # Move the furthest block's children under the clone...
        furthestBlock.reparentChildren(clone)

        # Step 13
        # ...and make the clone the furthest block's child.
        furthestBlock.appendChild(clone)

        # Step 14
        # Swap the formatting element for its clone at the bookmark.
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        # On the stack the clone goes directly after the furthest block.
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)
1619
def endTagAppletMarqueeObject(self, token):
    """Close an applet, marquee or object element.

    If the element is in scope, implied end tags are generated.  A parse
    error is reported when the current node's name differs from the
    token's.  If the element is in scope, the stack is popped up to and
    including it and the active formatting elements are cleared.

    :arg token: the end tag token being processed
    """
    tagName = token["name"]
    tree = self.tree

    if tree.elementInScope(tagName):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != tagName:
        self.parser.parseError("end-tag-too-early", {"name": tagName})

    if tree.elementInScope(tagName):
        stack = tree.openElements
        while stack.pop().name != tagName:
            pass
        tree.clearActiveFormattingElements()
1631
def endTagBr(self, token):
    """Treat a br end tag as a br element (with a parse error).

    :arg token: the end tag token being processed
    """
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(impliedTagToken("br", "StartTag"))
    # br is void: it never stays on the stack.
    tree.openElements.pop()
1638
def endTagOther(self, token):
    """Handle any other end tag.

    Walks the stack of open elements from the current node outwards.  On
    the first node with the token's name, implied end tags are generated
    (except for that name) and the stack is popped up to and including
    that node.  If a special element is met first, the token is reported
    as a parse error and ignored.

    :arg token: the end tag token being processed
    """
    tagName = token["name"]
    stack = self.tree.openElements
    for node in reversed(stack):
        if node.name == tagName:
            self.tree.generateImpliedEndTags(exclude=tagName)
            if stack[-1].name != tagName:
                self.parser.parseError("unexpected-end-tag", {"name": tagName})
            while stack.pop() != node:
                pass
            return
        if node.nameTuple in specialElements:
            self.parser.parseError("unexpected-end-tag", {"name": tagName})
            return
1652
class TextPhase(Phase):
    """Phase used while reading the text content of an element.

    Character data is inserted verbatim.  Any end tag pops the current
    node and restores the parser's ``originalPhase``.
    """

    def __init__(self, parser, tree):
        """Set up the dispatch tables; only the script end tag is special.

        :arg parser: the parser that owns this phase
        :arg tree: the tree builder this phase inserts into
        """
        Phase.__init__(self, parser, tree)

        startDispatch = _utils.MethodDispatcher([])
        startDispatch.default = self.startTagOther
        self.startTagHandler = startDispatch

        endDispatch = _utils.MethodDispatcher([
            ("script", self.endTagScript)])
        endDispatch.default = self.endTagOther
        self.endTagHandler = endDispatch

    def processCharacters(self, token):
        """Insert the token's character data as text."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Handle end of file inside the element (a parse error).

        The current node is popped and the original phase is restored.

        :returns: True
        """
        unclosedName = self.tree.openElements[-1].name
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosedName})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are never expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Close the script element and restore the original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Close the current element and restore the original phase."""
        self.tree.openElements.pop()
        parser = self.parser
        parser.phase = parser.originalPhase
1685
1686 class InTablePhase(Phase):
1687 # http://www.whatwg.org/specs/web-apps/current-work/#in-table
1688 def __init__(self, parser, tree):
1689 Phase.__init__(self, parser, tree)
1690 self.startTagHandler = _utils.MethodDispatcher([
1691 ("html", self.startTagHtml),
1692 ("caption", self.startTagCaption),
1693 ("colgroup", self.startTagColgroup),
1694 ("col", self.startTagCol),
1695 (("tbody", "tfoot", "thead"), self.startTagRowGroup),
1696 (("td", "th", "tr"), self.startTagImplyTbody),
1697 ("table", self.startTagTable),
1698 (("style", "script"), self.startTagStyleScript),
1699 ("input", self.startTagInput),
1700 ("form", self.startTagForm)
1701 ])
1702 self.startTagHandler.default = self.startTagOther
1703
1704 self.endTagHandler = _utils.MethodDispatcher([
1705 ("table", self.endTagTable),
1706 (("body", "caption", "col", "colgroup", "html", "tbody", "td",
1707 "tfoot", "th", "thead", "tr"), self.endTagIgnore)
1708 ])
1709 self.endTagHandler.default = self.endTagOther
1710
1711 # helper methods
1712 def clearStackToTableContext(self):
1713 # "clear the stack back to a table context"
1714 while self.tree.openElements[-1].name not in ("table", "html"):
1715 # self.parser.parseError("unexpected-implied-end-tag-in-table",
1716 # {"name": self.tree.openElements[-1].name})
1717 self.tree.openElements.pop()
1718 # When the current node is <html> it's an innerHTML case
1719
1720 # processing methods
1721 def processEOF(self):
1722 if self.tree.openElements[-1].name != "html":
1723 self.parser.parseError("eof-in-table")
1724 else:
1725 assert self.parser.innerHTML
1726 # Stop parsing
1727
1728 def processSpaceCharacters(self, token):
1729 originalPhase = self.parser.phase
1730 self.parser.phase = self.parser.phases["inTableText"]
1731 self.parser.phase.originalPhase = originalPhase
1732 self.parser.phase.processSpaceCharacters(token)
1733
1734 def processCharacters(self, token):
1735 originalPhase = self.parser.phase
1736 self.parser.phase = self.parser.phases["inTableText"]
1737 self.parser.phase.originalPhase = originalPhase
1738 self.parser.phase.processCharacters(token)
1739
1740 def insertText(self, token):
1741 # If we get here there must be at least one non-whitespace character
1742 # Do the table magic!
1743 self.tree.insertFromTable = True
1744 self.parser.phases["inBody"].processCharacters(token)
1745 self.tree.insertFromTable = False
1746
1747 def startTagCaption(self, token):
1748 self.clearStackToTableContext()
1749 self.tree.activeFormattingElements.append(Marker)
1750 self.tree.insertElement(token)
1751 self.parser.phase = self.parser.phases["inCaption"]
1752
1753 def startTagColgroup(self, token):
1754 self.clearStackToTableContext()
1755 self.tree.insertElement(token)
1756 self.parser.phase = self.parser.phases["inColumnGroup"]
1757
1758 def startTagCol(self, token):
1759 self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1760 return token
1761
1762 def startTagRowGroup(self, token):
1763 self.clearStackToTableContext()
1764 self.tree.insertElement(token)
1765 self.parser.phase = self.parser.phases["inTableBody"]
1766
1767 def startTagImplyTbody(self, token):
1768 self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1769 return token
1770
1771 def startTagTable(self, token):
1772 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1773 {"startName": "table", "endName": "table"})
1774 self.parser.phase.processEndTag(impliedTagToken("table"))
1775 if not self.parser.innerHTML:
1776 return token
1777
1778 def startTagStyleScript(self, token):
1779 return self.parser.phases["inHead"].processStartTag(token)
1780
1781 def startTagInput(self, token):
1782 if ("type" in token["data"] and
1783 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1784 self.parser.parseError("unexpected-hidden-input-in-table")
1785 self.tree.insertElement(token)
1786 # XXX associate with form
1787 self.tree.openElements.pop()
1788 else:
1789 self.startTagOther(token)
1790
def startTagForm(self, token):
    """Handle a ``<form>`` start tag inside a table.

    Always reports a parse error. When no form pointer is set, the element
    is inserted, recorded as the form pointer and popped straight away;
    otherwise the token is dropped.
    """
    self.parser.parseError("unexpected-form-in-table")
    tree = self.tree
    if tree.formPointer is not None:
        return
    tree.insertElement(token)
    tree.formPointer = tree.openElements[-1]
    tree.openElements.pop()
1797
def startTagOther(self, token):
    """Process an unexpected start tag in a table via the "inBody" phase.

    A parse error is reported, then the token is handled by "inBody" with
    ``tree.insertFromTable`` switched on. The flag is always switched off
    again afterwards, even when the delegated call raises. The result of the
    delegated call is discarded.

    :arg token: the start tag token
    """
    self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processStartTag(token)
    finally:
        # Never leave the tree builder stuck in "insert from table" mode.
        self.tree.insertFromTable = False
1804
def endTagTable(self, token):
    """Close the current table.

    When a table element is in table scope, implied end tags are generated,
    a parse error is reported if the current node is not the table, the
    stack is popped up to and including the table, and the insertion mode is
    reset. Otherwise (innerHTML case) only a parse error is reported.
    """
    tree = self.tree
    parser = self.parser
    if not tree.elementInScope("table", variant="table"):
        # innerHTML case
        assert parser.innerHTML
        parser.parseError()
        return
    tree.generateImpliedEndTags()
    open_elements = tree.openElements
    current_name = open_elements[-1].name
    if current_name != "table":
        parser.parseError("end-tag-too-early-named",
                          {"gotName": "table",
                           "expectedName": current_name})
    # Pop everything above the table, then the table itself.
    while open_elements.pop().name != "table":
        pass
    parser.resetInsertionMode()
1820
def endTagIgnore(self, token):
    """Report the end tag as unexpected and otherwise ignore it."""
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-end-tag", details)
1823
def endTagOther(self, token):
    """Process an unexpected end tag in a table via the "inBody" phase.

    A parse error is reported, then the token is handled by "inBody" with
    ``tree.insertFromTable`` switched on. The flag is always switched off
    again afterwards, even when the delegated call raises. The result of the
    delegated call is discarded.

    :arg token: the end tag token
    """
    self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processEndTag(token)
    finally:
        # Never leave the tree builder stuck in "insert from table" mode.
        self.tree.insertFromTable = False
1830
class InTableTextPhase(Phase):
    """Phase registered as "inTableText": buffers character tokens.

    Character tokens are collected in ``characterTokens``. Any comment, start
    tag, end tag or EOF flushes the buffer, reinstates ``originalPhase`` and
    asks the caller to reprocess.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        # Phase reinstated after a flush. It starts unset here and is
        # presumably assigned by whichever phase activates this one.
        self.originalPhase = None
        # Pending character tokens, in arrival order.
        self.characterTokens = []

    def _flushAndRestore(self):
        """Flush the buffered text, then reinstate ``originalPhase``."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def flushCharacters(self):
        """Insert the buffered text into the tree and empty the buffer.

        Text containing any non-space character goes through the "inTable"
        phase's ``insertText``; non-empty all-space text is inserted
        directly; an empty buffer inserts nothing.
        """
        text = "".join(pending["data"] for pending in self.characterTokens)
        if any(char not in spaceCharacters for char in text):
            characters = {"type": tokenTypes["Characters"], "data": text}
            self.parser.phases["inTable"].insertText(characters)
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def processComment(self, token):
        self._flushAndRestore()
        return token

    def processEOF(self):
        self._flushAndRestore()
        return True

    def processCharacters(self, token):
        # A lone NUL character token is dropped rather than buffered.
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        self._flushAndRestore()
        return token

    def processEndTag(self, token):
        self._flushAndRestore()
        return token
1875
class InCaptionPhase(Phase):
    """Phase registered as "inCaption".

    Table-structure start tags and ``</table>`` imply ``</caption>`` and are
    then reprocessed; most other tokens are delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableElement),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("caption", self.endTagCaption),
            ("table", self.endTagTable),
            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.endTagIgnore),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagCaption(self):
        """Return True when no caption element is in table scope."""
        caption_in_scope = self.tree.elementInScope("caption", variant="table")
        return not caption_in_scope

    def _impliedCaptionEnd(self, token):
        """Report an error, imply ``</caption>``, and decide on reprocessing.

        Returns ``token`` when the implied end tag was not ignored, else
        ``None``.
        """
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        caption_missing = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return None if caption_missing else token

    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processCharacters(token)

    def startTagTableElement(self, token):
        return self._impliedCaptionEnd(token)

    def startTagOther(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def endTagCaption(self, token):
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        current_name = tree.openElements[-1].name
        if current_name != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": current_name})
        # Pop everything above the caption, then the caption itself.
        while tree.openElements[-1].name != "caption":
            tree.openElements.pop()
        tree.openElements.pop()
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._impliedCaptionEnd(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagOther(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processEndTag(token)
1946
class InColumnGroupPhase(Phase):
    """Phase registered as "inColumnGroup".

    ``<col>`` elements are inserted and immediately popped. Other tokens
    imply ``</colgroup>`` and are reprocessed if that end tag was not
    ignored.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("col", self.startTagCol),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("colgroup", self.endTagColgroup),
            ("col", self.endTagCol),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the html element."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def _impliedColgroupEnd(self):
        """Run an implied ``</colgroup>``; return True if it was not ignored."""
        was_ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return not was_ignored

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        if self._impliedColgroupEnd():
            return True
        return None

    def processCharacters(self, token):
        return token if self._impliedColgroupEnd() else None

    def startTagCol(self, token):
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        return token if self._impliedColgroupEnd() else None

    def endTagColgroup(self, token):
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagCol(self, token):
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        return token if self._impliedColgroupEnd() else None
2012
class InTableBodyPhase(Phase):
    """Phase registered as "inTableBody".

    Handles rows directly, implies a ``<tr>`` for stray cells, closes the
    open row group for table-structure tags, and delegates the rest to the
    "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        start_handlers = [
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             self.startTagTableOther),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a tbody, tfoot, thead or html is current."""
        stop_names = ("tbody", "tfoot", "thead", "html")
        open_elements = self.tree.openElements
        while open_elements[-1].name not in stop_names:
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            open_elements.pop()
        if open_elements[-1].name == "html":
            assert self.parser.innerHTML

    def _closeRowGroupAndReprocess(self, token):
        """Close the open row group and hand ``token`` back for reprocessing.

        When no tbody, thead or tfoot is in table scope (innerHTML case), a
        parse error is reported and ``None`` is returned instead.
        """
        in_scope = self.tree.elementInScope
        if not any(in_scope(group, variant="table")
                   for group in ("tbody", "thead", "tfoot")):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        current = self.tree.openElements[-1]
        self.endTagTableRowGroup(impliedTagToken(current.name))
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processSpaceCharacters(token)

    def processCharacters(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processCharacters(token)

    def startTagTr(self, token):
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        parser = self.parser
        parser.phase = parser.phases["inRow"]

    def startTagTableCell(self, token):
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        implied_row = impliedTagToken("tr", "StartTag")
        self.startTagTr(implied_row)
        return token

    def startTagTableOther(self, token):
        # Shares its logic with endTagTable.
        return self._closeRowGroupAndReprocess(token)

    def startTagOther(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processStartTag(token)

    def endTagTableRowGroup(self, token):
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeRowGroupAndReprocess(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", details)

    def endTagOther(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processEndTag(token)
2110
class InRowPhase(Phase):
    """Phase registered as "inRow".

    Opens cells, implies ``</tr>`` for table-structure tags, and delegates
    the rest to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        start_handlers = [
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
             self.endTagIgnore),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until a tr or html element is current.

        A parse error is reported for every element that gets popped.
        """
        open_elements = self.tree.openElements
        while True:
            current_name = open_elements[-1].name
            if current_name in ("tr", "html"):
                break
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": current_name})
            open_elements.pop()

    def ignoreEndTagTr(self):
        """Return True when no tr element is in table scope."""
        row_in_scope = self.tree.elementInScope("tr", variant="table")
        return not row_in_scope

    def _impliedTrEnd(self, token):
        """Imply ``</tr>``; return ``token`` unless that end tag was ignored."""
        row_missing = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if row_missing else token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processSpaceCharacters(token)

    def processCharacters(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processCharacters(token)

    def startTagTableCell(self, token):
        self.clearStackToTableRowContext()
        tree = self.tree
        tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        return self._impliedTrEnd(token)

    def startTagOther(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processStartTag(token)

    def endTagTr(self, token):
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        return self._impliedTrEnd(token)

    def endTagTableRowGroup(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", details)

    def endTagOther(self, token):
        in_table = self.parser.phases["inTable"]
        return in_table.processEndTag(token)
2199
class InCellPhase(Phase):
    """Phase registered as "inCell".

    Closes the open td/th for table-structure tags and hands them back for
    reprocessing; other tokens are delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        start_handlers = [
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # helper
    def closeCell(self):
        """Close the cell in table scope, preferring td over th."""
        for cell_name in ("td", "th"):
            if self.tree.elementInScope(cell_name, variant="table"):
                self.endTagTableCell(impliedTagToken(cell_name))
                break

    # the rest
    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processCharacters(token)

    def startTagTableOther(self, token):
        in_scope = self.tree.elementInScope
        if not (in_scope("td", variant="table") or
                in_scope("th", variant="table")):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def endTagTableCell(self, token):
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": name})
            return
        self.tree.generateImpliedEndTags(name)
        open_elements = self.tree.openElements
        if open_elements[-1].name == name:
            open_elements.pop()
        else:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": name})
            # Pop up to and including the matching cell element.
            while open_elements.pop().name != name:
                pass
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagImply(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processEndTag(token)
2275
class InSelectPhase(Phase):
    """Phase registered as "inSelect".

    Handles option/optgroup nesting, closes the select for nested select and
    input-like start tags, and reports every other tag as a parse error.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect),
            (("input", "keygen", "textarea"), self.startTagInput),
            ("script", self.startTagScript),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        text = token["data"]
        # A lone NUL character token is dropped.
        if text == "\u0000":
            return
        self.tree.insertText(text)

    def startTagOption(self, token):
        # We need to imply </option> if <option> is the current node.
        open_elements = self.tree.openElements
        if open_elements[-1].name == "option":
            open_elements.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        # Imply </option> and then </optgroup> when each is the current node.
        open_elements = self.tree.openElements
        for implied in ("option", "optgroup"):
            if open_elements[-1].name == implied:
                open_elements.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", details)

    def endTagOption(self, token):
        open_elements = self.tree.openElements
        if open_elements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        open_elements.pop()

    def endTagOptgroup(self, token):
        open_elements = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if (open_elements[-1].name == "option" and
                open_elements[-2].name == "optgroup"):
            open_elements.pop()
        # But nothing else
        if open_elements[-1].name != "optgroup":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})
            return
        # It also closes </optgroup>
        open_elements.pop()

    def endTagSelect(self, token):
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # Pop up to and including the select element.
        open_elements = self.tree.openElements
        while open_elements.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", details)
2375
class InSelectInTablePhase(Phase):
    """Phase registered as "inSelectInTable".

    Table-related tags close the select and are reprocessed; everything else
    is delegated to the "inSelect" phase.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.startTagTable),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.endTagTable),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        in_select = self.parser.phases["inSelect"]
        return in_select.processCharacters(token)

    def startTagTable(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", details)
        # Close the select, then let the caller reprocess the table tag.
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        in_select = self.parser.phases["inSelect"]
        return in_select.processStartTag(token)

    def endTagTable(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", details)
        # The end tag is only acted on when its element is in table scope.
        if not self.tree.elementInScope(token["name"], variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        in_select = self.parser.phases["inSelect"]
        return in_select.processEndTag(token)
2414
class InForeignContentPhase(Phase):
    """Phase registered as "inForeignContent".

    Handles tokens while the current node is outside the tree's default
    namespace: adjusts MathML/SVG names and attributes on insertion, and
    breaks out of foreign content for the HTML start tags listed in
    ``breakoutElements``.
    """

    # Start tags that pop foreign elements off the stack and are then
    # handed back to the caller for reprocessing.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-cased SVG tag name -> mixed-case spelling. Built once at class
    # creation instead of on every adjustSVGTagNames call.
    svgTagNameReplacements = {"altglyph": "altGlyph",
                              "altglyphdef": "altGlyphDef",
                              "altglyphitem": "altGlyphItem",
                              "animatecolor": "animateColor",
                              "animatemotion": "animateMotion",
                              "animatetransform": "animateTransform",
                              "clippath": "clipPath",
                              "feblend": "feBlend",
                              "fecolormatrix": "feColorMatrix",
                              "fecomponenttransfer": "feComponentTransfer",
                              "fecomposite": "feComposite",
                              "feconvolvematrix": "feConvolveMatrix",
                              "fediffuselighting": "feDiffuseLighting",
                              "fedisplacementmap": "feDisplacementMap",
                              "fedistantlight": "feDistantLight",
                              "feflood": "feFlood",
                              "fefunca": "feFuncA",
                              "fefuncb": "feFuncB",
                              "fefuncg": "feFuncG",
                              "fefuncr": "feFuncR",
                              "fegaussianblur": "feGaussianBlur",
                              "feimage": "feImage",
                              "femerge": "feMerge",
                              "femergenode": "feMergeNode",
                              "femorphology": "feMorphology",
                              "feoffset": "feOffset",
                              "fepointlight": "fePointLight",
                              "fespecularlighting": "feSpecularLighting",
                              "fespotlight": "feSpotLight",
                              "fetile": "feTile",
                              "feturbulence": "feTurbulence",
                              "foreignobject": "foreignObject",
                              "glyphref": "glyphRef",
                              "lineargradient": "linearGradient",
                              "radialgradient": "radialGradient",
                              "textpath": "textPath"}

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

        Names without an entry in ``svgTagNameReplacements`` are left as is.
        """
        replacements = self.svgTagNameReplacements

        if token["name"] in replacements:
            token["name"] = replacements[token["name"]]

    def processCharacters(self, token):
        """Insert character data, replacing a lone NUL with U+FFFD.

        Any non-space character clears ``parser.framesetOK``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out for HTML-only start tags.

        :returns: ``token`` for reprocessing when the tag is a breakout
            element (or a ``font`` tag with color/face/size attributes),
            otherwise ``None``
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & set(["color", "face", "size"]))):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            # Pop until the current node is in the default namespace or is
            # an HTML / MathML-text integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            # The new element takes the namespace of the current node.
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        The stack is walked from the current node downwards, comparing
        lower-cased node names with the tag name. On a match everything up
        to and including that node is popped. On reaching a node in the
        default namespace, the token is passed to the parser's current phase
        and that result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
2529
class AfterBodyPhase(Phase):
    """Phase registered as "afterBody".

    Comments are attached to the first open element. Stray characters and
    tags (other than html) report a parse error and send the parser back to
    the "inBody" phase for reprocessing.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [("html", self.endTagHtml)]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def _reprocessInBody(self, token):
        """Switch to the "inBody" phase and hand ``token`` back."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        # Stop parsing
        pass

    def processComment(self, token):
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        root = self.tree.openElements[0]
        self.tree.insertComment(token, root)

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-body")
        return self._reprocessInBody(token)

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("unexpected-start-tag-after-body",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    def endTagHtml(self, name):
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
        else:
            parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag-after-body",
                               {"name": token["name"]})
        return self._reprocessInBody(token)
2576
class InFramesetPhase(Phase):
    """Phase registered as "inFrameset".

    Inserts frameset and frame elements, delegates noframes to "inBody", and
    reports characters and every other tag as parse errors.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("frameset", self.endTagFrameset),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        # The frame element is inserted and popped straight away.
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()

    def startTagNoframes(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        open_elements = self.tree.openElements
        if open_elements[-1].name == "html":
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            open_elements.pop()
        if self.parser.innerHTML or open_elements[-1].name == "frameset":
            return
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)
2633
class AfterFramesetPhase(Phase):
    """Phase registered as "afterFrameset".

    ``</html>`` moves the parser to "afterAfterFrameset", noframes is
    delegated to "inHead", and characters and other tags are parse errors.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

        end_handlers = [
            ("html", self.endTagHtml),
        ]
        self.endTagHandler = _utils.MethodDispatcher(end_handlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        # Stop parsing
        pass

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)
2670
class AfterAfterBodyPhase(Phase):
    """Phase registered as "afterAfterBody".

    Comments are attached to the document. Non-space characters and tags
    (other than an html start tag) report a parse error and send the parser
    back to the "inBody" phase for reprocessing.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        start_handlers = [
            ("html", self.startTagHtml),
        ]
        self.startTagHandler = _utils.MethodDispatcher(start_handlers)
        self.startTagHandler.default = self.startTagOther

    def _reprocessInBody(self, token):
        """Switch to the "inBody" phase and hand ``token`` back."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        pass

    def processComment(self, token):
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")
        return self._reprocessInBody(token)

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("expected-eof-but-got-start-tag",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    def processEndTag(self, token):
        self.parser.parseError("expected-eof-but-got-end-tag",
                               {"name": token["name"]})
        return self._reprocessInBody(token)
2708
class AfterAfterFramesetPhase(Phase):
    """Phase handler registered as "afterAfterFrameset" in the phase table.

    EOF is accepted silently, comments are attached to the document,
    whitespace and ``html`` start tags are forwarded to the "inBody" phase
    and ``noframes`` start tags to the "inHead" phase.  Any other character
    data, start tag or end tag is only reported as an
    ``expected-eof-but-got-*`` parse error; the phase is left unchanged and
    nothing is returned.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # "html" and "noframes" have dedicated handlers; every other start
        # tag falls through to startTagOther.
        dispatcher = _utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoFrames)
        ])
        dispatcher.default = self.startTagOther
        self.startTagHandler = dispatcher

    def processEOF(self):
        """Nothing to do at end-of-file."""
        return None

    def processComment(self, token):
        """Insert the comment with the document itself as its parent."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Forward whitespace to the "inBody" phase and return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report the character token as a parse error."""
        report = self.parser.parseError
        report("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Forward an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagNoFrames(self, token):
        """Forward a ``noframes`` start tag to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as a parse error."""
        report = self.parser.parseError
        report("expected-eof-but-got-start-tag", {"name": token["name"]})

    def processEndTag(self, token):
        """Report any end tag as a parse error."""
        report = self.parser.parseError
        report("expected-eof-but-got-end-tag", {"name": token["name"]})
2744 # pylint:enable=unused-argument
2745
2746 return {
2747 "initial": InitialPhase,
2748 "beforeHtml": BeforeHtmlPhase,
2749 "beforeHead": BeforeHeadPhase,
2750 "inHead": InHeadPhase,
2751 "inHeadNoscript": InHeadNoscriptPhase,
2752 "afterHead": AfterHeadPhase,
2753 "inBody": InBodyPhase,
2754 "text": TextPhase,
2755 "inTable": InTablePhase,
2756 "inTableText": InTableTextPhase,
2757 "inCaption": InCaptionPhase,
2758 "inColumnGroup": InColumnGroupPhase,
2759 "inTableBody": InTableBodyPhase,
2760 "inRow": InRowPhase,
2761 "inCell": InCellPhase,
2762 "inSelect": InSelectPhase,
2763 "inSelectInTable": InSelectInTablePhase,
2764 "inForeignContent": InForeignContentPhase,
2765 "afterBody": AfterBodyPhase,
2766 "inFrameset": InFramesetPhase,
2767 "afterFrameset": AfterFramesetPhase,
2768 "afterAfterBody": AfterAfterBodyPhase,
2769 "afterAfterFrameset": AfterAfterFramesetPhase,
2770 # XXX after after frameset
2771 }
2772
2773
def adjust_attributes(token, replacements):
    """Rename attributes of ``token`` in place according to ``replacements``.

    :arg token: token dict whose ``'data'`` entry maps attribute names to
        attribute values

    :arg replacements: mapping from an attribute name to the name that
        should replace it

    ``token['data']`` is rebuilt as an ``OrderedDict`` (attributes keep their
    iteration order, renamed keys take the place of the old ones) only when
    at least one attribute name has an entry in ``replacements``; otherwise
    the existing mapping object is left untouched.

    :returns: None
    """
    attributes = token['data']
    # A membership scan stops at the first attribute that needs renaming,
    # instead of intersecting two key views and materialising a set that is
    # only ever tested for emptiness.
    if any(name in replacements for name in attributes):
        token['data'] = OrderedDict((replacements.get(name, name), value)
                                    for name, value in attributes.items())
2779
2780
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that was not literally in the input.

    :arg name: tag name stored under ``"name"``

    :arg type: key looked up in ``tokenTypes``; the result is stored under
        ``"type"``

    :arg attributes: mapping stored under ``"data"``; a fresh empty dict is
        used when omitted or ``None``

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: the new token dict
    """
    data = {} if attributes is None else attributes
    token = {
        "type": tokenTypes[type],
        "name": name,
        "data": data,
        "selfClosing": selfClosing,
    }
    return token
2787
2788
class ParseError(Exception):
    """Exception type representing an error in the parsed document.

    Adds nothing to :class:`Exception`; it exists so callers can catch
    this specific type.
    """