Mercurial > repos > guerler > hhblits
diff lib/python3.8/site-packages/pip/_vendor/html5lib/html5parser.py @ 0:9e54283cc701 draft
"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author | guerler |
---|---|
date | Mon, 27 Jul 2020 03:47:31 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python3.8/site-packages/pip/_vendor/html5lib/html5parser.py Mon Jul 27 03:47:31 2020 -0400 @@ -0,0 +1,2791 @@ +from __future__ import absolute_import, division, unicode_literals +from pip._vendor.six import with_metaclass, viewkeys + +import types +from collections import OrderedDict + +from . import _inputstream +from . import _tokenizer + +from . import treebuilders +from .treebuilders.base import Marker + +from . import _utils +from .constants import ( + spaceCharacters, asciiUpper2Lower, + specialElements, headingElements, cdataElements, rcdataElements, + tokenTypes, tagTokenTypes, + namespaces, + htmlIntegrationPointElements, mathmlTextIntegrationPointElements, + adjustForeignAttributes as adjustForeignAttributesMap, + adjustMathMLAttributes, adjustSVGAttributes, + E, + _ReparseException +) + + +def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): + """Parse an HTML document as a string or file-like object into a tree + + :arg doc: the document to parse as a string or file-like object + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import parse + >>> parse('<html><body><p>This is a doc</p></body></html>') + <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> + + """ + tb = treebuilders.getTreeBuilder(treebuilder) + p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) + return p.parse(doc, **kwargs) + + +def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): + """Parse an HTML fragment as a string or file-like object into a tree + + :arg doc: the fragment to parse as a string or file-like object + + :arg container: the container context to parse the fragment in + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import parseFragment + >>> parseFragment('<b>this is a fragment</b>') + <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> + + """ + tb = treebuilders.getTreeBuilder(treebuilder) + p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) + return p.parseFragment(doc, container=container, **kwargs) + + +def method_decorator_metaclass(function): + class Decorated(type): + def __new__(meta, classname, bases, classDict): + for attributeName, attribute in classDict.items(): + if isinstance(attribute, types.FunctionType): + attribute = function(attribute) + + classDict[attributeName] = attribute + return type.__new__(meta, classname, bases, classDict) + return Decorated + + +class HTMLParser(object): + """HTML parser + + Generates a tree structure from a stream of (possibly malformed) HTML. + + """ + + def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): + """ + :arg tree: a treebuilder class controlling the type of tree that will be + returned. Built in treebuilders can be accessed through + html5lib.treebuilders.getTreeBuilder(treeType) + + :arg strict: raise an exception when a parse error is encountered + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :arg debug: whether or not to enable debug mode which logs things + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() # generates parser with etree builder + >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict + + """ + + # Raise an exception on the first error encountered + self.strict = strict + + if tree is None: + tree = treebuilders.getTreeBuilder("etree") + self.tree = tree(namespaceHTMLElements) + self.errors = [] + + self.phases = dict([(name, cls(self, self.tree)) for name, cls in + getPhases(debug).items()]) + + def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): + + self.innerHTMLMode = innerHTML + self.container = container + self.scripting = scripting + self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) + self.reset() + + try: + self.mainLoop() + except _ReparseException: + self.reset() + self.mainLoop() + + def reset(self): + self.tree.reset() + self.firstStartTag = False + self.errors = [] + self.log = [] # only used with debug mode + # "quirks" / "limited quirks" / "no quirks" + self.compatMode = "no quirks" + + if self.innerHTMLMode: + self.innerHTML = self.container.lower() + + if self.innerHTML in cdataElements: + self.tokenizer.state = self.tokenizer.rcdataState + elif self.innerHTML in rcdataElements: + self.tokenizer.state = self.tokenizer.rawtextState + elif self.innerHTML == 'plaintext': + self.tokenizer.state = self.tokenizer.plaintextState + else: + # state already is data state + # self.tokenizer.state = self.tokenizer.dataState + pass + self.phase = self.phases["beforeHtml"] + self.phase.insertHtmlElement() + self.resetInsertionMode() + else: + self.innerHTML = False # pylint:disable=redefined-variable-type + self.phase = self.phases["initial"] + + self.lastPhase = None + + self.beforeRCDataPhase = None + + self.framesetOK = True + + @property + def documentEncoding(self): + """Name of the character encoding that was used to decode the input stream, or + :obj:`None` if that is not determined yet + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.charEncoding[0].name + + def isHTMLIntegrationPoint(self, element): + if (element.name == "annotation-xml" and + element.namespace == namespaces["mathml"]): + return ("encoding" in element.attributes and + element.attributes["encoding"].translate( + asciiUpper2Lower) in + ("text/html", "application/xhtml+xml")) + else: + return (element.namespace, element.name) in htmlIntegrationPointElements + + def isMathMLTextIntegrationPoint(self, element): + return (element.namespace, element.name) in mathmlTextIntegrationPointElements + + def mainLoop(self): + CharactersToken = tokenTypes["Characters"] + SpaceCharactersToken = tokenTypes["SpaceCharacters"] + StartTagToken = tokenTypes["StartTag"] + EndTagToken = tokenTypes["EndTag"] + CommentToken = tokenTypes["Comment"] + DoctypeToken = tokenTypes["Doctype"] + ParseErrorToken = tokenTypes["ParseError"] + + for token in self.normalizedTokens(): + prev_token = None + new_token = token + while new_token is not None: + prev_token = new_token + currentNode = self.tree.openElements[-1] if self.tree.openElements else None + currentNodeNamespace = currentNode.namespace if currentNode else None + currentNodeName = currentNode.name if currentNode else None + + type = new_token["type"] + + if type == ParseErrorToken: + self.parseError(new_token["data"], new_token.get("datavars", {})) + new_token = None + else: + if (len(self.tree.openElements) == 0 or + currentNodeNamespace == self.tree.defaultNamespace or + (self.isMathMLTextIntegrationPoint(currentNode) and + ((type == StartTagToken and + token["name"] not in frozenset(["mglyph", "malignmark"])) or + type in (CharactersToken, SpaceCharactersToken))) or + (currentNodeNamespace == namespaces["mathml"] and + currentNodeName == "annotation-xml" and + type == StartTagToken and + token["name"] == "svg") or + (self.isHTMLIntegrationPoint(currentNode) and + type in (StartTagToken, CharactersToken, SpaceCharactersToken))): + phase = self.phase + else: + phase = self.phases["inForeignContent"] + + if type == CharactersToken: + new_token = phase.processCharacters(new_token) + elif type == SpaceCharactersToken: + new_token = phase.processSpaceCharacters(new_token) + elif type == StartTagToken: + new_token = phase.processStartTag(new_token) + elif type == EndTagToken: + new_token = phase.processEndTag(new_token) + elif type == CommentToken: + new_token = phase.processComment(new_token) + elif type == DoctypeToken: + new_token = phase.processDoctype(new_token) + + if (type == StartTagToken and prev_token["selfClosing"] and + not prev_token["selfClosingAcknowledged"]): + self.parseError("non-void-element-with-trailing-solidus", + {"name": prev_token["name"]}) + + # When the loop finishes it's EOF + reprocess = True + phases = [] + while reprocess: + phases.append(self.phase) + reprocess = self.phase.processEOF() + if reprocess: + assert self.phase not in phases + + def normalizedTokens(self): + for token in self.tokenizer: + yield self.normalizeToken(token) + + def parse(self, stream, *args, **kwargs): + """Parse a HTML document into a well-formed tree + + :arg stream: a file-like object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element). + + :arg scripting: treat noscript elements as if JavaScript was turned on + + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parse('<html><body><p>This is a doc</p></body></html>') + <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> + + """ + self._parse(stream, False, None, *args, **kwargs) + return self.tree.getDocument() + + def parseFragment(self, stream, *args, **kwargs): + """Parse a HTML fragment into a well-formed tree fragment + + :arg container: name of the element we're setting the innerHTML + property if set to None, default to 'div' + + :arg stream: a file-like object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + + :arg scripting: treat noscript elements as if JavaScript was turned on + + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parseFragment('<b>this is a fragment</b>') + <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> + + """ + self._parse(stream, True, *args, **kwargs) + return self.tree.getFragment() + + def parseError(self, errorcode="XXX-undefined-error", datavars=None): + # XXX The idea is to make errorcode mandatory. + if datavars is None: + datavars = {} + self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) + if self.strict: + raise ParseError(E[errorcode] % datavars) + + def normalizeToken(self, token): + # HTML5 specific normalizations to the token stream + if token["type"] == tokenTypes["StartTag"]: + raw = token["data"] + token["data"] = OrderedDict(raw) + if len(raw) > len(token["data"]): + # we had some duplicated attribute, fix so first wins + token["data"].update(raw[::-1]) + + return token + + def adjustMathMLAttributes(self, token): + adjust_attributes(token, adjustMathMLAttributes) + + def adjustSVGAttributes(self, token): + adjust_attributes(token, adjustSVGAttributes) + + def adjustForeignAttributes(self, token): + adjust_attributes(token, adjustForeignAttributesMap) + + def reparseTokenNormal(self, token): + # pylint:disable=unused-argument + self.parser.phase() + + def resetInsertionMode(self): + # The name of this method is mostly historical. (It's also used in the + # specification.) + last = False + newModes = { + "select": "inSelect", + "td": "inCell", + "th": "inCell", + "tr": "inRow", + "tbody": "inTableBody", + "thead": "inTableBody", + "tfoot": "inTableBody", + "caption": "inCaption", + "colgroup": "inColumnGroup", + "table": "inTable", + "head": "inBody", + "body": "inBody", + "frameset": "inFrameset", + "html": "beforeHead" + } + for node in self.tree.openElements[::-1]: + nodeName = node.name + new_phase = None + if node == self.tree.openElements[0]: + assert self.innerHTML + last = True + nodeName = self.innerHTML + # Check for conditions that should only happen in the innerHTML + # case + if nodeName in ("select", "colgroup", "head", "html"): + assert self.innerHTML + + if not last and node.namespace != self.tree.defaultNamespace: + continue + + if nodeName in newModes: + new_phase = self.phases[newModes[nodeName]] + break + elif last: + new_phase = self.phases["inBody"] + break + + self.phase = new_phase + + def parseRCDataRawtext(self, token, contentType): + # Generic RCDATA/RAWTEXT Parsing algorithm + assert contentType in ("RAWTEXT", "RCDATA") + + self.tree.insertElement(token) + + if contentType == "RAWTEXT": + self.tokenizer.state = self.tokenizer.rawtextState + else: + self.tokenizer.state = self.tokenizer.rcdataState + + self.originalPhase = self.phase + + self.phase = self.phases["text"] + + +@_utils.memoize +def getPhases(debug): + def log(function): + """Logger that records which phase processes each token""" + type_names = dict((value, key) for key, value in + tokenTypes.items()) + + def wrapped(self, *args, **kwargs): + if function.__name__.startswith("process") and len(args) > 0: + token = args[0] + try: + info = {"type": type_names[token['type']]} + except: + raise + if token['type'] in tagTokenTypes: + info["name"] = token['name'] + + self.parser.log.append((self.parser.tokenizer.state.__name__, + self.parser.phase.__class__.__name__, + self.__class__.__name__, + function.__name__, + info)) + return function(self, *args, **kwargs) + else: + return function(self, *args, **kwargs) + return wrapped + + def getMetaclass(use_metaclass, metaclass_func): + if use_metaclass: + return method_decorator_metaclass(metaclass_func) + else: + return type + + # pylint:disable=unused-argument + class Phase(with_metaclass(getMetaclass(debug, log))): + """Base class for helper object that implements each phase of processing + """ + + def __init__(self, parser, tree): + self.parser = parser + self.tree = tree + + def processEOF(self): + raise NotImplementedError + + def processComment(self, token): + # For most phases the following is correct. Where it's not it will be + # overridden. + self.tree.insertComment(token, self.tree.openElements[-1]) + + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) + + def processStartTag(self, token): + return self.startTagHandler[token["name"]](token) + + def startTagHtml(self, token): + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") + # XXX Need a check here to see if the first start tag token emitted is + # this token... If it's not, invoke self.parser.parseError(). + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[0].attributes: + self.tree.openElements[0].attributes[attr] = value + self.parser.firstStartTag = False + + def processEndTag(self, token): + return self.endTagHandler[token["name"]](token) + + class InitialPhase(Phase): + def processSpaceCharacters(self, token): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): + self.parser.parseError("unknown-doctype") + + if publicId is None: + publicId = "" + + self.tree.insertDoctype(token) + + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if (not correct or token["name"] != "html" or + publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + self.parser.compatMode = "quirks" + elif (publicId.startswith( + ("-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is not None): + self.parser.compatMode = "limited quirks" + + self.parser.phase = self.parser.phases["beforeHtml"] + + def anythingElse(self): + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.anythingElse() + return token + + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEOF(self): + self.parser.parseError("expected-doctype-but-got-eof") + self.anythingElse() + return True + + class BeforeHtmlPhase(Phase): + # helper methods + def insertHtmlElement(self): + self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.parser.phase = self.parser.phases["beforeHead"] + + # other + def processEOF(self): + self.insertHtmlElement() + return True + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.insertHtmlElement() + return token + + def processStartTag(self, token): + if token["name"] == "html": + self.parser.firstStartTag = True + self.insertHtmlElement() + return token + + def processEndTag(self, token): + if token["name"] not in ("head", "body", "html", "br"): + self.parser.parseError("unexpected-end-tag-before-html", + {"name": token["name"]}) + else: + self.insertHtmlElement() + return token + + class BeforeHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + (("head", "body", "html", "br"), self.endTagImplyHead) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.startTagHead(impliedTagToken("head", "StartTag")) + return True + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagHead(self, token): + self.tree.insertElement(token) + self.tree.headPointer = self.tree.openElements[-1] + self.parser.phase = self.parser.phases["inHead"] + + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token + + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token + + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) + + class InHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("title", self.startTagTitle), + (("noframes", "style"), self.startTagNoFramesStyle), + ("noscript", self.startTagNoscript), + ("script", self.startTagScript), + (("base", "basefont", "bgsound", "command", "link"), + self.startTagBaseLinkCommand), + ("meta", self.startTagMeta), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("head", self.endTagHead), + (("br", "html", "body"), self.endTagHtmlBodyBr) + ]) + self.endTagHandler.default = self.endTagOther + + # the real thing + def processEOF(self): + self.anythingElse() + return True + + def processCharacters(self, token): + self.anythingElse() + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") + + def startTagBaseLinkCommand(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif ("content" in attributes and + "http-equiv" in attributes and + attributes["http-equiv"].lower() == "content-type"): + # Encoding it as UTF-8 here is a hack, as really we should pass + # the abstract Unicode string, and just use the + # ContentAttrParser on that, but using UTF-8 allows all chars + # to be encoded and as a ASCII-superset works. + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataRawtext(token, "RCDATA") + + def startTagNoFramesStyle(self, token): + # Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagNoscript(self, token): + if self.parser.scripting: + self.parser.parseRCDataRawtext(token, "RAWTEXT") + else: + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inHeadNoscript"] + + def startTagScript(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState + self.parser.originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["text"] + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s" % node.name + self.parser.phase = self.parser.phases["afterHead"] + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.endTagHead(impliedTagToken("head")) + + class InHeadNoscriptPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), + (("head", "noscript"), self.startTagHeadNoscript), + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("noscript", self.endTagNoscript), + ("br", self.endTagBr), + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.parser.parseError("eof-in-head-noscript") + self.anythingElse() + return True + + def processComment(self, token): + return self.parser.phases["inHead"].processComment(token) + + def processCharacters(self, token): + self.parser.parseError("char-in-head-noscript") + self.anythingElse() + return token + + def processSpaceCharacters(self, token): + return self.parser.phases["inHead"].processSpaceCharacters(token) + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBaseLinkCommand(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagHeadNoscript(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagNoscript(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "noscript", "Expected noscript got %s" % node.name + self.parser.phase = self.parser.phases["inHead"] + + def endTagBr(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + # Caller must raise parse error first! + self.endTagNoscript(impliedTagToken("noscript")) + + class AfterHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("body", self.startTagBody), + ("frameset", self.startTagFrameset), + (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", + "style", "title"), + self.startTagFromHead), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + self.endTagHtmlBodyBr)]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.anythingElse() + return True + + def processCharacters(self, token): + self.anythingElse() + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inBody"] + + def startTagFrameset(self, token): + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break + + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.parser.phase = self.parser.phases["inBody"] + self.parser.framesetOK = True + + class InBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody + # the really-really-really-very crazy mode + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("base", "basefont", "bgsound", "command", "link", "meta", + "script", "style", "title"), + self.startTagProcessInHead), + ("body", self.startTagBody), + ("frameset", self.startTagFrameset), + (("address", "article", "aside", "blockquote", "center", "details", + "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", + "section", "summary", "ul"), + self.startTagCloseP), + (headingElements, self.startTagHeading), + (("pre", "listing"), self.startTagPreListing), + ("form", self.startTagForm), + (("li", "dd", "dt"), self.startTagListItem), + ("plaintext", self.startTagPlaintext), + ("a", self.startTagA), + (("b", "big", "code", "em", "font", "i", "s", "small", "strike", + "strong", "tt", "u"), self.startTagFormatting), + ("nobr", self.startTagNobr), + ("button", self.startTagButton), + (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), + ("xmp", self.startTagXmp), + ("table", self.startTagTable), + (("area", "br", "embed", "img", "keygen", "wbr"), + self.startTagVoidFormatting), + (("param", "source", "track"), self.startTagParamSource), + ("input", self.startTagInput), + ("hr", self.startTagHr), + ("image", self.startTagImage), + ("isindex", self.startTagIsIndex), + ("textarea", self.startTagTextarea), + ("iframe", self.startTagIFrame), + ("noscript", self.startTagNoscript), + (("noembed", "noframes"), self.startTagRawtext), + ("select", self.startTagSelect), + (("rp", "rt"), self.startTagRpRt), + (("option", "optgroup"), self.startTagOpt), + (("math"), self.startTagMath), + (("svg"), self.startTagSvg), + (("caption", "col", "colgroup", "frame", "head", + "tbody", "td", "tfoot", "th", "thead", + "tr"), self.startTagMisplaced) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("body", self.endTagBody), + ("html", self.endTagHtml), + (("address", "article", "aside", "blockquote", "button", "center", + "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", + "section", "summary", "ul"), self.endTagBlock), + ("form", self.endTagForm), + ("p", self.endTagP), + (("dd", "dt", "li"), self.endTagListItem), + (headingElements, self.endTagHeading), + (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", + "strike", "strong", "tt", "u"), self.endTagFormatting), + (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), + ("br", self.endTagBr), + ]) + self.endTagHandler.default = self.endTagOther + + def isMatchingFormattingElement(self, node1, node2): + return (node1.name == node2.name and + node1.namespace == node2.namespace and + node1.attributes == node2.attributes) + + # helper + def addFormattingElement(self, token): + self.tree.insertElement(token) + element = self.tree.openElements[-1] + + matchingElements = [] + for node in self.tree.activeFormattingElements[::-1]: + if node is Marker: + break + elif self.isMatchingFormattingElement(node, element): + matchingElements.append(node) + + assert len(matchingElements) <= 3 + if len(matchingElements) == 3: + self.tree.activeFormattingElements.remove(matchingElements[-1]) + self.tree.activeFormattingElements.append(element) + + # the real deal + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + # Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we + # want to drop leading newlines + data = token["data"] + self.processSpaceCharacters = self.processSpaceCharactersNonPre + if (data.startswith("\n") and + self.tree.openElements[-1].name in ("pre", "listing", "textarea") and + not self.tree.openElements[-1].hasContent()): + data = data[1:] + if data: + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(data) + + def processCharacters(self, token): + if token["data"] == "\u0000": + # The tokenizer should always emit null on its own + return + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) + # This must be bad for performance + if (self.parser.framesetOK and + any([char not in spaceCharacters + for char in token["data"]])): + self.parser.framesetOK = False + + def processSpaceCharactersNonPre(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) + + def startTagProcessInHead(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + else: + self.parser.framesetOK = False + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[1].attributes: + self.tree.openElements[1].attributes[attr] = value + + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagCloseP(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + + def startTagPreListing(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.processSpaceCharacters = self.processSpaceCharactersDropNewline + + def startTagForm(self, token): + if self.tree.formPointer: + self.parser.parseError("unexpected-start-tag", {"name": "form"}) + else: + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.tree.formPointer = self.tree.openElements[-1] + + def startTagListItem(self, token): + self.parser.framesetOK = False + + stopNamesMap = {"li": ["li"], + "dt": ["dt", "dd"], + "dd": ["dt", "dd"]} + stopNames = stopNamesMap[token["name"]] + for node in reversed(self.tree.openElements): + if node.name in stopNames: + self.parser.phase.processEndTag( + impliedTagToken(node.name, "EndTag")) + break + if (node.nameTuple in specialElements and + node.name not in ("address", "div", "p")): + break + + if self.tree.elementInScope("p", variant="button"): + self.parser.phase.processEndTag( + impliedTagToken("p", "EndTag")) + + self.tree.insertElement(token) + + def startTagPlaintext(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.plaintextState + + def startTagHeading(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagA(self, token): + afeAElement = self.tree.elementInActiveFormattingElements("a") + if afeAElement: + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) + if afeAElement in self.tree.openElements: + self.tree.openElements.remove(afeAElement) + if afeAElement in self.tree.activeFormattingElements: + self.tree.activeFormattingElements.remove(afeAElement) + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagNobr(self, token): + self.tree.reconstructActiveFormattingElements() + if self.tree.elementInScope("nobr"): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) + # XXX Need tests that trigger the following + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagButton(self, token): + if self.tree.elementInScope("button"): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + return token + else: + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + + def startTagAppletMarqueeObject(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False + + def startTagXmp(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.reconstructActiveFormattingElements() + self.parser.framesetOK = False + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p", variant="button"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.parser.phase = self.parser.phases["inTable"] + + def startTagVoidFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagInput(self, token): + framesetOK = self.parser.framesetOK + self.startTagVoidFormatting(token) + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + # input type=hidden doesn't change framesetOK + self.parser.framesetOK = framesetOK + + def startTagParamSource(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagHr(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagImage(self, token): + # No really... + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) + + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) + if self.tree.formPointer: + return + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) + # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Enter search keywords: " + self.processCharacters( + {"type": tokenTypes["Characters"], "data": prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] + attributes["name"] = "isindex" + self.processStartTag(impliedTagToken("input", "StartTag", + attributes=attributes, + selfClosing=token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) + + def startTagTextarea(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.rcdataState + self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False + + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagRawtext(token) + + def startTagNoscript(self, token): + if self.parser.scripting: + self.startTagRawtext(token) + else: + self.startTagOther(token) + + def startTagRawtext(self, token): + """iframe, noembed noframes, noscript(if scripting enabled)""" + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagOpt(self, token): + if self.tree.openElements[-1].name == "option": + self.parser.phase.processEndTag(impliedTagToken("option")) + self.tree.reconstructActiveFormattingElements() + self.parser.tree.insertElement(token) + + def startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] + + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + self.tree.insertElement(token) + + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + # Need to get the parse error right for the case where the token + # has a namespace not equal to the xmlns attribute + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + # Need to get the parse error right for the case where the token + # has a namespace not equal to the xmlns attribute + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMisplaced(self, token): + """ Elements that should be children of other elements that have a + different insertion mode; here they are ignored + "caption", "col", "colgroup", "frame", "frameset", "head", + "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", + "tr", "noscript" + """ + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) + + def startTagOther(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + + def endTagP(self, token): + if not self.tree.elementInScope("p", variant="button"): + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + self.endTagP(impliedTagToken("p", "EndTag")) + else: + self.tree.generateImpliedEndTags("p") + if self.tree.openElements[-1].name != "p": + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + node = self.tree.openElements.pop() + while node.name != "p": + node = self.tree.openElements.pop() + + def endTagBody(self, token): + if not self.tree.elementInScope("body"): + self.parser.parseError() + return + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "optgroup", + "option", "p", "rp", "rt", + "tbody", "td", "tfoot", + "th", "thead", "tr", "body", + "html")): + # Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"gotName": "body", "expectedName": node.name}) + break + self.parser.phase = self.parser.phases["afterBody"] + + def endTagHtml(self, token): + # We repeat the test for the body end tag token being ignored here + if self.tree.elementInScope("body"): + self.endTagBody(impliedTagToken("body")) + return token + + def endTagBlock(self, token): + # Put us back in the right whitespace handling mode + if token["name"] == "pre": + self.processSpaceCharacters = self.processSpaceCharactersNonPre + inScope = self.tree.elementInScope(token["name"]) + if inScope: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + if inScope: + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() + + def endTagForm(self, token): + node = self.tree.formPointer + self.tree.formPointer = None + if node is None or not self.tree.elementInScope(node): + self.parser.parseError("unexpected-end-tag", + {"name": "form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1] != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) + + def endTagListItem(self, token): + if token["name"] == "li": + variant = "list" + else: + variant = None + if not self.tree.elementInScope(token["name"], variant=variant): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + else: + self.tree.generateImpliedEndTags(exclude=token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError( + "end-tag-too-early", + {"name": token["name"]}) + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() + + def endTagHeading(self, token): + for item in headingElements: + if self.tree.elementInScope(item): + self.tree.generateImpliedEndTags() + break + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + + for item in headingElements: + if self.tree.elementInScope(item): + item = self.tree.openElements.pop() + while item.name not in headingElements: + item = self.tree.openElements.pop() + break + + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" + # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 + # XXX Better parseError messages appreciated. + + # Step 1 + outerLoopCounter = 0 + + # Step 2 + while outerLoopCounter < 8: + + # Step 3 + outerLoopCounter += 1 + + # Step 4: + + # Let the formatting element be the last element in + # the list of active formatting elements that: + # - is between the end of the list and the last scope + # marker in the list, if any, or the start of the list + # otherwise, and + # - has the same tag name as the token. + formattingElement = self.tree.elementInActiveFormattingElements( + token["name"]) + if (not formattingElement or + (formattingElement in self.tree.openElements and + not self.tree.elementInScope(formattingElement.name))): + # If there is no such node, then abort these steps + # and instead act as described in the "any other + # end tag" entry below. + self.endTagOther(token) + return + + # Otherwise, if there is such a node, but that node is + # not in the stack of open elements, then this is a + # parse error; remove the element from the list, and + # abort these steps. + elif formattingElement not in self.tree.openElements: + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) + self.tree.activeFormattingElements.remove(formattingElement) + return + + # Otherwise, if there is such a node, and that node is + # also in the stack of open elements, but the element + # is not in scope, then this is a parse error; ignore + # the token, and abort these steps. + elif not self.tree.elementInScope(formattingElement.name): + self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) + return + + # Otherwise, there is a formatting element and that + # element is in the stack and is in scope. If the + # element is not the current node, this is a parse + # error. In any case, proceed with the algorithm as + # written in the following steps. + else: + if formattingElement != self.tree.openElements[-1]: + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) + + # Step 5: + + # Let the furthest block be the topmost node in the + # stack of open elements that is lower in the stack + # than the formatting element, and is an element in + # the special category. There might not be one. + afeIndex = self.tree.openElements.index(formattingElement) + furthestBlock = None + for element in self.tree.openElements[afeIndex:]: + if element.nameTuple in specialElements: + furthestBlock = element + break + + # Step 6: + + # If there is no furthest block, then the UA must + # first pop all the nodes from the bottom of the stack + # of open elements, from the current node up to and + # including the formatting element, then remove the + # formatting element from the list of active + # formatting elements, and finally abort these steps. + if furthestBlock is None: + element = self.tree.openElements.pop() + while element != formattingElement: + element = self.tree.openElements.pop() + self.tree.activeFormattingElements.remove(element) + return + + # Step 7 + commonAncestor = self.tree.openElements[afeIndex - 1] + + # Step 8: + # The bookmark is supposed to help us identify where to reinsert + # nodes in step 15. We have to ensure that we reinsert nodes after + # the node before the active formatting element. Note the bookmark + # can move in step 9.7 + bookmark = self.tree.activeFormattingElements.index(formattingElement) + + # Step 9 + lastNode = node = furthestBlock + innerLoopCounter = 0 + + index = self.tree.openElements.index(node) + while innerLoopCounter < 3: + innerLoopCounter += 1 + # Node is element before node in open elements + index -= 1 + node = self.tree.openElements[index] + if node not in self.tree.activeFormattingElements: + self.tree.openElements.remove(node) + continue + # Step 9.6 + if node == formattingElement: + break + # Step 9.7 + if lastNode == furthestBlock: + bookmark = self.tree.activeFormattingElements.index(node) + 1 + # Step 9.8 + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + # Step 9.9 + # Remove lastNode from its parents, if any + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + node.appendChild(lastNode) + # Step 9.10 + lastNode = node + + # Step 10 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster + # parent the lastNode + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + + if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): + parent, insertBefore = self.tree.getTableMisnestedNodePosition() + parent.insertBefore(lastNode, insertBefore) + else: + commonAncestor.appendChild(lastNode) + + # Step 11 + clone = formattingElement.cloneNode() + + # Step 12 + furthestBlock.reparentChildren(clone) + + # Step 13 + furthestBlock.appendChild(clone) + + # Step 14 + self.tree.activeFormattingElements.remove(formattingElement) + self.tree.activeFormattingElements.insert(bookmark, clone) + + # Step 15 + self.tree.openElements.remove(formattingElement) + self.tree.openElements.insert( + self.tree.openElements.index(furthestBlock) + 1, clone) + + def endTagAppletMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + + if self.tree.elementInScope(token["name"]): + element = self.tree.openElements.pop() + while element.name != token["name"]: + element = self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(impliedTagToken("br", "StartTag")) + self.tree.openElements.pop() + + def endTagOther(self, token): + for node in self.tree.openElements[::-1]: + if node.name == token["name"]: + self.tree.generateImpliedEndTags(exclude=token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + break + else: + if node.nameTuple in specialElements: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break + + class TextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = _utils.MethodDispatcher([]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = _utils.MethodDispatcher([ + ("script", self.endTagScript)]) + self.endTagHandler.default = self.endTagOther + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + return True + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + # The rest of this method is all stuff that only happens if + # document.write works + + def endTagOther(self, token): + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + + class InTablePhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("caption", self.startTagCaption), + ("colgroup", self.startTagColgroup), + ("col", self.startTagCol), + (("tbody", "tfoot", "thead"), self.startTagRowGroup), + (("td", "th", "tr"), self.startTagImplyTbody), + ("table", self.startTagTable), + (("style", "script"), self.startTagStyleScript), + ("input", self.startTagInput), + ("form", self.startTagForm) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("table", self.endTagTable), + (("body", "caption", "col", "colgroup", "html", "tbody", "td", + "tfoot", "th", "thead", "tr"), self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods + def clearStackToTableContext(self): + # "clear the stack back to a table context" + while self.tree.openElements[-1].name not in ("table", "html"): + # self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + # When the current node is <html> it's an innerHTML case + + # processing methods + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + # Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.processSpaceCharacters(token) + + def processCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.processCharacters(token) + + def insertText(self, token): + # If we get here there must be at least one non-whitespace character + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processCharacters(token) + self.tree.insertFromTable = False + + def startTagCaption(self, token): + self.clearStackToTableContext() + self.tree.activeFormattingElements.append(Marker) + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCaption"] + + def startTagColgroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inColumnGroup"] + + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + return token + + def startTagRowGroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inTableBody"] + + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + return token + + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) + if not self.parser.innerHTML: + return token + + def startTagStyleScript(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + self.parser.parseError("unexpected-hidden-input-in-table") + self.tree.insertElement(token) + # XXX associate with form + self.tree.openElements.pop() + else: + self.startTagOther(token) + + def startTagForm(self, token): + self.parser.parseError("unexpected-form-in-table") + if self.tree.formPointer is None: + self.tree.insertElement(token) + self.tree.formPointer = self.tree.openElements[-1] + self.tree.openElements.pop() + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processStartTag(token) + self.tree.insertFromTable = False + + def endTagTable(self, token): + if self.tree.elementInScope("table", variant="table"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "table": + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "table": + self.tree.openElements.pop() + self.tree.openElements.pop() + self.parser.resetInsertionMode() + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processEndTag(token) + self.tree.insertFromTable = False + + class InTableTextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any([item not in spaceCharacters for item in data]): + token = {"type": tokenTypes["Characters"], "data": data} + self.parser.phases["inTable"].insertText(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + def processEOF(self): + self.flushCharacters() + self.parser.phase = self.originalPhase + return True + + def processCharacters(self, token): + if token["data"] == "\u0000": + return + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + # pretty sure we should never reach here + self.characterTokens.append(token) + # assert False + + def processStartTag(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + def processEndTag(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + class InCaptionPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-caption + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.startTagTableElement) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("caption", self.endTagCaption), + ("table", self.endTagTable), + (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", variant="table") + + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + return self.parser.phases["inBody"].processCharacters(token) + + def startTagTableElement(self, token): + self.parser.parseError() + # XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + return token + + def startTagOther(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def endTagCaption(self, token): + if not self.ignoreEndTagCaption(): + # AT this code is quite similar to endTagTable in "InTable" + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "caption": + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "caption": + self.tree.openElements.pop() + self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inTable"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagTable(self, token): + self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + return token + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagOther(self, token): + return self.parser.phases["inBody"].processEndTag(token) + + class InColumnGroupPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-column + + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("col", self.startTagCol) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("colgroup", self.endTagColgroup), + ("col", self.endTagCol) + ]) + self.endTagHandler.default = self.endTagOther + + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" + + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + return True + + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + return token + + def startTagCol(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + return token + + def endTagColgroup(self, token): + if self.ignoreEndTagColgroup(): + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + else: + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] + + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) + + def endTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + return token + + class InTableBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("tr", self.startTagTr), + (("td", "th"), self.startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), + ("table", self.endTagTable), + (("body", "caption", "col", "colgroup", "html", "td", "th", + "tr"), self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods + def clearStackToTableBodyContext(self): + while self.tree.openElements[-1].name not in ("tbody", "tfoot", + "thead", "html"): + # self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + return self.parser.phases["inTable"].processSpaceCharacters(token) + + def processCharacters(self, token): + return self.parser.phases["inTable"].processCharacters(token) + + def startTagTr(self, token): + self.clearStackToTableBodyContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inRow"] + + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + return token + + def startTagTableOther(self, token): + # XXX AT Any ideas on how to share this with endTagTable? + if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + return token + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def startTagOther(self, token): + return self.parser.phases["inTable"].processStartTag(token) + + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.clearStackToTableBodyContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] + else: + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) + + def endTagTable(self, token): + if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + return token + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) + + def endTagOther(self, token): + return self.parser.phases["inTable"].processEndTag(token) + + class InRowPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-row + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("td", "th"), self.startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead", + "tr"), self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("tr", self.endTagTr), + ("table", self.endTagTable), + (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), + (("body", "caption", "col", "colgroup", "html", "td", "th"), + self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods (XXX unify this with other table helper methods) + def clearStackToTableRowContext(self): + while self.tree.openElements[-1].name not in ("tr", "html"): + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", variant="table") + + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + return self.parser.phases["inTable"].processSpaceCharacters(token) + + def processCharacters(self, token): + return self.parser.phases["inTable"].processCharacters(token) + + def startTagTableCell(self, token): + self.clearStackToTableRowContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCell"] + self.tree.activeFormattingElements.append(Marker) + + def startTagTableOther(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr(impliedTagToken("tr")) + # XXX how are we sure it's always ignored in the innerHTML case? + if not ignoreEndTag: + return token + + def startTagOther(self, token): + return self.parser.phases["inTable"].processStartTag(token) + + def endTagTr(self, token): + if not self.ignoreEndTagTr(): + self.clearStackToTableRowContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTableBody"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagTable(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr(impliedTagToken("tr")) + # Reprocess the current tag if the tr end tag was not ignored + # XXX how are we sure it's always ignored in the innerHTML case? + if not ignoreEndTag: + return token + + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagTr(impliedTagToken("tr")) + return token + else: + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) + + def endTagOther(self, token): + return self.parser.phases["inTable"].processEndTag(token) + + class InCellPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-cell + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + (("td", "th"), self.endTagTableCell), + (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), + (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) + ]) + self.endTagHandler.default = self.endTagOther + + # helper + def closeCell(self): + if self.tree.elementInScope("td", variant="table"): + self.endTagTableCell(impliedTagToken("td")) + elif self.tree.elementInScope("th", variant="table"): + self.endTagTableCell(impliedTagToken("th")) + + # the rest + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + return self.parser.phases["inBody"].processCharacters(token) + + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", variant="table") or + self.tree.elementInScope("th", variant="table")): + self.closeCell() + return token + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def startTagOther(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", + {"name": token["name"]}) + while True: + node = self.tree.openElements.pop() + if node.name == token["name"]: + break + else: + self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inRow"] + else: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.closeCell() + return token + else: + # sometimes innerHTML case + self.parser.parseError() + + def endTagOther(self, token): + return self.parser.phases["inBody"].processEndTag(token) + + class InSelectPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("option", self.startTagOption), + ("optgroup", self.startTagOptgroup), + ("select", self.startTagSelect), + (("input", "keygen", "textarea"), self.startTagInput), + ("script", self.startTagScript) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("option", self.endTagOption), + ("optgroup", self.endTagOptgroup), + ("select", self.endTagSelect) + ]) + self.endTagHandler.default = self.endTagOther + + # http://www.whatwg.org/specs/web-apps/current-work/#in-select + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML + + def processCharacters(self, token): + if token["data"] == "\u0000": + return + self.tree.insertText(token["data"]) + + def startTagOption(self, token): + # We need to imply </option> if <option> is the current node. + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagOptgroup(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagSelect(self, token): + self.parser.parseError("unexpected-select-in-select") + self.endTagSelect(impliedTagToken("select")) + + def startTagInput(self, token): + self.parser.parseError("unexpected-input-in-select") + if self.tree.elementInScope("select", variant="select"): + self.endTagSelect(impliedTagToken("select")) + return token + else: + assert self.parser.innerHTML + + def startTagScript(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-select", + {"name": token["name"]}) + + def endTagOption(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "option"}) + + def endTagOptgroup(self, token): + # </optgroup> implicitly closes <option> + if (self.tree.openElements[-1].name == "option" and + self.tree.openElements[-2].name == "optgroup"): + self.tree.openElements.pop() + # It also closes </optgroup> + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + # But nothing else + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "optgroup"}) + + def endTagSelect(self, token): + if self.tree.elementInScope("select", variant="select"): + node = self.tree.openElements.pop() + while node.name != "select": + node = self.tree.openElements.pop() + self.parser.resetInsertionMode() + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) + + class InSelectInTablePhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.startTagTable) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.endTagTable) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.parser.phases["inSelect"].processEOF() + + def processCharacters(self, token): + return self.parser.phases["inSelect"].processCharacters(token) + + def startTagTable(self, token): + self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) + self.endTagOther(impliedTagToken("select")) + return token + + def startTagOther(self, token): + return self.parser.phases["inSelect"].processStartTag(token) + + def endTagTable(self, token): + self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagOther(impliedTagToken("select")) + return token + + def endTagOther(self, token): + return self.parser.phases["inSelect"].processEndTag(token) + + class InForeignContentPhase(Phase): + breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", + "center", "code", "dd", "div", "dl", "dt", + "em", "embed", "h1", "h2", "h3", + "h4", "h5", "h6", "head", "hr", "i", "img", + "li", "listing", "menu", "meta", "nobr", + "ol", "p", "pre", "ruby", "s", "small", + "span", "strong", "strike", "sub", "sup", + "table", "tt", "u", "ul", "var"]) + + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + def adjustSVGTagNames(self, token): + replacements = {"altglyph": "altGlyph", + "altglyphdef": "altGlyphDef", + "altglyphitem": "altGlyphItem", + "animatecolor": "animateColor", + "animatemotion": "animateMotion", + "animatetransform": "animateTransform", + "clippath": "clipPath", + "feblend": "feBlend", + "fecolormatrix": "feColorMatrix", + "fecomponenttransfer": "feComponentTransfer", + "fecomposite": "feComposite", + "feconvolvematrix": "feConvolveMatrix", + "fediffuselighting": "feDiffuseLighting", + "fedisplacementmap": "feDisplacementMap", + "fedistantlight": "feDistantLight", + "feflood": "feFlood", + "fefunca": "feFuncA", + "fefuncb": "feFuncB", + "fefuncg": "feFuncG", + "fefuncr": "feFuncR", + "fegaussianblur": "feGaussianBlur", + "feimage": "feImage", + "femerge": "feMerge", + "femergenode": "feMergeNode", + "femorphology": "feMorphology", + "feoffset": "feOffset", + "fepointlight": "fePointLight", + "fespecularlighting": "feSpecularLighting", + "fespotlight": "feSpotLight", + "fetile": "feTile", + "feturbulence": "feTurbulence", + "foreignobject": "foreignObject", + "glyphref": "glyphRef", + "lineargradient": "linearGradient", + "radialgradient": "radialGradient", + "textpath": "textPath"} + + if token["name"] in replacements: + token["name"] = replacements[token["name"]] + + def processCharacters(self, token): + if token["data"] == "\u0000": + token["data"] = "\uFFFD" + elif (self.parser.framesetOK and + any(char not in spaceCharacters for char in token["data"])): + self.parser.framesetOK = False + Phase.processCharacters(self, token) + + def processStartTag(self, token): + currentNode = self.tree.openElements[-1] + if (token["name"] in self.breakoutElements or + (token["name"] == "font" and + set(token["data"].keys()) & set(["color", "face", "size"]))): + self.parser.parseError("unexpected-html-element-in-foreign-content", + {"name": token["name"]}) + while (self.tree.openElements[-1].namespace != + self.tree.defaultNamespace and + not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and + not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): + self.tree.openElements.pop() + return token + + else: + if currentNode.namespace == namespaces["mathml"]: + self.parser.adjustMathMLAttributes(token) + elif currentNode.namespace == namespaces["svg"]: + self.adjustSVGTagNames(token) + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = currentNode.namespace + self.tree.insertElement(token) + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def processEndTag(self, token): + nodeIndex = len(self.tree.openElements) - 1 + node = self.tree.openElements[-1] + if node.name.translate(asciiUpper2Lower) != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + while True: + if node.name.translate(asciiUpper2Lower) == token["name"]: + # XXX this isn't in the spec but it seems necessary + if self.parser.phase == self.parser.phases["inTableText"]: + self.parser.phase.flushCharacters() + self.parser.phase = self.parser.phase.originalPhase + while self.tree.openElements.pop() != node: + assert self.tree.openElements + new_token = None + break + nodeIndex -= 1 + + node = self.tree.openElements[nodeIndex] + if node.namespace != self.tree.defaultNamespace: + continue + else: + new_token = self.parser.phase.processEndTag(token) + break + return new_token + + class AfterBodyPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + # Stop parsing + pass + + def processComment(self, token): + # This is needed because data is to be appended to the <html> element + # here and not to whatever is currently open. + self.tree.insertComment(token, self.tree.openElements[0]) + + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-body") + self.parser.phase = self.parser.phases["inBody"] + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + def endTagHtml(self, name): + if self.parser.innerHTML: + self.parser.parseError("unexpected-end-tag-after-body-innerhtml") + else: + self.parser.phase = self.parser.phases["afterAfterBody"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + class InFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("frameset", self.startTagFrameset), + ("frame", self.startTagFrame), + ("noframes", self.startTagNoframes) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("frameset", self.endTagFrameset) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-frameset") + else: + assert self.parser.innerHTML + + def processCharacters(self, token): + self.parser.parseError("unexpected-char-in-frameset") + + def startTagFrameset(self, token): + self.tree.insertElement(token) + + def startTagFrame(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + + def startTagNoframes(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-frameset", + {"name": token["name"]}) + + def endTagFrameset(self, token): + if self.tree.openElements[-1].name == "html": + # innerHTML case + self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") + else: + self.tree.openElements.pop() + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): + # If we're not in innerHTML mode and the current node is not a + # "frameset" element (anymore) then switch. + self.parser.phase = self.parser.phases["afterFrameset"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-frameset", + {"name": token["name"]}) + + class AfterFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#after3 + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("noframes", self.startTagNoframes) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = _utils.MethodDispatcher([ + ("html", self.endTagHtml) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + # Stop parsing + pass + + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-frameset") + + def startTagNoframes(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-frameset", + {"name": token["name"]}) + + def endTagHtml(self, token): + self.parser.phase = self.parser.phases["afterAfterFrameset"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-frameset", + {"name": token["name"]}) + + class AfterAfterBodyPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + return self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + self.parser.phase = self.parser.phases["inBody"] + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + class AfterAfterFramesetPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = _utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("noframes", self.startTagNoFrames) + ]) + self.startTagHandler.default = self.startTagOther + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + return self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagNoFrames(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + # pylint:enable=unused-argument + + return { + "initial": InitialPhase, + "beforeHtml": BeforeHtmlPhase, + "beforeHead": BeforeHeadPhase, + "inHead": InHeadPhase, + "inHeadNoscript": InHeadNoscriptPhase, + "afterHead": AfterHeadPhase, + "inBody": InBodyPhase, + "text": TextPhase, + "inTable": InTablePhase, + "inTableText": InTableTextPhase, + "inCaption": InCaptionPhase, + "inColumnGroup": InColumnGroupPhase, + "inTableBody": InTableBodyPhase, + "inRow": InRowPhase, + "inCell": InCellPhase, + "inSelect": InSelectPhase, + "inSelectInTable": InSelectInTablePhase, + "inForeignContent": InForeignContentPhase, + "afterBody": AfterBodyPhase, + "inFrameset": InFramesetPhase, + "afterFrameset": AfterFramesetPhase, + "afterAfterBody": AfterAfterBodyPhase, + "afterAfterFrameset": AfterAfterFramesetPhase, + # XXX after after frameset + } + + +def adjust_attributes(token, replacements): + needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) + if needs_adjustment: + token['data'] = OrderedDict((replacements.get(k, k), v) + for k, v in token['data'].items()) + + +def impliedTagToken(name, type="EndTag", attributes=None, + selfClosing=False): + if attributes is None: + attributes = {} + return {"type": tokenTypes[type], "name": name, "data": attributes, + "selfClosing": selfClosing} + + +class ParseError(Exception): + """Error in parsed document""" + pass