Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/lxml/html/__init__.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo/lib/python3.7/site-packages/lxml/html/__init__.py Fri Jul 31 00:32:28 2020 -0400 @@ -0,0 +1,1926 @@ +# Copyright (c) 2004 Ian Bicking. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. Neither the name of Ian Bicking nor the names of its contributors may +# be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""The ``lxml.html`` tool set for HTML handling. 
+""" + +from __future__ import absolute_import + +__all__ = [ + 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', + 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] + + +import copy +import sys +import re +from functools import partial + +try: + from collections.abc import MutableMapping, MutableSet +except ImportError: + from collections import MutableMapping, MutableSet + +from .. import etree +from . import defs +from ._setmixin import SetMixin + +try: + from urlparse import urljoin +except ImportError: + # Python 3 + from urllib.parse import urljoin + +try: + unicode +except NameError: + # Python 3 + unicode = str +try: + basestring +except NameError: + # Python 3 + basestring = (str, bytes) + + +def __fix_docstring(s): + if not s: + return s + if sys.version_info[0] >= 3: + sub = re.compile(r"^(\s*)u'", re.M).sub + else: + sub = re.compile(r"^(\s*)b'", re.M).sub + return sub(r"\1'", s) + + +XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" + +_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", + namespaces={'x':XHTML_NAMESPACE}) +_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", + namespaces={'x':XHTML_NAMESPACE}) +_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", + namespaces={'x':XHTML_NAMESPACE}) +#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) +_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") +_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_collect_string_content = etree.XPath("string()") +_iter_css_urls = 
re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer +_iter_css_imports = re.compile(r'@import "(.*?)"').finditer +_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", + namespaces={'x':XHTML_NAMESPACE}) +_archive_re = re.compile(r'[^ ]+') +_parse_meta_refresh_url = re.compile( + r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search + + +def _unquote_match(s, pos): + if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": + return s[1:-1], pos+1 + else: + return s,pos + + +def _transform_result(typ, result): + """Convert the result back into the input type. + """ + if issubclass(typ, bytes): + return tostring(result, encoding='utf-8') + elif issubclass(typ, unicode): + return tostring(result, encoding='unicode') + else: + return result + + +def _nons(tag): + if isinstance(tag, basestring): + if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: + return tag.split('}')[-1] + return tag + + +class Classes(MutableSet): + """Provides access to an element's class attribute as a set-like collection. + Usage:: + + >>> el = fromstring('<p class="hidden large">Text</p>') + >>> classes = el.classes # or: classes = Classes(el.attrib) + >>> classes |= ['block', 'paragraph'] + >>> el.get('class') + 'hidden large block paragraph' + >>> classes.toggle('hidden') + False + >>> el.get('class') + 'large block paragraph' + >>> classes -= ('some', 'classes', 'block') + >>> el.get('class') + 'large paragraph' + """ + def __init__(self, attributes): + self._attributes = attributes + self._get_class_value = partial(attributes.get, 'class', '') + + def add(self, value): + """ + Add a class. + + This has no effect if the class is already present. 
+ """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = self._get_class_value().split() + if value in classes: + return + classes.append(value) + self._attributes['class'] = ' '.join(classes) + + def discard(self, value): + """ + Remove a class if it is currently present. + + If the class is not present, do nothing. + """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = [name for name in self._get_class_value().split() + if name != value] + if classes: + self._attributes['class'] = ' '.join(classes) + elif 'class' in self._attributes: + del self._attributes['class'] + + def remove(self, value): + """ + Remove a class; it must currently be present. + + If the class is not present, raise a KeyError. + """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + super(Classes, self).remove(value) + + def __contains__(self, name): + classes = self._get_class_value() + return name in classes and name in classes.split() + + def __iter__(self): + return iter(self._get_class_value().split()) + + def __len__(self): + return len(self._get_class_value().split()) + + # non-standard methods + + def update(self, values): + """ + Add all names from 'values'. + """ + classes = self._get_class_value().split() + extended = False + for value in values: + if value not in classes: + classes.append(value) + extended = True + if extended: + self._attributes['class'] = ' '.join(classes) + + def toggle(self, value): + """ + Add a class name if it isn't there yet, or remove it if it exists. + + Returns true if the class was added (and is now enabled) and + false if it was removed (and is now disabled). 
+ """ + if not value or re.search(r'\s', value): + raise ValueError("Invalid class name: %r" % value) + classes = self._get_class_value().split() + try: + classes.remove(value) + enabled = False + except ValueError: + classes.append(value) + enabled = True + if classes: + self._attributes['class'] = ' '.join(classes) + else: + del self._attributes['class'] + return enabled + + +class HtmlMixin(object): + + def set(self, key, value=None): + """set(self, key, value=None) + + Sets an element attribute. If no value is provided, or if the value is None, + creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" + for ``form.set('novalidate')``. + """ + super(HtmlElement, self).set(key, value) + + @property + def classes(self): + """ + A set-like wrapper around the 'class' attribute. + """ + return Classes(self.attrib) + + @classes.setter + def classes(self, classes): + assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. + value = classes._get_class_value() + if value: + self.set('class', value) + elif self.get('class') is not None: + del self.attrib['class'] + + @property + def base_url(self): + """ + Returns the base URL, given when the page was parsed. + + Use with ``urlparse.urljoin(el.base_url, href)`` to get + absolute URLs. + """ + return self.getroottree().docinfo.URL + + @property + def forms(self): + """ + Return a list of all the forms + """ + return _forms_xpath(self) + + @property + def body(self): + """ + Return the <body> element. Can be called from a child element + to get the document's head. + """ + return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def head(self): + """ + Returns the <head> element. Can be called from a child + element to get the document's head. + """ + return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] + + @property + def label(self): + """ + Get or set any <label> element associated with this element. 
+ """ + id = self.get('id') + if not id: + return None + result = _label_xpath(self, id=id) + if not result: + return None + else: + return result[0] + + @label.setter + def label(self, label): + id = self.get('id') + if not id: + raise TypeError( + "You cannot set a label for an element (%r) that has no id" + % self) + if _nons(label.tag) != 'label': + raise TypeError( + "You can only assign label to a label element (not %r)" + % label) + label.set('for', id) + + @label.deleter + def label(self): + label = self.label + if label is not None: + del label.attrib['for'] + + def drop_tree(self): + """ + Removes this element from the tree, including its children and + text. The tail text is joined to the previous element or + parent. + """ + parent = self.getparent() + assert parent is not None + if self.tail: + previous = self.getprevious() + if previous is None: + parent.text = (parent.text or '') + self.tail + else: + previous.tail = (previous.tail or '') + self.tail + parent.remove(self) + + def drop_tag(self): + """ + Remove the tag, but not its children or text. The children and text + are merged into the parent. + + Example:: + + >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') + >>> h.find('.//b').drop_tag() + >>> print(tostring(h, encoding='unicode')) + <div>Hello World!</div> + """ + parent = self.getparent() + assert parent is not None + previous = self.getprevious() + if self.text and isinstance(self.tag, basestring): + # not a Comment, etc. 
+ if previous is None: + parent.text = (parent.text or '') + self.text + else: + previous.tail = (previous.tail or '') + self.text + if self.tail: + if len(self): + last = self[-1] + last.tail = (last.tail or '') + self.tail + elif previous is None: + parent.text = (parent.text or '') + self.tail + else: + previous.tail = (previous.tail or '') + self.tail + index = parent.index(self) + parent[index:index+1] = self[:] + + def find_rel_links(self, rel): + """ + Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. + """ + rel = rel.lower() + return [el for el in _rel_links_xpath(self) + if el.get('rel').lower() == rel] + + def find_class(self, class_name): + """ + Find any elements with the given class name. + """ + return _class_xpath(self, class_name=class_name) + + def get_element_by_id(self, id, *default): + """ + Get the first element in a document with the given id. If none is + found, return the default argument if provided or raise KeyError + otherwise. + + Note that there can be more than one element with the same id, + and this isn't uncommon in HTML documents found in the wild. + Browsers return only the first match, and this function does + the same. + """ + try: + # FIXME: should this check for multiple matches? + # browsers just return the first one + return _id_xpath(self, id=id)[0] + except IndexError: + if default: + return default[0] + else: + raise KeyError(id) + + def text_content(self): + """ + Return the text content of the tag (and the text in any children). + """ + return _collect_string_content(self) + + def cssselect(self, expr, translator='html'): + """ + Run the CSS expression on this element and its children, + returning a list of the results. + + Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) + -- note that pre-compiling the expression can provide a substantial + speedup. + """ + # Do the import here to make the dependency optional. 
+ from lxml.cssselect import CSSSelector + return CSSSelector(expr, translator=translator)(self) + + ######################################## + ## Link functions + ######################################## + + def make_links_absolute(self, base_url=None, resolve_base_href=True, + handle_failures=None): + """ + Make all links in the document absolute, given the + ``base_url`` for the document (the full URL where the document + came from), or if no ``base_url`` is given, then the ``.base_url`` + of the document. + + If ``resolve_base_href`` is true, then any ``<base href>`` + tags in the document are used *and* removed from the document. + If it is false then any such tag is ignored. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. + """ + if base_url is None: + base_url = self.base_url + if base_url is None: + raise TypeError( + "No base_url given, and the document has no base_url") + if resolve_base_href: + self.resolve_base_href() + + if handle_failures == 'ignore': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return href + elif handle_failures == 'discard': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return None + elif handle_failures is None: + def link_repl(href): + return urljoin(base_url, href) + else: + raise ValueError( + "unexpected value for handle_failures: %r" % handle_failures) + + self.rewrite_links(link_repl) + + def resolve_base_href(self, handle_failures=None): + """ + Find any ``<base href>`` tag in the document, and apply its + values to all links found in the document. Also remove the + tag once it has been applied. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. 
+ """ + base_href = None + basetags = self.xpath('//base[@href]|//x:base[@href]', + namespaces={'x': XHTML_NAMESPACE}) + for b in basetags: + base_href = b.get('href') + b.drop_tree() + if not base_href: + return + self.make_links_absolute(base_href, resolve_base_href=False, + handle_failures=handle_failures) + + def iterlinks(self): + """ + Yield (element, attribute, link, pos), where attribute may be None + (indicating the link is in the text). ``pos`` is the position + where the link occurs; often 0, but sometimes something else in + the case of links in stylesheets or style tags. + + Note: <base href> is *not* taken into account in any way. The + link you get is exactly the link in the document. + + Note: multiple links inside of a single text string or + attribute value are returned in reversed order. This makes it + possible to replace or delete them from the text string value + based on their reported text positions. Otherwise, a + modification at one text position can change the positions of + links reported later on. 
+ """ + link_attrs = defs.link_attrs + for el in self.iter(etree.Element): + attribs = el.attrib + tag = _nons(el.tag) + if tag == 'object': + codebase = None + ## <object> tags have attributes that are relative to + ## codebase + if 'codebase' in attribs: + codebase = el.get('codebase') + yield (el, 'codebase', codebase, 0) + for attrib in ('classid', 'data'): + if attrib in attribs: + value = el.get(attrib) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, attrib, value, 0) + if 'archive' in attribs: + for match in _archive_re.finditer(el.get('archive')): + value = match.group(0) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, 'archive', value, match.start()) + else: + for attrib in link_attrs: + if attrib in attribs: + yield (el, attrib, attribs[attrib], 0) + if tag == 'meta': + http_equiv = attribs.get('http-equiv', '').lower() + if http_equiv == 'refresh': + content = attribs.get('content', '') + match = _parse_meta_refresh_url(content) + url = (match.group('url') if match else content).strip() + # unexpected content means the redirect won't work, but we might + # as well be permissive and return the entire string. + if url: + url, pos = _unquote_match( + url, match.start('url') if match else content.find(url)) + yield (el, 'content', url, pos) + elif tag == 'param': + valuetype = el.get('valuetype') or '' + if valuetype.lower() == 'ref': + ## FIXME: while it's fine we *find* this link, + ## according to the spec we aren't supposed to + ## actually change the value, including resolving + ## it. 
It can also still be a link, even if it + ## doesn't have a valuetype="ref" (which seems to be the norm) + ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype + yield (el, 'value', el.get('value'), 0) + elif tag == 'style' and el.text: + urls = [ + # (start_pos, url) + _unquote_match(match.group(1), match.start(1))[::-1] + for match in _iter_css_urls(el.text) + ] + [ + (match.start(1), match.group(1)) + for match in _iter_css_imports(el.text) + ] + if urls: + # sort by start pos to bring both match sets back into order + # and reverse the list to report correct positions despite + # modifications + urls.sort(reverse=True) + for start, url in urls: + yield (el, None, url, start) + if 'style' in attribs: + urls = list(_iter_css_urls(attribs['style'])) + if urls: + # return in reversed order to simplify in-place modifications + for match in urls[::-1]: + url, start = _unquote_match(match.group(1), match.start(1)) + yield (el, 'style', url, start) + + def rewrite_links(self, link_repl_func, resolve_base_href=True, + base_href=None): + """ + Rewrite all the links in the document. For each link + ``link_repl_func(link)`` will be called, and the return value + will replace the old link. + + Note that links may not be absolute (unless you first called + ``make_links_absolute()``), and may be internal (e.g., + ``'#anchor'``). They can also be values like + ``'mailto:email'`` or ``'javascript:expr'``. + + If you give ``base_href`` then all links passed to + ``link_repl_func()`` will take that into account. + + If the ``link_repl_func`` returns None, the attribute or + tag text will be removed completely. 
+ """ + if base_href is not None: + # FIXME: this can be done in one pass with a wrapper + # around link_repl_func + self.make_links_absolute( + base_href, resolve_base_href=resolve_base_href) + elif resolve_base_href: + self.resolve_base_href() + + for el, attrib, link, pos in self.iterlinks(): + new_link = link_repl_func(link.strip()) + if new_link == link: + continue + if new_link is None: + # Remove the attribute or element content + if attrib is None: + el.text = '' + else: + del el.attrib[attrib] + continue + + if attrib is None: + new = el.text[:pos] + new_link + el.text[pos+len(link):] + el.text = new + else: + cur = el.get(attrib) + if not pos and len(cur) == len(link): + new = new_link # most common case + else: + new = cur[:pos] + new_link + cur[pos+len(link):] + el.set(attrib, new) + + +class _MethodFunc(object): + """ + An object that represents a method on an element as a function; + the function takes either an element or an HTML string. It + returns whatever the function normally returns, or if the function + works in-place (and so returns None) it returns a serialized form + of the resulting document. 
+ """ + def __init__(self, name, copy=False, source_class=HtmlMixin): + self.name = name + self.copy = copy + self.__doc__ = getattr(source_class, self.name).__doc__ + def __call__(self, doc, *args, **kw): + result_type = type(doc) + if isinstance(doc, basestring): + if 'copy' in kw: + raise TypeError( + "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) + doc = fromstring(doc, **kw) + else: + if 'copy' in kw: + make_a_copy = kw.pop('copy') + else: + make_a_copy = self.copy + if make_a_copy: + doc = copy.deepcopy(doc) + meth = getattr(doc, self.name) + result = meth(*args, **kw) + # FIXME: this None test is a bit sloppy + if result is None: + # Then return what we got in + return _transform_result(result_type, doc) + else: + return result + + +find_rel_links = _MethodFunc('find_rel_links', copy=False) +find_class = _MethodFunc('find_class', copy=False) +make_links_absolute = _MethodFunc('make_links_absolute', copy=True) +resolve_base_href = _MethodFunc('resolve_base_href', copy=True) +iterlinks = _MethodFunc('iterlinks', copy=False) +rewrite_links = _MethodFunc('rewrite_links', copy=True) + + +class HtmlComment(etree.CommentBase, HtmlMixin): + pass + + +class HtmlElement(etree.ElementBase, HtmlMixin): + # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?) + cssselect = HtmlMixin.cssselect + set = HtmlMixin.set + + +class HtmlProcessingInstruction(etree.PIBase, HtmlMixin): + pass + + +class HtmlEntity(etree.EntityBase, HtmlMixin): + pass + + +class HtmlElementClassLookup(etree.CustomElementClassLookup): + """A lookup scheme for HTML Element classes. + + To create a lookup instance with different Element classes, pass a tag + name mapping of Element classes in the ``classes`` keyword argument and/or + a tag name mapping of Mixin classes in the ``mixins`` keyword argument. + The special key '*' denotes a Mixin class that should be mixed into all + Element classes. 
+ """ + _default_element_classes = {} + + def __init__(self, classes=None, mixins=None): + etree.CustomElementClassLookup.__init__(self) + if classes is None: + classes = self._default_element_classes.copy() + if mixins: + mixers = {} + for name, value in mixins: + if name == '*': + for n in classes.keys(): + mixers.setdefault(n, []).append(value) + else: + mixers.setdefault(name, []).append(value) + for name, mix_bases in mixers.items(): + cur = classes.get(name, HtmlElement) + bases = tuple(mix_bases + [cur]) + classes[name] = type(cur.__name__, bases, {}) + self._element_classes = classes + + def lookup(self, node_type, document, namespace, name): + if node_type == 'element': + return self._element_classes.get(name.lower(), HtmlElement) + elif node_type == 'comment': + return HtmlComment + elif node_type == 'PI': + return HtmlProcessingInstruction + elif node_type == 'entity': + return HtmlEntity + # Otherwise normal lookup + return None + + +################################################################################ +# parsing +################################################################################ + +_looks_like_full_html_unicode = re.compile( + unicode(r'^\s*<(?:html|!doctype)'), re.I).match +_looks_like_full_html_bytes = re.compile( + r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match + + +def document_fromstring(html, parser=None, ensure_head_body=False, **kw): + if parser is None: + parser = html_parser + value = etree.fromstring(html, parser, **kw) + if value is None: + raise etree.ParserError( + "Document is empty") + if ensure_head_body and value.find('head') is None: + value.insert(0, Element('head')) + if ensure_head_body and value.find('body') is None: + value.append(Element('body')) + return value + + +def fragments_fromstring(html, no_leading_text=False, base_url=None, + parser=None, **kw): + """Parses several HTML elements, returning a list of elements. + + The first item in the list may be a string. 
+ If no_leading_text is true, then it will be an error if there is + leading text, and it will always be a list of only elements. + + base_url will set the document's base_url attribute + (and the tree's docinfo.URL). + """ + if parser is None: + parser = html_parser + # FIXME: check what happens when you give html with a body, head, etc. + if isinstance(html, bytes): + if not _looks_like_full_html_bytes(html): + # can't use %-formatting in early Py3 versions + html = ('<html><body>'.encode('ascii') + html + + '</body></html>'.encode('ascii')) + else: + if not _looks_like_full_html_unicode(html): + html = '<html><body>%s</body></html>' % html + doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) + assert _nons(doc.tag) == 'html' + bodies = [e for e in doc if _nons(e.tag) == 'body'] + assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) + body = bodies[0] + elements = [] + if no_leading_text and body.text and body.text.strip(): + raise etree.ParserError( + "There is leading text: %r" % body.text) + if body.text and body.text.strip(): + elements.append(body.text) + elements.extend(body) + # FIXME: removing the reference to the parent artificial document + # would be nice + return elements + + +def fragment_fromstring(html, create_parent=False, base_url=None, + parser=None, **kw): + """ + Parses a single HTML element; it is an error if there is more than + one element, or if anything but whitespace precedes or follows the + element. + + If ``create_parent`` is true (or is a tag name) then a parent node + will be created to encapsulate the HTML in a single element. In this + case, leading or trailing text is also allowed, as are multiple elements + as result of the parsing. + + Passing a ``base_url`` will set the document's ``base_url`` attribute + (and the tree's docinfo.URL). 
+ """ + if parser is None: + parser = html_parser + + accept_leading_text = bool(create_parent) + + elements = fragments_fromstring( + html, parser=parser, no_leading_text=not accept_leading_text, + base_url=base_url, **kw) + + if create_parent: + if not isinstance(create_parent, basestring): + create_parent = 'div' + new_root = Element(create_parent) + if elements: + if isinstance(elements[0], basestring): + new_root.text = elements[0] + del elements[0] + new_root.extend(elements) + return new_root + + if not elements: + raise etree.ParserError('No elements found') + if len(elements) > 1: + raise etree.ParserError( + "Multiple elements found (%s)" + % ', '.join([_element_name(e) for e in elements])) + el = elements[0] + if el.tail and el.tail.strip(): + raise etree.ParserError( + "Element followed by text: %r" % el.tail) + el.tail = None + return el + + +def fromstring(html, base_url=None, parser=None, **kw): + """ + Parse the html, returning a single element/document. + + This tries to minimally parse the chunk of text, without knowing if it + is a fragment or a document. + + base_url will set the document's base_url attribute (and the tree's docinfo.URL) + """ + if parser is None: + parser = html_parser + if isinstance(html, bytes): + is_full_html = _looks_like_full_html_bytes(html) + else: + is_full_html = _looks_like_full_html_unicode(html) + doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) + if is_full_html: + return doc + # otherwise, lets parse it out... 
+ bodies = doc.findall('body') + if not bodies: + bodies = doc.findall('{%s}body' % XHTML_NAMESPACE) + if bodies: + body = bodies[0] + if len(bodies) > 1: + # Somehow there are multiple bodies, which is bad, but just + # smash them into one body + for other_body in bodies[1:]: + if other_body.text: + if len(body): + body[-1].tail = (body[-1].tail or '') + other_body.text + else: + body.text = (body.text or '') + other_body.text + body.extend(other_body) + # We'll ignore tail + # I guess we are ignoring attributes too + other_body.drop_tree() + else: + body = None + heads = doc.findall('head') + if not heads: + heads = doc.findall('{%s}head' % XHTML_NAMESPACE) + if heads: + # Well, we have some sort of structure, so lets keep it all + head = heads[0] + if len(heads) > 1: + for other_head in heads[1:]: + head.extend(other_head) + # We don't care about text or tail in a head + other_head.drop_tree() + return doc + if body is None: + return doc + if (len(body) == 1 and (not body.text or not body.text.strip()) + and (not body[-1].tail or not body[-1].tail.strip())): + # The body has just one element, so it was probably a single + # element passed in + return body[0] + # Now we have a body which represents a bunch of tags which have the + # content that was passed in. We will create a fake container, which + # is the body tag, except <body> implies too much structure. + if _contains_block_level_tag(body): + body.tag = 'div' + else: + body.tag = 'span' + return body + + +def parse(filename_or_url, parser=None, base_url=None, **kw): + """ + Parse a filename, URL, or file-like object into an HTML document + tree. Note: this returns a tree, not an element. Use + ``parse(...).getroot()`` to get the document root. + + You can override the base URL with the ``base_url`` keyword. This + is most useful when parsing from a file-like object. 
+ """ + if parser is None: + parser = html_parser + return etree.parse(filename_or_url, parser, base_url=base_url, **kw) + + +def _contains_block_level_tag(el): + # FIXME: I could do this with XPath, but would that just be + # unnecessarily slow? + for el in el.iter(etree.Element): + if _nons(el.tag) in defs.block_tags: + return True + return False + + +def _element_name(el): + if isinstance(el, etree.CommentBase): + return 'comment' + elif isinstance(el, basestring): + return 'string' + else: + return _nons(el.tag) + + +################################################################################ +# form handling +################################################################################ + +class FormElement(HtmlElement): + """ + Represents a <form> element. + """ + + @property + def inputs(self): + """ + Returns an accessor for all the input elements in the form. + + See `InputGetter` for more information about the object. + """ + return InputGetter(self) + + @property + def fields(self): + """ + Dictionary-like object that represents all the fields in this + form. You can set values in this dictionary to effect the + form. + """ + return FieldsDict(self.inputs) + + @fields.setter + def fields(self, value): + fields = self.fields + prev_keys = fields.keys() + for key, value in value.items(): + if key in prev_keys: + prev_keys.remove(key) + fields[key] = value + for key in prev_keys: + if key is None: + # Case of an unnamed input; these aren't really + # expressed in form_values() anyway. + continue + fields[key] = None + + def _name(self): + if self.get('name'): + return self.get('name') + elif self.get('id'): + return '#' + self.get('id') + iter_tags = self.body.iter + forms = list(iter_tags('form')) + if not forms: + forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE)) + return str(forms.index(self)) + + def form_values(self): + """ + Return a list of tuples of the field values for the form. + This is suitable to be passed to ``urllib.urlencode()``. 
+ """ + results = [] + for el in self.inputs: + name = el.name + if not name or 'disabled' in el.attrib: + continue + tag = _nons(el.tag) + if tag == 'textarea': + results.append((name, el.value)) + elif tag == 'select': + value = el.value + if el.multiple: + for v in value: + results.append((name, v)) + elif value is not None: + results.append((name, el.value)) + else: + assert tag == 'input', ( + "Unexpected tag: %r" % el) + if el.checkable and not el.checked: + continue + if el.type in ('submit', 'image', 'reset', 'file'): + continue + value = el.value + if value is not None: + results.append((name, el.value)) + return results + + @property + def action(self): + """ + Get/set the form's ``action`` attribute. + """ + base_url = self.base_url + action = self.get('action') + if base_url and action is not None: + return urljoin(base_url, action) + else: + return action + + @action.setter + def action(self, value): + self.set('action', value) + + @action.deleter + def action(self): + attrib = self.attrib + if 'action' in attrib: + del attrib['action'] + + @property + def method(self): + """ + Get/set the form's method. Always returns a capitalized + string, and defaults to ``'GET'`` + """ + return self.get('method', 'GET').upper() + + @method.setter + def method(self, value): + self.set('method', value.upper()) + + +HtmlElementClassLookup._default_element_classes['form'] = FormElement + + +def submit_form(form, extra_values=None, open_http=None): + """ + Helper function to submit a form. Returns a file-like object, as from + ``urllib.urlopen()``. This object also has a ``.geturl()`` function, + which shows the URL if there were any redirects. + + You can use this like:: + + form = doc.forms[0] + form.inputs['foo'].value = 'bar' # etc + response = form.submit() + doc = parse(response) + doc.make_links_absolute(response.geturl()) + + To change the HTTP requester, pass a function as ``open_http`` keyword + argument that opens the URL for you. 
def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for ``submit_form()``, built on
    urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string (after ``'&'`` if the URL already contains ``'?'``,
    otherwise after ``'?'``) and no request body is sent.  For any other
    method the url-encoded values are sent as the request body, encoded
    to ASCII bytes when they are not bytes already.

    Returns whatever ``urlopen()`` returns.  Raises ``ValueError`` when
    ``url`` is empty or ``None``.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.parse import urlencode
        from urllib.request import urlopen
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    if isinstance(encoded, bytes):
        payload = encoded
    else:
        payload = encoded.encode('ASCII')
    return urlopen(url, payload)
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)

    Provides the shared ``name`` property, backed by the ``name``
    attribute, and a ``repr()`` that shows the element's name and, when
    the element has a truthy ``type`` attribute value, its type.
    """
    @property
    def name(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting a name that is not set is a no-op.
        try:
            del self.attrib['name']
        except KeyError:
            pass

    def __repr__(self):
        input_type = getattr(self, 'type', None)
        suffix = (' type=%r' % input_type) if input_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
+ """ + if self.multiple: + return MultipleSelectOptions(self) + options = _options_xpath(self) + + try: + selected_option = next(el for el in reversed(options) if el.get('selected') is not None) + except StopIteration: + try: + selected_option = next(el for el in options if el.get('disabled') is None) + except StopIteration: + return None + value = selected_option.get('value') + if value is None: + value = (selected_option.text or '').strip() + return value + + @value.setter + def value(self, value): + if self.multiple: + if isinstance(value, basestring): + raise TypeError("You must pass in a sequence") + values = self.value + values.clear() + values.update(value) + return + checked_option = None + if value is not None: + for el in _options_xpath(self): + opt_value = el.get('value') + if opt_value is None: + opt_value = (el.text or '').strip() + if opt_value == value: + checked_option = el + break + else: + raise ValueError( + "There is no option with the value of %r" % value) + for el in _options_xpath(self): + if 'selected' in el.attrib: + del el.attrib['selected'] + if checked_option is not None: + checked_option.set('selected', '') + + @value.deleter + def value(self): + # FIXME: should del be allowed at all? + if self.multiple: + self.value.clear() + else: + self.value = None + + @property + def value_options(self): + """ + All the possible values this select can have (the ``value`` + attribute of all the ``<option>`` elements. + """ + options = [] + for el in _options_xpath(self): + value = el.get('value') + if value is None: + value = (el.text or '').strip() + options.append(value) + return options + + @property + def multiple(self): + """ + Boolean attribute: is there a ``multiple`` attribute on this element. 
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    @staticmethod
    def _option_value(option):
        """
        Return the value an ``<option>`` stands for: its ``value``
        attribute, or its stripped text when that attribute is missing.
        """
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    def __iter__(self):
        """
        Yield the value of every option that has a ``selected`` attribute.
        """
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if that option is not currently selected,
        or if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning to ``value`` replaces the current selection with the
        given values; a ``ValueError`` is raised when the assigned
        object has no ``__iter__``.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching any state, so that a rejected
        # assignment leaves the current selection intact.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.

        Setting stores the target element's ``id`` in this label's
        ``for`` attribute and raises ``TypeError`` if the target has no
        ``id``.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): the lookup is made without a default; confirm that
        # get_element_by_id() returns None rather than raising for an id
        # that is not in the document.
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in ``for`` (see getter and setter); the
        # label's own ``id`` must be left untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']
+ + Note: if include_meta_content_type is true this will create a + ``<meta http-equiv="Content-Type" ...>`` tag in the head; + regardless of the value of include_meta_content_type any existing + ``<meta http-equiv="Content-Type" ...>`` tag will be removed + + The ``encoding`` argument controls the output encoding (defaults to + ASCII, with &#...; character references for any characters outside + of ASCII). Note that you can pass the name ``'unicode'`` as + ``encoding`` argument to serialise to a Unicode string. + + The ``method`` argument defines the output method. It defaults to + 'html', but can also be 'xml' for xhtml output, or 'text' to + serialise to plain text without markup. + + To leave out the tail text of the top-level element that is being + serialised, pass ``with_tail=False``. + + The ``doctype`` option allows passing in a plain string that will + be serialised before the XML tree. Note that passing in non + well-formed content here will make the XML output non well-formed. + Also, an existing doctype in the document tree will not be removed + when serialising an ElementTree instance. + + Example:: + + >>> from lxml import html + >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>') + + >>> html.tostring(root) + b'<p>Hello<br>world!</p>' + >>> html.tostring(root, method='html') + b'<p>Hello<br>world!</p>' + + >>> html.tostring(root, method='xml') + b'<p>Hello<br/>world!</p>' + + >>> html.tostring(root, method='text') + b'Helloworld!' + + >>> html.tostring(root, method='text', encoding='unicode') + u'Helloworld!' + + >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>') + >>> html.tostring(root[0], method='text', encoding='unicode') + u'Helloworld!TAIL' + + >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False) + u'Helloworld!' 
def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree; an element is wrapped in
    an ElementTree first.  The file is written with ``encoding`` if
    given, otherwise with the document's own encoding, otherwise UTF-8.
    The ``file://`` URL is printed before the browser is launched.
    """
    import os
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    out_encoding = encoding or doc.docinfo.encoding or "UTF-8"
    fd, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(fd, 'wb') as out:
        doc.write(out, method="html", encoding=out_encoding)
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All positional and keyword arguments are forwarded unchanged to the
    ``makeelement()`` method of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)