Mercurial > repos > guerler > springsuite

diff planemo/lib/python3.7/site-packages/lxml/html/__init__.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author: guerler
date: Fri, 31 Jul 2020 00:32:28 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/lxml/html/__init__.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,1926 @@
+# Copyright (c) 2004 Ian Bicking. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. Neither the name of Ian Bicking nor the names of its contributors may
+# be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""The ``lxml.html`` tool set for HTML handling.
+"""
+
+from __future__ import absolute_import
+
+# Public names exported by ``from lxml.html import *``.  Each name is listed
+# exactly once ('open_in_browser' used to appear twice).
+__all__ = [
+    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
+    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
+    'find_rel_links', 'find_class', 'make_links_absolute',
+    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
+
+
+import copy
+import sys
+import re
+from functools import partial
+
+try:
+    from collections.abc import MutableMapping, MutableSet
+except ImportError:
+    from collections import MutableMapping, MutableSet
+
+from .. import etree
+from . import defs
+from ._setmixin import SetMixin
+
+try:
+    from urlparse import urljoin
+except ImportError:
+    # Python 3
+    from urllib.parse import urljoin
+
+# Python 2/3 compatibility: bind the Python 2 names ``unicode`` and
+# ``basestring`` when the interpreter does not provide them.
+try:
+    unicode
+except NameError:
+    # Python 3
+    unicode = str
+try:
+    basestring
+except NameError:
+    # Python 3
+    # Here ``basestring`` is a tuple of types, so it is only suitable as the
+    # second argument of isinstance().
+    basestring = (str, bytes)
+
+
+def __fix_docstring(s):
+    """Drop the string-literal prefix from quoted values at the start of lines.
+
+    On Python 3 a leading ``u'`` (after optional whitespace) at the start of
+    any line becomes ``'``; on Python 2 the same is done for a leading ``b'``.
+    Empty or ``None`` input is returned unchanged.
+    """
+    if not s:
+        return s
+    prefix = "u" if sys.version_info[0] >= 3 else "b"
+    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
+    return pattern.sub(r"\1'", s)
+
+
+# Namespace URI of XHTML elements; bound to the prefix ``x`` in the XPath
+# expressions below so that both plain and namespaced tags are matched.
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+# Pre-compiled XPath expressions, each evaluated relative to an element.
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
+                               namespaces={'x':XHTML_NAMESPACE})
+_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
+                             namespaces={'x':XHTML_NAMESPACE})
+_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
+                           namespaces={'x':XHTML_NAMESPACE})
+#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
+# Matches elements whose space-normalised class attribute contains
+# $class_name as a whole, space-delimited word.
+_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_collect_string_content = etree.XPath("string()")
+# CSS ``url(...)`` references; group 1 is the argument, including any
+# surrounding quotes.
+_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
+# CSS ``@import "..."`` statements; only the double-quoted form is matched.
+_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
+# Note: these paths start with ``//`` and therefore search the whole document.
+_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
+                           namespaces={'x':XHTML_NAMESPACE})
+# One run of non-space characters (a single entry of a space-separated list).
+_archive_re = re.compile(r'[^ ]+')
+# Extracts the ``url`` group from a meta-refresh ``content`` value: whatever
+# follows the first ';' and an optional ``url=`` prefix.
+_parse_meta_refresh_url = re.compile(
+    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
+
+
+def _unquote_match(s, pos):
+    """Strip one pair of matching quotes from ``s``.
+
+    Returns ``(text, position)``.  When ``s`` starts and ends with the same
+    quote character (``"`` or ``'``), the quotes are removed and ``pos`` is
+    advanced by one to point at the unquoted text; otherwise both values are
+    returned unchanged.
+    """
+    first, last = s[:1], s[-1:]
+    if first == last and first in ('"', "'"):
+        return s[1:-1], pos + 1
+    return s, pos
+
+
+def _transform_result(typ, result):
+    """Serialise ``result`` back into the kind of value the caller passed in.
+
+    If ``typ`` is a bytes type the result is serialised as UTF-8 bytes, if it
+    is a unicode type it is serialised as a unicode string; for any other
+    type ``result`` is returned untouched.
+    """
+    for string_type, encoding in ((bytes, 'utf-8'), (unicode, 'unicode')):
+        if issubclass(typ, string_type):
+            return tostring(result, encoding=encoding)
+    return result
+
+
+def _nons(tag):
+    """Return ``tag`` without a leading XHTML namespace, if it has one.
+
+    Non-string tags and tags in any other namespace (or none) are returned
+    unchanged.
+    """
+    if not isinstance(tag, basestring):
+        return tag
+    namespace_end = len(XHTML_NAMESPACE) + 1
+    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
+        return tag.split('}')[-1]
+    return tag
+
+
+class Classes(MutableSet):
+    """Set-like view of the names stored in an element's ``class`` attribute.
+
+    The view keeps no state of its own: every operation reads and writes the
+    'class' entry of the attribute mapping it was created with.
+    Usage::
+
+        >>> el = fromstring('<p class="hidden large">Text</p>')
+        >>> classes = el.classes  # or: classes = Classes(el.attrib)
+        >>> classes |= ['block', 'paragraph']
+        >>> el.get('class')
+        'hidden large block paragraph'
+        >>> classes.toggle('hidden')
+        False
+        >>> el.get('class')
+        'large block paragraph'
+        >>> classes -= ('some', 'classes', 'block')
+        >>> el.get('class')
+        'large paragraph'
+    """
+    def __init__(self, attributes):
+        self._attributes = attributes
+        # Always yields a string: '' when there is no 'class' attribute.
+        self._get_class_value = partial(attributes.get, 'class', '')
+
+    @staticmethod
+    def _check_name(value):
+        """Raise ValueError if ``value`` is empty or contains whitespace."""
+        if not value or re.search(r'\s', value):
+            raise ValueError("Invalid class name: %r" % value)
+
+    def add(self, value):
+        """
+        Add a class.
+
+        Nothing is written if the class is already present.  Raises
+        ValueError for an empty name or one containing whitespace.
+        """
+        self._check_name(value)
+        names = self._get_class_value().split()
+        if value not in names:
+            names.append(value)
+            self._attributes['class'] = ' '.join(names)
+
+    def discard(self, value):
+        """
+        Remove a class if it is currently present; otherwise do nothing
+        to the set of names.
+
+        The attribute is deleted when no class names remain.  Raises
+        ValueError for an empty name or one containing whitespace.
+        """
+        self._check_name(value)
+        remaining = [name for name in self._get_class_value().split()
+                     if name != value]
+        if remaining:
+            self._attributes['class'] = ' '.join(remaining)
+        elif 'class' in self._attributes:
+            del self._attributes['class']
+
+    def remove(self, value):
+        """
+        Remove a class that must currently be present.
+
+        Raises KeyError if the class is missing and ValueError for an
+        empty name or one containing whitespace.
+        """
+        self._check_name(value)
+        super(Classes, self).remove(value)
+
+    def __contains__(self, name):
+        raw = self._get_class_value()
+        # Cheap substring test first; only split when it can possibly match.
+        if name not in raw:
+            return False
+        return name in raw.split()
+
+    def __iter__(self):
+        return iter(self._get_class_value().split())
+
+    def __len__(self):
+        return len(self._get_class_value().split())
+
+    # non-standard methods
+
+    def update(self, values):
+        """
+        Add every name from 'values' that is not present yet.
+
+        The attribute is only rewritten when at least one name was added.
+        """
+        names = self._get_class_value().split()
+        count_before = len(names)
+        for value in values:
+            if value not in names:
+                names.append(value)
+        if len(names) != count_before:
+            self._attributes['class'] = ' '.join(names)
+
+    def toggle(self, value):
+        """
+        Add a class name if it is missing, or remove it if it exists.
+
+        Returns true if the class was added (and is now enabled) and
+        false if it was removed (and is now disabled).  Raises ValueError
+        for an empty name or one containing whitespace.
+        """
+        self._check_name(value)
+        names = self._get_class_value().split()
+        enabled = value not in names
+        if enabled:
+            names.append(value)
+        else:
+            names.remove(value)
+        if names:
+            self._attributes['class'] = ' '.join(names)
+        else:
+            del self._attributes['class']
+        return enabled
+
+
+class HtmlMixin(object):
+
+    def set(self, key, value=None):
+        """set(self, key, value=None)
+
+        Sets an element attribute.  If no value is provided, or if the value is None,
+        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
+        for ``form.set('novalidate')``.
+        """
+        # NOTE(review): the super() lookup is anchored at HtmlElement rather
+        # than at this mixin; HtmlElement re-binds this function as its own
+        # ``set``.  Consequently this only works on HtmlElement instances.
+        super(HtmlElement, self).set(key, value)
+
+    @property
+    def classes(self):
+        """
+        A set-like wrapper around the 'class' attribute.
+
+        A new ``Classes`` view over ``self.attrib`` is created on each access.
+        """
+        return Classes(self.attrib)
+
+    @classes.setter
+    def classes(self, classes):
+        # Assignment exists only to support in-place operators such as
+        # "el.classes |= ...", which re-assign the Classes object.
+        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
+        value = classes._get_class_value()
+        if not value:
+            if self.get('class') is not None:
+                del self.attrib['class']
+            return
+        self.set('class', value)
+
+    @property
+    def base_url(self):
+        """
+        Returns the base URL, given when the page was parsed.
+
+        The value is read from the ``docinfo.URL`` of the tree that
+        contains this element.
+
+        Use with ``urlparse.urljoin(el.base_url, href)`` to get
+        absolute URLs.
+        """
+        return self.getroottree().docinfo.URL
+
+    @property
+    def forms(self):
+        """
+        Return a list of all the forms found at or below this element
+        (plain and XHTML-namespaced ``form`` tags).
+        """
+        return _forms_xpath(self)
+
+    @property
+    def body(self):
+        """
+        Return the <body> element.  Can be called from a child element
+        to get the document's body.
+
+        Raises IndexError if the document has no <body> element.
+        """
+        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
+
+    @property
+    def head(self):
+        """
+        Returns the <head> element.  Can be called from a child
+        element to get the document's head.
+
+        Raises IndexError if the document has no <head> element.
+        """
+        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
+
+    @property
+    def label(self):
+        """
+        Get or set any <label> element associated with this element.
+
+        The association is made through the label's ``for`` attribute, so
+        an element without an ``id`` has no label (``None``).  When several
+        labels match, the first one is returned.
+        """
+        element_id = self.get('id')
+        if not element_id:
+            return None
+        matches = _label_xpath(self, id=element_id)
+        return matches[0] if matches else None
+
+    @label.setter
+    def label(self, label):
+        element_id = self.get('id')
+        if not element_id:
+            raise TypeError(
+                "You cannot set a label for an element (%r) that has no id"
+                % self)
+        if _nons(label.tag) != 'label':
+            raise TypeError(
+                "You can only assign label to a label element (not %r)"
+                % label)
+        # Point the label at this element; the label is not moved in the tree.
+        label.set('for', element_id)
+
+    @label.deleter
+    def label(self):
+        current = self.label
+        if current is not None:
+            # Only the association is removed, the <label> element stays.
+            del current.attrib['for']
+
+    def drop_tree(self):
+        """
+        Removes this element from the tree, including its children and
+        text.  The tail text is preserved: it is appended to the tail of
+        the previous sibling or, if there is none, to the parent's text.
+
+        The element must have a parent.
+        """
+        parent = self.getparent()
+        assert parent is not None
+        tail = self.tail
+        if tail:
+            sibling = self.getprevious()
+            if sibling is not None:
+                sibling.tail = (sibling.tail or '') + tail
+            else:
+                parent.text = (parent.text or '') + tail
+        parent.remove(self)
+
+    def drop_tag(self):
+        """
+        Remove the tag, but not its children or text.  The children and text
+        are merged into the parent.
+
+        The element must have a parent.
+
+        Example::
+
+            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
+            >>> h.find('.//b').drop_tag()
+            >>> print(tostring(h, encoding='unicode'))
+            <div>Hello World!</div>
+        """
+        parent = self.getparent()
+        assert parent is not None
+        previous = self.getprevious()
+        # Step 1: move this element's own text in front of where it stood.
+        if self.text and isinstance(self.tag, basestring):
+            # not a Comment, etc.
+            if previous is None:
+                parent.text = (parent.text or '') + self.text
+            else:
+                previous.tail = (previous.tail or '') + self.text
+        # Step 2: move the tail behind the last child, or, without children,
+        # to the same place the text went.
+        if self.tail:
+            if len(self):
+                last = self[-1]
+                last.tail = (last.tail or '') + self.tail
+            elif previous is None:
+                parent.text = (parent.text or '') + self.tail
+            else:
+                previous.tail = (previous.tail or '') + self.tail
+        # Step 3: replace this element in its parent by its children.
+        index = parent.index(self)
+        parent[index:index+1] = self[:]
+
+    def find_rel_links(self, rel):
+        """
+        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
+
+        The ``rel`` value is compared case-insensitively and must match
+        the whole attribute value.
+        """
+        wanted = rel.lower()
+        matches = []
+        for link in _rel_links_xpath(self):
+            if link.get('rel').lower() == wanted:
+                matches.append(link)
+        return matches
+
+    def find_class(self, class_name):
+        """
+        Find any elements with the given class name.
+
+        Searches this element and its descendants; ``class_name`` must
+        match one whole space-separated entry of the ``class`` attribute.
+        """
+        return _class_xpath(self, class_name=class_name)
+
+    def get_element_by_id(self, id, *default):
+        """
+        Get the first element with the given id, searching this element
+        and its descendants.  If none is found, return the default
+        argument if provided or raise KeyError otherwise.
+
+        Note that there can be more than one element with the same id,
+        and this isn't uncommon in HTML documents found in the wild.
+        Browsers return only the first match, and this function does
+        the same.
+        """
+        # FIXME: should this check for multiple matches?
+        # browsers just return the first one
+        matches = _id_xpath(self, id=id)
+        if matches:
+            return matches[0]
+        if default:
+            return default[0]
+        raise KeyError(id)
+
+    def text_content(self):
+        """
+        Return the text content of the tag (and the text in any children).
+
+        This is the result of evaluating XPath ``string()`` on the element.
+        """
+        return _collect_string_content(self)
+
+    def cssselect(self, expr, translator='html'):
+        """
+        Run the CSS expression on this element and its children,
+        returning a list of the results.
+
+        Equivalent to
+        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
+        -- note that pre-compiling the expression can provide a substantial
+        speedup.
+        """
+        # Do the import here to make the dependency optional.
+        from lxml.cssselect import CSSSelector
+        return CSSSelector(expr, translator=translator)(self)
+
+    ########################################
+    ## Link functions
+    ########################################
+
+    def make_links_absolute(self, base_url=None, resolve_base_href=True,
+                            handle_failures=None):
+        """
+        Make all links in the document absolute, given the
+        ``base_url`` for the document (the full URL where the document
+        came from), or if no ``base_url`` is given, then the ``.base_url``
+        of the document.
+
+        If ``resolve_base_href`` is true, then any ``<base href>``
+        tags in the document are used *and* removed from the document.
+        If it is false then any such tag is ignored.
+
+        If ``handle_failures`` is None (default), a failure to process
+        a URL will abort the processing.  If set to 'ignore', errors
+        are ignored.  If set to 'discard', failing URLs will be removed.
+        The same policy is applied while resolving ``<base href>``.
+
+        Raises TypeError if no base URL is available and ValueError for
+        an unknown ``handle_failures`` value; in both cases the document
+        is left unmodified.
+        """
+        if base_url is None:
+            base_url = self.base_url
+            if base_url is None:
+                raise TypeError(
+                    "No base_url given, and the document has no base_url")
+
+        # Pick the link replacement first: this validates ``handle_failures``
+        # before anything in the tree is touched.
+        if handle_failures == 'ignore':
+            def link_repl(href):
+                try:
+                    return urljoin(base_url, href)
+                except ValueError:
+                    return href
+        elif handle_failures == 'discard':
+            def link_repl(href):
+                try:
+                    return urljoin(base_url, href)
+                except ValueError:
+                    return None
+        elif handle_failures is None:
+            def link_repl(href):
+                return urljoin(base_url, href)
+        else:
+            raise ValueError(
+                "unexpected value for handle_failures: %r" % handle_failures)
+
+        if resolve_base_href:
+            # Apply the caller's failure policy to the <base href> pass too.
+            self.resolve_base_href(handle_failures=handle_failures)
+
+        # <base href> has either just been resolved or must be ignored, so
+        # stop rewrite_links() from resolving (and removing) it again.
+        self.rewrite_links(link_repl, resolve_base_href=False)
+
+    def resolve_base_href(self, handle_failures=None):
+        """
+        Find any ``<base href>`` tag in the document, and apply its
+        values to all links found in the document.  Also remove the
+        tag once it has been applied.
+
+        All ``<base href>`` tags are removed; if there are several, the
+        href of the last one is the one applied.
+
+        If ``handle_failures`` is None (default), a failure to process
+        a URL will abort the processing.  If set to 'ignore', errors
+        are ignored.  If set to 'discard', failing URLs will be removed.
+        """
+        base_tags = self.xpath('//base[@href]|//x:base[@href]',
+                               namespaces={'x': XHTML_NAMESPACE})
+        base_href = None
+        for tag in base_tags:
+            base_href = tag.get('href')
+            tag.drop_tree()
+        if base_href:
+            self.make_links_absolute(base_href, resolve_base_href=False,
+                                     handle_failures=handle_failures)
+
+    def iterlinks(self):
+        """
+        Yield (element, attribute, link, pos), where attribute may be None
+        (indicating the link is in the text).  ``pos`` is the position
+        where the link occurs; often 0, but sometimes something else in
+        the case of links in stylesheets or style tags.
+
+        Note: <base href> is *not* taken into account in any way.  The
+        link you get is exactly the link in the document.
+
+        Note: multiple links inside of a single text string or
+        attribute value are returned in reversed order.  This makes it
+        possible to replace or delete them from the text string value
+        based on their reported text positions.  Otherwise, a
+        modification at one text position can change the positions of
+        links reported later on.
+        """
+        link_attrs = defs.link_attrs
+        for el in self.iter(etree.Element):
+            attribs = el.attrib
+            tag = _nons(el.tag)
+            # Part 1: links stored in attributes.
+            if tag == 'object':
+                codebase = None
+                ## <object> tags have attributes that are relative to
+                ## codebase
+                if 'codebase' in attribs:
+                    codebase = el.get('codebase')
+                    yield (el, 'codebase', codebase, 0)
+                for attrib in ('classid', 'data'):
+                    if attrib in attribs:
+                        value = el.get(attrib)
+                        if codebase is not None:
+                            value = urljoin(codebase, value)
+                        yield (el, attrib, value, 0)
+                if 'archive' in attribs:
+                    # 'archive' holds several space-separated URLs; each is
+                    # reported with its offset inside the attribute value.
+                    for match in _archive_re.finditer(el.get('archive')):
+                        value = match.group(0)
+                        if codebase is not None:
+                            value = urljoin(codebase, value)
+                        yield (el, 'archive', value, match.start())
+            else:
+                for attrib in link_attrs:
+                    if attrib in attribs:
+                        yield (el, attrib, attribs[attrib], 0)
+            # Part 2: links embedded in other content, depending on the tag.
+            if tag == 'meta':
+                http_equiv = attribs.get('http-equiv', '').lower()
+                if http_equiv == 'refresh':
+                    content = attribs.get('content', '')
+                    match = _parse_meta_refresh_url(content)
+                    url = (match.group('url') if match else content).strip()
+                    # unexpected content means the redirect won't work, but we might
+                    # as well be permissive and return the entire string.
+                    if url:
+                        url, pos = _unquote_match(
+                            url, match.start('url') if match else content.find(url))
+                        yield (el, 'content', url, pos)
+            elif tag == 'param':
+                valuetype = el.get('valuetype') or ''
+                if valuetype.lower() == 'ref':
+                    ## FIXME: while it's fine we *find* this link,
+                    ## according to the spec we aren't supposed to
+                    ## actually change the value, including resolving
+                    ## it.  It can also still be a link, even if it
+                    ## doesn't have a valuetype="ref" (which seems to be the norm)
+                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
+                    yield (el, 'value', el.get('value'), 0)
+            elif tag == 'style' and el.text:
+                # Collect url(...) references and @import targets of the
+                # stylesheet text as (start_pos, url) pairs.
+                urls = [
+                    # (start_pos, url)
+                    _unquote_match(match.group(1), match.start(1))[::-1]
+                    for match in _iter_css_urls(el.text)
+                    ] + [
+                    (match.start(1), match.group(1))
+                    for match in _iter_css_imports(el.text)
+                    ]
+                if urls:
+                    # sort by start pos to bring both match sets back into order
+                    # and reverse the list to report correct positions despite
+                    # modifications
+                    urls.sort(reverse=True)
+                    for start, url in urls:
+                        yield (el, None, url, start)
+            # Part 3: url(...) references inside an inline style attribute.
+            if 'style' in attribs:
+                urls = list(_iter_css_urls(attribs['style']))
+                if urls:
+                    # return in reversed order to simplify in-place modifications
+                    for match in urls[::-1]:
+                        url, start = _unquote_match(match.group(1), match.start(1))
+                        yield (el, 'style', url, start)
+
+    def rewrite_links(self, link_repl_func, resolve_base_href=True,
+                      base_href=None):
+        """
+        Rewrite all the links in the document.  For each link
+        ``link_repl_func(link)`` will be called, and the return value
+        will replace the old link.  The function receives the link with
+        surrounding whitespace stripped.
+
+        Note that links may not be absolute (unless you first called
+        ``make_links_absolute()``), and may be internal (e.g.,
+        ``'#anchor'``).  They can also be values like
+        ``'mailto:email'`` or ``'javascript:expr'``.
+
+        If you give ``base_href`` then all links passed to
+        ``link_repl_func()`` will take that into account.
+
+        If the ``link_repl_func`` returns None, the attribute or
+        tag text will be removed completely.
+        """
+        if base_href is not None:
+            # FIXME: this can be done in one pass with a wrapper
+            # around link_repl_func
+            self.make_links_absolute(
+                base_href, resolve_base_href=resolve_base_href)
+        elif resolve_base_href:
+            self.resolve_base_href()
+
+        for el, attrib, link, pos in self.iterlinks():
+            replacement = link_repl_func(link.strip())
+            if replacement == link:
+                continue
+            in_text = attrib is None
+
+            if replacement is None:
+                # Remove the attribute or element content
+                if in_text:
+                    el.text = ''
+                else:
+                    del el.attrib[attrib]
+                continue
+
+            if in_text:
+                text = el.text
+                el.text = text[:pos] + replacement + text[pos+len(link):]
+                continue
+
+            current = el.get(attrib)
+            if pos or len(current) != len(link):
+                # The link is only part of the attribute value: splice it in.
+                replacement = current[:pos] + replacement + current[pos+len(link):]
+            el.set(attrib, replacement)
+
+
+class _MethodFunc(object):
+    """
+    Exposes an element method as a module-level function.
+
+    The function accepts either an element or an HTML string as first
+    argument.  It returns whatever the method returns; when the method
+    returns None (it worked in-place), the document is returned instead,
+    serialised back to a string if a string was passed in.
+
+    For element input the keyword ``copy`` (default taken from the
+    constructor) selects whether the method runs on a deep copy.
+    """
+    def __init__(self, name, copy=False, source_class=HtmlMixin):
+        self.name = name
+        self.copy = copy
+        self.__doc__ = getattr(source_class, self.name).__doc__
+
+    def __call__(self, doc, *args, **kw):
+        result_type = type(doc)
+        if isinstance(doc, basestring):
+            if 'copy' in kw:
+                raise TypeError(
+                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
+            # Note: the keyword arguments go to the parser call here *and*
+            # to the method call below.
+            doc = fromstring(doc, **kw)
+        elif kw.pop('copy', self.copy):
+            doc = copy.deepcopy(doc)
+        result = getattr(doc, self.name)(*args, **kw)
+        # FIXME: this None test is a bit sloppy
+        if result is not None:
+            return result
+        # Then return what we got in
+        return _transform_result(result_type, doc)
+
+
+# Function forms of the HtmlMixin link methods.  ``copy`` is the default for
+# element input: the methods created with copy=True work on a deep copy
+# unless the caller passes ``copy=False``.
+find_rel_links = _MethodFunc('find_rel_links', copy=False)
+find_class = _MethodFunc('find_class', copy=False)
+make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
+resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
+iterlinks = _MethodFunc('iterlinks', copy=False)
+rewrite_links = _MethodFunc('rewrite_links', copy=True)
+
+
+class HtmlComment(etree.CommentBase, HtmlMixin):
+    """Comment node class returned by HtmlElementClassLookup."""
+    pass
+
+
+class HtmlElement(etree.ElementBase, HtmlMixin):
+    """Default element class returned by HtmlElementClassLookup."""
+    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
+    cssselect = HtmlMixin.cssselect
+    set = HtmlMixin.set
+
+
+class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
+    """Processing-instruction node class returned by HtmlElementClassLookup."""
+    pass
+
+
+class HtmlEntity(etree.EntityBase, HtmlMixin):
+    """Entity node class returned by HtmlElementClassLookup."""
+    pass
+
+
+class HtmlElementClassLookup(etree.CustomElementClassLookup):
+    """A lookup scheme for HTML Element classes.
+
+    To create a lookup instance with different Element classes, pass a tag
+    name mapping of Element classes in the ``classes`` keyword argument and/or
+    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
+    The special key '*' denotes a Mixin class that should be mixed into all
+    Element classes.
+
+    ``mixins`` may be given either as a mapping or as an iterable of
+    ``(tag name, mixin class)`` pairs.  Note that a ``classes`` mapping
+    passed in by the caller is stored as-is and is updated in place when
+    mixins are applied.
+    """
+    _default_element_classes = {}
+
+    def __init__(self, classes=None, mixins=None):
+        etree.CustomElementClassLookup.__init__(self)
+        if classes is None:
+            classes = self._default_element_classes.copy()
+        if mixins:
+            # Iterating a mapping directly would yield only its keys, so take
+            # its items; a plain iterable of pairs is used unchanged.
+            if hasattr(mixins, 'items'):
+                mixins = mixins.items()
+            mixers = {}
+            for name, value in mixins:
+                if name == '*':
+                    # '*' applies to every tag that has a class registered.
+                    for n in classes:
+                        mixers.setdefault(n, []).append(value)
+                else:
+                    mixers.setdefault(name, []).append(value)
+            for name, mix_bases in mixers.items():
+                cur = classes.get(name, HtmlElement)
+                # Mixins come first so that they take precedence in the MRO.
+                bases = tuple(mix_bases + [cur])
+                classes[name] = type(cur.__name__, bases, {})
+        self._element_classes = classes
+
+    def lookup(self, node_type, document, namespace, name):
+        """Return the class to use for a node, or None for the normal lookup.
+
+        Element classes are looked up by lower-cased tag name and fall
+        back to HtmlElement.
+        """
+        if node_type == 'element':
+            return self._element_classes.get(name.lower(), HtmlElement)
+        elif node_type == 'comment':
+            return HtmlComment
+        elif node_type == 'PI':
+            return HtmlProcessingInstruction
+        elif node_type == 'entity':
+            return HtmlEntity
+        # Otherwise normal lookup
+        return None
+
+
+################################################################################
+# parsing
+################################################################################
+
+# Matchers that tell whether input starts (after optional whitespace) with
+# ``<html`` or ``<!doctype``, case-insensitively; one for text input and one
+# for byte input.
+_looks_like_full_html_unicode = re.compile(
+    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
+_looks_like_full_html_bytes = re.compile(
+    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
+
+
+def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
+    """Parse ``html`` as a complete document and return its root element.
+
+    ``parser`` defaults to the module's ``html_parser``; extra keyword
+    arguments are passed on to ``etree.fromstring()``.  If
+    ``ensure_head_body`` is true, a missing ``head`` and/or ``body`` child
+    is added to the root.
+
+    Raises ``etree.ParserError`` if parsing yields no document.
+    """
+    active_parser = html_parser if parser is None else parser
+    root = etree.fromstring(html, active_parser, **kw)
+    if root is None:
+        raise etree.ParserError(
+            "Document is empty")
+    if ensure_head_body:
+        if root.find('head') is None:
+            root.insert(0, Element('head'))
+        if root.find('body') is None:
+            root.append(Element('body'))
+    return root
+
+
+def fragments_fromstring(html, no_leading_text=False, base_url=None,
+                         parser=None, **kw):
+    """Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string (non-blank leading text).
+    If no_leading_text is true, then such leading text raises
+    ``etree.ParserError`` and the list always contains only elements.
+
+    Input that does not already look like a full document is wrapped in
+    ``<html><body>...</body></html>`` before parsing.
+
+    base_url will set the document's base_url attribute
+    (and the tree's docinfo.URL).
+    """
+    if parser is None:
+        parser = html_parser
+    # FIXME: check what happens when you give html with a body, head, etc.
+    if isinstance(html, bytes):
+        if not _looks_like_full_html_bytes(html):
+            # can't use %-formatting in early Py3 versions
+            opening = '<html><body>'.encode('ascii')
+            closing = '</body></html>'.encode('ascii')
+            html = opening + html + closing
+    elif not _looks_like_full_html_unicode(html):
+        html = '<html><body>%s</body></html>' % html
+    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+    assert _nons(doc.tag) == 'html'
+    bodies = []
+    for child in doc:
+        if _nons(child.tag) == 'body':
+            bodies.append(child)
+    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
+    body = bodies[0]
+    elements = []
+    leading_text = body.text
+    if leading_text and leading_text.strip():
+        if no_leading_text:
+            raise etree.ParserError(
+                "There is leading text: %r" % leading_text)
+        elements.append(leading_text)
+    elements.extend(body)
+    # FIXME: removing the reference to the parent artificial document
+    # would be nice
+    return elements
+
+
+def fragment_fromstring(html, create_parent=False, base_url=None,
+                        parser=None, **kw):
+    """
+    Parses a single HTML element; it is an error if there is more than
+    one element, or if anything but whitespace precedes or follows the
+    element.
+
+    If ``create_parent`` is true (or is a tag name) then a parent node
+    will be created to encapsulate the HTML in a single element ('div'
+    unless a tag name is given).  In this case, leading or trailing text
+    is also allowed, as are multiple elements as result of the parsing.
+
+    Passing a ``base_url`` will set the document's ``base_url`` attribute
+    (and the tree's docinfo.URL).
+
+    Raises ``etree.ParserError`` when the single-element rule is violated.
+    """
+    if parser is None:
+        parser = html_parser
+
+    leading_text_allowed = bool(create_parent)
+
+    elements = fragments_fromstring(
+        html, parser=parser, no_leading_text=not leading_text_allowed,
+        base_url=base_url, **kw)
+
+    if create_parent:
+        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
+        new_root = Element(parent_tag)
+        if elements:
+            if isinstance(elements[0], basestring):
+                # Leading text becomes the text of the new parent.
+                new_root.text = elements.pop(0)
+            new_root.extend(elements)
+        return new_root
+
+    if not elements:
+        raise etree.ParserError('No elements found')
+    if len(elements) > 1:
+        names = [_element_name(e) for e in elements]
+        raise etree.ParserError(
+            "Multiple elements found (%s)"
+            % ', '.join(names))
+    single = elements[0]
+    if single.tail and single.tail.strip():
+        raise etree.ParserError(
+            "Element followed by text: %r" % single.tail)
+    single.tail = None
+    return single
+
+
+def fromstring(html, base_url=None, parser=None, **kw):
+    """
+    Parse the html, returning a single element/document.
+
+    This tries to minimally parse the chunk of text, without knowing if it
+    is a fragment or a document.
+
+    The input is always parsed with ``document_fromstring()``; what is
+    returned depends on what the input looked like:
+
+    * the whole document, if the input looks like a full HTML document,
+      if the parsed tree has a ``<head>``, or if it has no ``<body>``;
+    * the single child of ``<body>``, if the body holds exactly one
+      element and no non-whitespace text around it;
+    * otherwise the ``<body>`` element itself, renamed to ``div`` when it
+      contains a block-level tag and to ``span`` when it does not.
+
+    ``parser`` defaults to the module-level ``html_parser``.
+
+    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    """
+    if parser is None:
+        parser = html_parser
+    # Decide on the raw input, before parsing, whether it looks like a
+    # complete document (separate helpers for bytes and text input).
+    if isinstance(html, bytes):
+        is_full_html = _looks_like_full_html_bytes(html)
+    else:
+        is_full_html = _looks_like_full_html_unicode(html)
+    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+    if is_full_html:
+        return doc
+    # otherwise, lets parse it out...
+    # Look for <body> children, first un-namespaced, then in the XHTML
+    # namespace.
+    bodies = doc.findall('body')
+    if not bodies:
+        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
+    if bodies:
+        body = bodies[0]
+        if len(bodies) > 1:
+            # Somehow there are multiple bodies, which is bad, but just
+            # smash them into one body
+            for other_body in bodies[1:]:
+                if other_body.text:
+                    # Append the extra body's leading text after whatever
+                    # content the first body already ends with.
+                    if len(body):
+                        body[-1].tail = (body[-1].tail or '') + other_body.text
+                    else:
+                        body.text = (body.text or '') + other_body.text
+                # Move the extra body's children into the first body.
+                body.extend(other_body)
+                # We'll ignore tail
+                # I guess we are ignoring attributes too
+                other_body.drop_tree()
+    else:
+        body = None
+    # Same lookup for <head>, again trying both spellings of the tag.
+    heads = doc.findall('head')
+    if not heads:
+        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
+    if heads:
+        # Well, we have some sort of structure, so lets keep it all
+        head = heads[0]
+        if len(heads) > 1:
+            # Merge the children of any additional heads into the first.
+            for other_head in heads[1:]:
+                head.extend(other_head)
+                # We don't care about text or tail in a head
+                other_head.drop_tree()
+        return doc
+    if body is None:
+        # Neither head nor body: nothing to unwrap, return the document.
+        return doc
+    if (len(body) == 1 and (not body.text or not body.text.strip())
+        and (not body[-1].tail or not body[-1].tail.strip())):
+        # The body has just one element, so it was probably a single
+        # element passed in
+        return body[0]
+    # Now we have a body which represents a bunch of tags which have the
+    # content that was passed in.  We will create a fake container, which
+    # is the body tag, except <body> implies too much structure.
+    if _contains_block_level_tag(body):
+        body.tag = 'div'
+    else:
+        body.tag = 'span'
+    return body
+
+
+def parse(filename_or_url, parser=None, base_url=None, **kw):
+    """
+    Parse a filename, URL, or file-like object into an HTML document
+    tree.  Note: this returns a tree, not an element.  Use
+    ``parse(...).getroot()`` to get the document root.
+
+    You can override the base URL with the ``base_url`` keyword.  This
+    is most useful when parsing from a file-like object.
+    """
+    if parser is None:
+        parser = html_parser
+    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
+
+
+def _contains_block_level_tag(el):
+    # FIXME: I could do this with XPath, but would that just be
+    # unnecessarily slow?
+    for el in el.iter(etree.Element):
+        if _nons(el.tag) in defs.block_tags:
+            return True
+    return False
+
+
+def _element_name(el):
+    if isinstance(el, etree.CommentBase):
+        return 'comment'
+    elif isinstance(el, basestring):
+        return 'string'
+    else:
+        return _nons(el.tag)
+
+
+################################################################################
+# form handling
+################################################################################
+
+class FormElement(HtmlElement):
+    """
+    Represents a <form> element.
+    """
+
+    @property
+    def inputs(self):
+        """
+        Returns an accessor for all the input elements in the form.
+
+        See `InputGetter` for more information about the object.
+        """
+        return InputGetter(self)
+
+    @property
+    def fields(self):
+        """
+        Dictionary-like object that represents all the fields in this
+        form.  You can set values in this dictionary to effect the
+        form.
+        """
+        return FieldsDict(self.inputs)
+
+    @fields.setter
+    def fields(self, value):
+        fields = self.fields
+        prev_keys = fields.keys()
+        for key, value in value.items():
+            if key in prev_keys:
+                prev_keys.remove(key)
+            fields[key] = value
+        for key in prev_keys:
+            if key is None:
+                # Case of an unnamed input; these aren't really
+                # expressed in form_values() anyway.
+                continue
+            fields[key] = None
+
+    def _name(self):
+        if self.get('name'):
+            return self.get('name')
+        elif self.get('id'):
+            return '#' + self.get('id')
+        iter_tags = self.body.iter
+        forms = list(iter_tags('form'))
+        if not forms:
+            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
+        return str(forms.index(self))
+
+    def form_values(self):
+        """
+        Return a list of tuples of the field values for the form.
+        This is suitable to be passed to ``urllib.urlencode()``.
+        """
+        results = []
+        for el in self.inputs:
+            name = el.name
+            if not name or 'disabled' in el.attrib:
+                continue
+            tag = _nons(el.tag)
+            if tag == 'textarea':
+                results.append((name, el.value))
+            elif tag == 'select':
+                value = el.value
+                if el.multiple:
+                    for v in value:
+                        results.append((name, v))
+                elif value is not None:
+                    results.append((name, el.value))
+            else:
+                assert tag == 'input', (
+                    "Unexpected tag: %r" % el)
+                if el.checkable and not el.checked:
+                    continue
+                if el.type in ('submit', 'image', 'reset', 'file'):
+                    continue
+                value = el.value
+                if value is not None:
+                    results.append((name, el.value))
+        return results
+
+    @property
+    def action(self):
+        """
+        Get/set the form's ``action`` attribute.
+        """
+        base_url = self.base_url
+        action = self.get('action')
+        if base_url and action is not None:
+            return urljoin(base_url, action)
+        else:
+            return action
+
+    @action.setter
+    def action(self, value):
+        self.set('action', value)
+
+    @action.deleter
+    def action(self):
+        attrib = self.attrib
+        if 'action' in attrib:
+            del attrib['action']
+
+    @property
+    def method(self):
+        """
+        Get/set the form's method.  Always returns a capitalized
+        string, and defaults to ``'GET'``
+        """
+        return self.get('method', 'GET').upper()
+
+    @method.setter
+    def method(self, value):
+        self.set('method', value.upper())
+
+
+HtmlElementClassLookup._default_element_classes['form'] = FormElement
+
+
+def submit_form(form, extra_values=None, open_http=None):
+    """
+    Helper function to submit a form.  Returns a file-like object, as from
+    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
+    which shows the URL if there were any redirects.
+
+    You can use this like::
+
+        form = doc.forms[0]
+        form.inputs['foo'].value = 'bar' # etc
+        response = form.submit()
+        doc = parse(response)
+        doc.make_links_absolute(response.geturl())
+
+    To change the HTTP requester, pass a function as ``open_http`` keyword
+    argument that opens the URL for you.  The function must have the following
+    signature::
+
+        open_http(method, URL, values)
+
+    The action is one of 'GET' or 'POST', the URL is the target URL as a
+    string, and the values are a sequence of ``(name, value)`` tuples with the
+    form data.
+    """
+    values = form.form_values()
+    if extra_values:
+        if hasattr(extra_values, 'items'):
+            extra_values = extra_values.items()
+        values.extend(extra_values)
+    if open_http is None:
+        open_http = open_http_urllib
+    if form.action:
+        url = form.action
+    else:
+        url = form.base_url
+    return open_http(form.method, url, values)
+
+
+def open_http_urllib(method, url, values):
+    if not url:
+        raise ValueError("cannot submit, no URL provided")
+    ## FIXME: should test that it's not a relative URL or something
+    try:
+        from urllib import urlencode, urlopen
+    except ImportError: # Python 3
+        from urllib.request import urlopen
+        from urllib.parse import urlencode
+    if method == 'GET':
+        if '?' in url:
+            url += '&'
+        else:
+            url += '?'
+        url += urlencode(values)
+        data = None
+    else:
+        data = urlencode(values)
+        if not isinstance(data, bytes):
+            data = data.encode('ASCII')
+    return urlopen(url, data)
+
+
+class FieldsDict(MutableMapping):
+
+    def __init__(self, inputs):
+        self.inputs = inputs
+    def __getitem__(self, item):
+        return self.inputs[item].value
+    def __setitem__(self, item, value):
+        self.inputs[item].value = value
+    def __delitem__(self, item):
+        raise KeyError(
+            "You cannot remove keys from ElementDict")
+    def keys(self):
+        return self.inputs.keys()
+    def __contains__(self, item):
+        return item in self.inputs
+    def __iter__(self):
+        return iter(self.inputs.keys())
+    def __len__(self):
+        return len(self.inputs)
+
+    def __repr__(self):
+        return '<%s for form %s>' % (
+            self.__class__.__name__,
+            self.inputs.form._name())
+
+
+class InputGetter(object):
+
+    """
+    An accessor that represents all the input fields in a form.
+
+    You can get fields by name from this, with
+    ``form.inputs['field_name']``.  If there are a set of checkboxes
+    with the same name, they are returned as a list (a `CheckboxGroup`
+    which also allows value setting).  Radio inputs are handled
+    similarly.
+
+    You can also iterate over this to get all input elements.  This
+    won't return the same thing as if you get all the names, as
+    checkboxes and radio elements are returned individually.
+    """
+
+    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
+    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")
+
+    def __init__(self, form):
+        self.form = form
+
+    def __repr__(self):
+        return '<%s for form %s>' % (
+            self.__class__.__name__,
+            self.form._name())
+
+    ## FIXME: there should be more methods, and it's unclear if this is
+    ## a dictionary-like object or list-like object
+
+    def __getitem__(self, name):
+        results = self._name_xpath(self.form, name=name)
+        if results:
+            type = results[0].get('type')
+            if type == 'radio' and len(results) > 1:
+                group = RadioGroup(results)
+                group.name = name
+                return group
+            elif type == 'checkbox' and len(results) > 1:
+                group = CheckboxGroup(results)
+                group.name = name
+                return group
+            else:
+                # I don't like throwing away elements like this
+                return results[0]
+        else:
+            raise KeyError(
+                "No input element with the name %r" % name)
+
+    def __contains__(self, name):
+        results = self._name_xpath(self.form, name=name)
+        return bool(results)
+
+    def keys(self):
+        names = set()
+        for el in self:
+            names.add(el.name)
+        if None in names:
+            names.remove(None)
+        return list(names)
+
+    def __iter__(self):
+        ## FIXME: kind of dumb to turn a list into an iterator, only
+        ## to have it likely turned back into a list again :(
+        return iter(self._all_xpath(self.form))
+
+
+class InputMixin(object):
+    """
+    Mix-in for all input elements (input, select, and textarea)
+    """
+    @property
+    def name(self):
+        """
+        Get/set the name of the element
+        """
+        return self.get('name')
+
+    @name.setter
+    def name(self, value):
+        self.set('name', value)
+
+    @name.deleter
+    def name(self):
+        attrib = self.attrib
+        if 'name' in attrib:
+            del attrib['name']
+
+    def __repr__(self):
+        type_name = getattr(self, 'type', None)
+        if type_name:
+            type_name = ' type=%r' % type_name
+        else:
+            type_name = ''
+        return '<%s %x name=%r%s>' % (
+            self.__class__.__name__, id(self), self.name, type_name)
+
+
+class TextareaElement(InputMixin, HtmlElement):
+    """
+    ``<textarea>`` element.  You can get the name with ``.name`` and
+    get/set the value with ``.value``
+    """
+    @property
+    def value(self):
+        """
+        Get/set the value (which is the contents of this element)
+        """
+        content = self.text or ''
+        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
+            serialisation_method = 'xml'
+        else:
+            serialisation_method = 'html'
+        for el in self:
+            # it's rare that we actually get here, so let's not use ''.join()
+            content += etree.tostring(
+                el, method=serialisation_method, encoding='unicode')
+        return content
+
+    @value.setter
+    def value(self, value):
+        del self[:]
+        self.text = value
+
+    @value.deleter
+    def value(self):
+        self.text = ''
+        del self[:]
+
+
+HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
+
+
+class SelectElement(InputMixin, HtmlElement):
+    """
+    ``<select>`` element.  You can get the name with ``.name``.
+
+    ``.value`` will be the value of the selected option, unless this
+    is a multi-select element (``<select multiple>``), in which case
+    it will be a set-like object.  In either case ``.value_options``
+    gives the possible values.
+
+    The boolean attribute ``.multiple`` shows if this is a
+    multi-select.
+    """
+    @property
+    def value(self):
+        """
+        Get/set the value of this select (the selected option).
+
+        If this is a multi-select, this is a set-like object that
+        represents all the selected options.
+        """
+        if self.multiple:
+            return MultipleSelectOptions(self)
+        options = _options_xpath(self)
+
+        try:
+            selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
+        except StopIteration:
+            try:
+                selected_option = next(el for el in options if el.get('disabled') is None)
+            except StopIteration:
+                return None
+        value = selected_option.get('value')
+        if value is None:
+            value = (selected_option.text or '').strip()
+        return value
+
+    @value.setter
+    def value(self, value):
+        if self.multiple:
+            if isinstance(value, basestring):
+                raise TypeError("You must pass in a sequence")
+            values = self.value
+            values.clear()
+            values.update(value)
+            return
+        checked_option = None
+        if value is not None:
+            for el in _options_xpath(self):
+                opt_value = el.get('value')
+                if opt_value is None:
+                    opt_value = (el.text or '').strip()
+                if opt_value == value:
+                    checked_option = el
+                    break
+            else:
+                raise ValueError(
+                    "There is no option with the value of %r" % value)
+        for el in _options_xpath(self):
+            if 'selected' in el.attrib:
+                del el.attrib['selected']
+        if checked_option is not None:
+            checked_option.set('selected', '')
+
+    @value.deleter
+    def value(self):
+        # FIXME: should del be allowed at all?
+        if self.multiple:
+            self.value.clear()
+        else:
+            self.value = None
+
+    @property
+    def value_options(self):
+        """
+        All the possible values this select can have (the ``value``
+        attribute of all the ``<option>`` elements.
+        """
+        options = []
+        for el in _options_xpath(self):
+            value = el.get('value')
+            if value is None:
+                value = (el.text or '').strip()
+            options.append(value)
+        return options
+
+    @property
+    def multiple(self):
+        """
+        Boolean attribute: is there a ``multiple`` attribute on this element.
+        """
+        return 'multiple' in self.attrib
+
+    @multiple.setter
+    def multiple(self, value):
+        if value:
+            self.set('multiple', '')
+        elif 'multiple' in self.attrib:
+            del self.attrib['multiple']
+
+
+HtmlElementClassLookup._default_element_classes['select'] = SelectElement
+
+
+class MultipleSelectOptions(SetMixin):
+    """
+    Represents all the selected options in a ``<select multiple>`` element.
+
+    You can add to this set-like option to select an option, or remove
+    to unselect the option.
+    """
+
+    def __init__(self, select):
+        self.select = select
+
+    @property
+    def options(self):
+        """
+        Iterator of all the ``<option>`` elements.
+        """
+        return iter(_options_xpath(self.select))
+
+    def __iter__(self):
+        for option in self.options:
+            if 'selected' in option.attrib:
+                opt_value = option.get('value')
+                if opt_value is None:
+                    opt_value = (option.text or '').strip()
+                yield opt_value
+
+    def add(self, item):
+        for option in self.options:
+            opt_value = option.get('value')
+            if opt_value is None:
+                opt_value = (option.text or '').strip()
+            if opt_value == item:
+                option.set('selected', '')
+                break
+        else:
+            raise ValueError(
+                "There is no option with the value %r" % item)
+
+    def remove(self, item):
+        for option in self.options:
+            opt_value = option.get('value')
+            if opt_value is None:
+                opt_value = (option.text or '').strip()
+            if opt_value == item:
+                if 'selected' in option.attrib:
+                    del option.attrib['selected']
+                else:
+                    raise ValueError(
+                        "The option %r is not currently selected" % item)
+                break
+        else:
+            raise ValueError(
+                "There is not option with the value %r" % item)
+
+    def __repr__(self):
+        return '<%s {%s} for select name=%r>' % (
+            self.__class__.__name__,
+            ', '.join([repr(v) for v in self]),
+            self.select.name)
+
+
+class RadioGroup(list):
+    """
+    This object represents several ``<input type=radio>`` elements
+    that have the same name.
+
+    You can use this like a list, but also use the property
+    ``.value`` to check/uncheck inputs.  Also you can use
+    ``.value_options`` to get the possible values.
+    """
+    @property
+    def value(self):
+        """
+        Get/set the value, which checks the radio with that value (and
+        unchecks any other value).
+        """
+        for el in self:
+            if 'checked' in el.attrib:
+                return el.get('value')
+        return None
+
+    @value.setter
+    def value(self, value):
+        checked_option = None
+        if value is not None:
+            for el in self:
+                if el.get('value') == value:
+                    checked_option = el
+                    break
+            else:
+                raise ValueError("There is no radio input with the value %r" % value)
+        for el in self:
+            if 'checked' in el.attrib:
+                del el.attrib['checked']
+        if checked_option is not None:
+            checked_option.set('checked', '')
+
+    @value.deleter
+    def value(self):
+        self.value = None
+
+    @property
+    def value_options(self):
+        """
+        Returns a list of all the possible values.
+        """
+        return [el.get('value') for el in self]
+
+    def __repr__(self):
+        return '%s(%s)' % (
+            self.__class__.__name__,
+            list.__repr__(self))
+
+
+class CheckboxGroup(list):
+    """
+    Represents a group of checkboxes (``<input type=checkbox>``) that
+    have the same name.
+
+    In addition to using this like a list, the ``.value`` attribute
+    returns a set-like object that you can add to or remove from to
+    check and uncheck checkboxes.  You can also use ``.value_options``
+    to get the possible values.
+    """
+    @property
+    def value(self):
+        """
+        Return a set-like object that can be modified to check or
+        uncheck individual checkboxes according to their value.
+        """
+        return CheckboxValues(self)
+
+    @value.setter
+    def value(self, value):
+        values = self.value
+        values.clear()
+        if not hasattr(value, '__iter__'):
+            raise ValueError(
+                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
+                % (self[0].name, value))
+        values.update(value)
+
+    @value.deleter
+    def value(self):
+        self.value.clear()
+
+    @property
+    def value_options(self):
+        """
+        Returns a list of all the possible values.
+        """
+        return [el.get('value') for el in self]
+
+    def __repr__(self):
+        return '%s(%s)' % (
+            self.__class__.__name__, list.__repr__(self))
+
+
+class CheckboxValues(SetMixin):
+    """
+    Represents the values of the checked checkboxes in a group of
+    checkboxes with the same name.
+    """
+
+    def __init__(self, group):
+        self.group = group
+
+    def __iter__(self):
+        return iter([
+            el.get('value')
+            for el in self.group
+            if 'checked' in el.attrib])
+
+    def add(self, value):
+        for el in self.group:
+            if el.get('value') == value:
+                el.set('checked', '')
+                break
+        else:
+            raise KeyError("No checkbox with value %r" % value)
+
+    def remove(self, value):
+        for el in self.group:
+            if el.get('value') == value:
+                if 'checked' in el.attrib:
+                    del el.attrib['checked']
+                else:
+                    raise KeyError(
+                        "The checkbox with value %r was already unchecked" % value)
+                break
+        else:
+            raise KeyError(
+                "No checkbox with value %r" % value)
+
+    def __repr__(self):
+        return '<%s {%s} for checkboxes name=%r>' % (
+            self.__class__.__name__,
+            ', '.join([repr(v) for v in self]),
+            self.group.name)
+
+
+class InputElement(InputMixin, HtmlElement):
+    """
+    Represents an ``<input>`` element.
+
+    You can get the type with ``.type`` (which is lower-cased and
+    defaults to ``'text'``).
+
+    Also you can get and set the value with ``.value``
+
+    Checkboxes and radios have the attribute ``input.checkable ==
+    True`` (for all others it is false) and a boolean attribute
+    ``.checked``.
+
+    """
+
+    ## FIXME: I'm a little uncomfortable with the use of .checked
+    @property
+    def value(self):
+        """
+        Get/set the value of this element, using the ``value`` attribute.
+
+        Also, if this is a checkbox and it has no value, this defaults
+        to ``'on'``.  If it is a checkbox or radio that is not
+        checked, this returns None.
+        """
+        if self.checkable:
+            if self.checked:
+                return self.get('value') or 'on'
+            else:
+                return None
+        return self.get('value')
+
+    @value.setter
+    def value(self, value):
+        if self.checkable:
+            if not value:
+                self.checked = False
+            else:
+                self.checked = True
+                if isinstance(value, basestring):
+                    self.set('value', value)
+        else:
+            self.set('value', value)
+
+    @value.deleter
+    def value(self):
+        if self.checkable:
+            self.checked = False
+        else:
+            if 'value' in self.attrib:
+                del self.attrib['value']
+
+    @property
+    def type(self):
+        """
+        Return the type of this element (using the type attribute).
+        """
+        return self.get('type', 'text').lower()
+
+    @type.setter
+    def type(self, value):
+        self.set('type', value)
+
+    @property
+    def checkable(self):
+        """
+        Boolean: can this element be checked?
+        """
+        return self.type in ('checkbox', 'radio')
+
+    @property
+    def checked(self):
+        """
+        Boolean attribute to get/set the presence of the ``checked``
+        attribute.
+
+        You can only use this on checkable input types.
+        """
+        if not self.checkable:
+            raise AttributeError('Not a checkable input type')
+        return 'checked' in self.attrib
+
+    @checked.setter
+    def checked(self, value):
+        if not self.checkable:
+            raise AttributeError('Not a checkable input type')
+        if value:
+            self.set('checked', '')
+        else:
+            attrib = self.attrib
+            if 'checked' in attrib:
+                del attrib['checked']
+
+
+HtmlElementClassLookup._default_element_classes['input'] = InputElement
+
+
+class LabelElement(HtmlElement):
+    """
+    Represents a ``<label>`` element.
+
+    Label elements are linked to other elements with their ``for``
+    attribute.  You can access this element with ``label.for_element``.
+    """
+    @property
+    def for_element(self):
+        """
+        Get/set the element this label points to.  Return None if it
+        can't be found.
+        """
+        id = self.get('for')
+        if not id:
+            return None
+        return self.body.get_element_by_id(id)
+
+    @for_element.setter
+    def for_element(self, other):
+        id = other.get('id')
+        if not id:
+            raise TypeError(
+                "Element %r has no id attribute" % other)
+        self.set('for', id)
+
+    @for_element.deleter
+    def for_element(self):
+        attrib = self.attrib
+        if 'id' in attrib:
+            del attrib['id']
+
+
+HtmlElementClassLookup._default_element_classes['label'] = LabelElement
+
+
+############################################################
+## Serialization
+############################################################
+
+def html_to_xhtml(html):
+    """Convert all tags in an HTML tree to XHTML by moving them to the
+    XHTML namespace.
+    """
+    try:
+        html = html.getroot()
+    except AttributeError:
+        pass
+    prefix = "{%s}" % XHTML_NAMESPACE
+    for el in html.iter(etree.Element):
+        tag = el.tag
+        if tag[0] != '{':
+            el.tag = prefix + tag
+
+
+def xhtml_to_html(xhtml):
+    """Convert all tags in an XHTML tree to HTML by removing their
+    XHTML namespace.
+    """
+    try:
+        xhtml = xhtml.getroot()
+    except AttributeError:
+        pass
+    prefix = "{%s}" % XHTML_NAMESPACE
+    prefix_len = len(prefix)
+    for el in xhtml.iter(prefix + "*"):
+        el.tag = el.tag[prefix_len:]
+
+
+# Substitution helpers (the bound ``sub`` methods of compiled patterns)
+# that tostring() uses to strip <meta http-equiv="Content-Type" ...> tags
+# from serialised output: one for text results, one for byte results.
+# This isn't a general match, but it's a match for what libxml2
+# specifically serialises:
+__str_replace_meta_content_type = re.compile(
+    r'<meta http-equiv="Content-Type"[^>]*>').sub
+__bytes_replace_meta_content_type = re.compile(
+    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
+
+
+def tostring(doc, pretty_print=False, include_meta_content_type=False,
+             encoding=None, method="html", with_tail=True, doctype=None):
+    """Return an HTML string representation of the document.
+
+    Note: if include_meta_content_type is true this will create a
+    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
+    regardless of the value of include_meta_content_type any existing
+    ``<meta http-equiv="Content-Type" ...>`` tag will be removed
+
+    The ``encoding`` argument controls the output encoding (defaults to
+    ASCII, with &#...; character references for any characters outside
+    of ASCII).  Note that you can pass the name ``'unicode'`` as
+    ``encoding`` argument to serialise to a Unicode string.
+
+    The ``method`` argument defines the output method.  It defaults to
+    'html', but can also be 'xml' for xhtml output, or 'text' to
+    serialise to plain text without markup.
+
+    To leave out the tail text of the top-level element that is being
+    serialised, pass ``with_tail=False``.
+
+    The ``doctype`` option allows passing in a plain string that will
+    be serialised before the XML tree.  Note that passing in non
+    well-formed content here will make the XML output non well-formed.
+    Also, an existing doctype in the document tree will not be removed
+    when serialising an ElementTree instance.
+
+    Example::
+
+        >>> from lxml import html
+        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')
+
+        >>> html.tostring(root)
+        b'<p>Hello<br>world!</p>'
+        >>> html.tostring(root, method='html')
+        b'<p>Hello<br>world!</p>'
+
+        >>> html.tostring(root, method='xml')
+        b'<p>Hello<br/>world!</p>'
+
+        >>> html.tostring(root, method='text')
+        b'Helloworld!'
+
+        >>> html.tostring(root, method='text', encoding='unicode')
+        u'Helloworld!'
+
+        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
+        >>> html.tostring(root[0], method='text', encoding='unicode')
+        u'Helloworld!TAIL'
+
+        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
+        u'Helloworld!'
+
+        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
+        >>> html.tostring(doc, method='html', encoding='unicode')
+        u'<html><body><p>Hello<br>world!</p></body></html>'
+
+        >>> print(html.tostring(doc, method='html', encoding='unicode',
+        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
+        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
+        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+        <html><body><p>Hello<br>world!</p></body></html>
+    """
+    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
+                          encoding=encoding, with_tail=with_tail,
+                          doctype=doctype)
+    if method == 'html' and not include_meta_content_type:
+        if isinstance(html, str):
+            html = __str_replace_meta_content_type('', html)
+        else:
+            html = __bytes_replace_meta_content_type(bytes(), html)
+    return html
+
+
+# Replace the docstring of ``tostring`` with the result of passing it
+# through ``__fix_docstring``.
+# NOTE(review): presumably this adapts the string-literal prefixes shown in
+# the doctest output (``b'...'`` / ``u'...'``) to the running Python
+# version -- confirm against ``__fix_docstring``.
+tostring.__doc__ = __fix_docstring(tostring.__doc__)
+
+
+def open_in_browser(doc, encoding=None):
+    """
+    Write the HTML document to a temporary ``.html`` file and open that
+    file in a web browser.
+
+    ``doc`` may be an element or an ElementTree; anything that is not
+    already an ElementTree is wrapped in one first.  The output encoding
+    is the ``encoding`` argument if given, otherwise the encoding
+    recorded in the document's docinfo, otherwise UTF-8.
+
+    Note that the temporary file is not deleted after use.  This is
+    mainly meant for debugging.
+    """
+    import os
+    import webbrowser
+    import tempfile
+
+    tree = doc if isinstance(doc, etree._ElementTree) else etree.ElementTree(doc)
+    fd, path = tempfile.mkstemp(suffix='.html')
+    # The file itself is left behind on disk, but its handle is always
+    # closed again, even when serialisation fails.
+    with os.fdopen(fd, 'wb') as out:
+        output_encoding = encoding or tree.docinfo.encoding or "UTF-8"
+        tree.write(out, method="html", encoding=output_encoding)
+    url = 'file://' + path.replace(os.path.sep, '/')
+    print(url)
+    webbrowser.open(url)
+
+
+################################################################################
+# configure Element class lookup
+################################################################################
+
+class HTMLParser(etree.HTMLParser):
+    """HTML parser set up so that the elements it returns are
+    lxml.html Element objects.
+
+    All keyword arguments are forwarded unchanged to
+    ``etree.HTMLParser``.
+    """
+    def __init__(self, **kwargs):
+        super(HTMLParser, self).__init__(**kwargs)
+        # Install the HTML element class lookup once the base parser has
+        # been initialised.
+        element_lookup = HtmlElementClassLookup()
+        self.set_element_class_lookup(element_lookup)
+
+
+class XHTMLParser(etree.XMLParser):
+    """XML parser set up so that the elements it returns are
+    lxml.html Element objects.
+
+    All keyword arguments are forwarded unchanged to
+    ``etree.XMLParser``.
+
+    Note that this parser is not really XHTML aware unless you let it
+    load a DTD that declares the HTML entities.  To do this, make sure
+    you have the XHTML DTDs installed in your catalogs, and create the
+    parser like this::
+
+        >>> parser = XHTMLParser(load_dtd=True)
+
+    If you additionally want to validate the document, use this::
+
+        >>> parser = XHTMLParser(dtd_validation=True)
+
+    For catalog support, see http://www.xmlsoft.org/catalog.html.
+    """
+    def __init__(self, **kwargs):
+        super(XHTMLParser, self).__init__(**kwargs)
+        # Install the HTML element class lookup once the base parser has
+        # been initialised.
+        element_lookup = HtmlElementClassLookup()
+        self.set_element_class_lookup(element_lookup)
+
+
+def Element(*args, **kw):
+    """Build a new HTML Element with the module-level ``html_parser``.
+
+    All positional and keyword arguments are handed on to the parser's
+    ``makeelement()``.  This can also be used for XHTML documents.
+    """
+    return html_parser.makeelement(*args, **kw)
+
+
+# Module-level default parser instances, created once when this module is
+# imported.  ``html_parser`` is the parser ``Element()`` creates elements
+# with.
+html_parser = HTMLParser()
+xhtml_parser = XHTMLParser()
author	guerler
date	Fri, 31 Jul 2020 00:32:28 -0400
parents
children