comparison planemo/lib/python3.7/site-packages/lxml/html/__init__.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # Copyright (c) 2004 Ian Bicking. All rights reserved.
2 #
3 # Redistribution and use in source and binary forms, with or without
4 # modification, are permitted provided that the following conditions are
5 # met:
6 #
7 # 1. Redistributions of source code must retain the above copyright
8 # notice, this list of conditions and the following disclaimer.
9 #
10 # 2. Redistributions in binary form must reproduce the above copyright
11 # notice, this list of conditions and the following disclaimer in
12 # the documentation and/or other materials provided with the
13 # distribution.
14 #
15 # 3. Neither the name of Ian Bicking nor the names of its contributors may
16 # be used to endorse or promote products derived from this software
17 # without specific prior written permission.
18 #
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 from __future__ import absolute_import
35
# Names exported by ``from lxml.html import *``.  Each name is listed once
# ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41
42
43 import copy
44 import sys
45 import re
46 from functools import partial
47
48 try:
49 from collections.abc import MutableMapping, MutableSet
50 except ImportError:
51 from collections import MutableMapping, MutableSet
52
53 from .. import etree
54 from . import defs
55 from ._setmixin import SetMixin
56
57 try:
58 from urlparse import urljoin
59 except ImportError:
60 # Python 3
61 from urllib.parse import urljoin
62
# Python 2/3 compatibility aliases.  Where the built-in names ``unicode`` and
# ``basestring`` do not exist (Python 3), define stand-ins so the rest of the
# module can use them for type checks.
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    # Python 3: "any string" means text or bytes
    basestring = (str, bytes)
73
74
def __fix_docstring(s):
    """Normalise string-literal prefixes at the start of docstring lines.

    On Python 3 a leading ``u'`` (after optional whitespace) is rewritten to
    ``'``; on Python 2 a leading ``b'`` is rewritten instead.  A falsy ``s``
    (``None`` or the empty string) is returned unchanged.
    """
    if not s:
        return s
    if sys.version_info[0] >= 3:
        pattern = r"^(\s*)u'"
    else:
        pattern = r"^(\s*)b'"
    # Keep the captured indentation, drop only the prefix letter.
    return re.sub(pattern, r"\1'", s, flags=re.M)
83
84
# Namespace URI used to recognise XHTML-namespaced tags.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Most of them match both the plain HTML tag
# and its XHTML-namespaced twin (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-normalised class attribute contains
# $class_name as a complete, space-delimited word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Iterators over CSS ``url(...)`` references (double-quoted, single-quoted or
# bare) and over double-quoted ``@import "..."`` statements.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# Absolute path: searches the whole document for labels pointing at $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One space-free run of characters; used to split <object archive="...">.
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part of a <meta http-equiv="refresh"> content value:
# everything after the first ';', with an optional "url=" prefix skipped.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
104
105
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` both starts and ends with the
    same quote character (``"`` or ``'``), the quotes are removed and the
    position is advanced by one to point past the opening quote; otherwise
    ``s`` and ``pos`` are returned unchanged.
    """
    for quote in ('"', "'"):
        if s[:1] == quote and s[-1:] == quote:
            return s[1:-1], pos + 1
    return s, pos
111
112
def _transform_result(typ, result):
    """Serialise ``result`` back into the type the caller passed in.

    ``typ`` is the type of the original input: a bytes type yields UTF-8
    encoded output, a text type yields a text string, and anything else
    gets ``result`` back untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
122
123
def _nons(tag):
    """Return ``tag`` without a leading ``{XHTML namespace}`` qualifier.

    Non-string tags and tags in any other (or no) namespace are returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    in_xhtml = tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE
    return tag.split('}')[-1] if in_xhtml else tag
129
130
class Classes(MutableSet):
    """Set-like view onto the ``class`` attribute of an element.

    The object wraps an attribute mapping; every change is written straight
    back to its ``'class'`` entry as a space-separated string that keeps the
    order in which names were added.

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        self._attributes = attributes
        # Zero-argument getter for the current raw attribute value ('' if unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class name; a name that is already present is left alone.

        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class name if present; do nothing if it is absent.

        The attribute itself is deleted when no names remain.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        kept = ' '.join(
            name for name in self._get_class_value().split() if name != value)
        if kept:
            self._attributes['class'] = kept
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class name that must be present.

        Raises KeyError if it is absent and ValueError for an empty name or
        one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # Cheap substring test first; only split when it could be a hit.
        if name not in raw:
            return False
        return name in raw.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add every name from 'values' that is not present yet.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only touch the attribute when something was actually appended.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Flip a class name: add it when missing, remove it when present.

        Returns true if the name was added (now enabled) and false if it
        was removed (now disabled).  Raises ValueError for an empty name or
        one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
237
238
class HtmlMixin(object):
    """HTML-specific convenience API that is mixed into the element classes.

    Provides attribute helpers, document navigation (``body``, ``head``,
    ``forms``), tree editing (``drop_tree``, ``drop_tag``), simple searches
    and the link iteration / rewriting functions.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE: deliberately anchored at HtmlElement rather than HtmlMixin.
        # HtmlElement lists etree.ElementBase *before* this mixin, so the
        # underlying set() comes earlier in the MRO than HtmlMixin does.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        Returns None when the element has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the children move up.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children, in place.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and against the whole attribute value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator=translator)(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value (before the tree is touched).
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build (and thereby validate) the replacement function first so an
        # invalid ``handle_failures`` is rejected before any modification.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Use the caller's failure policy for the <base href> pass too.
            self.resolve_base_href(handle_failures=handle_failures)

        # <base href> has been dealt with above (or must be ignored), so
        # rewrite_links() must not resolve it again on its own.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All base tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # Link inside element text (e.g. a <style> body): splice it in.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
642
643
class _MethodFunc(object):
    """
    Exposes an element method as a module-level function.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  It hands back the method's own return value, or,
    for methods that work in place (and therefore return None), the
    resulting document converted back to the type of the input.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            # Note: the keyword arguments go to the parser here *and* to the
            # method below.
            doc = fromstring(doc, **kw)
        else:
            # An explicit copy= keyword overrides the per-function default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        bound_method = getattr(doc, self.name)
        result = bound_method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: return what we got in, in the caller's input type.
        return _transform_result(result_type, doc)
678
679
# Module-level function versions of the HtmlMixin methods.  Each accepts an
# element or an HTML string.  The read-only queries work on the element that
# is passed in (copy=False); the functions that modify links operate on a
# deep copy of an element argument by default (copy=True).
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
686
687
# Comment node class returned by HtmlElementClassLookup; adds the HtmlMixin API.
class HtmlComment(etree.CommentBase, HtmlMixin):
    pass
690
691
# Default element class for HTML trees: etree.ElementBase plus the HtmlMixin API.
class HtmlElement(etree.ElementBase, HtmlMixin):
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # ElementBase precedes HtmlMixin in the bases, so without these explicit
    # assignments the etree versions of both methods would be picked up.
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
696
697
# Processing-instruction node class returned by HtmlElementClassLookup.
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    pass
700
701
# Entity node class returned by HtmlElementClassLookup.
class HtmlEntity(etree.EntityBase, HtmlMixin):
    pass
704
705
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may be a mapping or an iterable of ``(tag name, mixin)`` pairs.
    The special key '*' denotes a Mixin class that is mixed into every
    Element class present in the ``classes`` mapping.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield bare keys and break
            # the (name, mixin) unpacking below, so go through items().
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            # tag name -> list of mixin classes to put in front of its class
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                # New class with the same name; mixins come first in the MRO.
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default lookup.

        Elements are looked up by lower-cased tag name and fall back to
        HtmlElement; comments, processing instructions and entities map to
        their Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
746
747
748 ################################################################################
749 # parsing
750 ################################################################################
751
# Matchers that tell whether input starts (after optional whitespace) with
# ``<html`` or ``<!doctype``, case-insensitively, i.e. looks like a complete
# document rather than a fragment.  One variant for text, one for bytes.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
756
757
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    The module's default HTML parser is used unless ``parser`` is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    With ``ensure_head_body`` set, a missing ``<head>`` is inserted as the
    first child and a missing ``<body>`` is appended.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    value = etree.fromstring(html, parser, **kw)
    if value is None:
        raise etree.ParserError("Document is empty")
    if ensure_head_body:
        if value.find('head') is None:
            value.insert(0, Element('head'))
        if value.find('body') is None:
            value.append(Element('body'))
    return value
770
771
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parse a run of HTML elements and return them as a list.

    If the markup starts with non-whitespace text, that text is the first
    list item (a string).  With ``no_leading_text`` true, such leading text
    raises ``etree.ParserError`` instead, so the list holds only elements.

    ``base_url`` sets the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap bare fragments in a minimal document so the parser has a <body>.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # plain concatenation: %-formatting bytes fails on early Py3
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
809
810
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element and return it.

    Without ``create_parent`` it is an error (``etree.ParserError``) if the
    markup holds no element, more than one element, or any non-whitespace
    text before or after the element.

    If ``create_parent`` is true (or is a tag name) a parent element is
    created around the parsed content and returned instead; a true
    non-string value means ``'div'``.  In that mode leading/trailing text
    and multiple elements are allowed.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                # first item is the leading text, not an element
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
858
859
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    Input that starts with ``<html`` or a doctype is returned as the parsed
    document.  Otherwise the parsed document is returned if it contains a
    ``<head>`` (or has no ``<body>``); a body holding a single element and no
    surrounding text yields that element; any other body is returned renamed
    to ``div`` (if it contains block-level tags) or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # fall back to XHTML-namespaced bodies
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # append the extra body's text after the content merged so far
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
926
927
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when parsing from a file-like object.  The module's default
    HTML parser is used unless ``parser`` is given.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
940
941
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it is a block-level tag."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
949
950
def _element_name(el):
    """Return a short display name for a parsed item, for error messages.

    Comments give ``'comment'``, strings give ``'string'`` and everything
    else gives its tag name with the XHTML namespace stripped.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958
959
960 ################################################################################
961 # form handling
962 ################################################################################
963
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.

        Assigning a mapping sets every field it names and resets all
        other named fields to None.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        fields = self.fields
        # Work on a private list copy: the bookkeeping below removes items,
        # which must neither depend on keys() returning a list nor modify
        # an object owned by the fields accessor.
        prev_keys = list(fields.keys())
        for key, new_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = new_value
        # Whatever was not mentioned in the new mapping gets cleared.
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    def _name(self):
        """Return an identifier for this form.

        Its ``name`` attribute if set, else ``'#'`` plus its ``id``, else
        the index of the form among the forms below <body>, as a string.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``
        (``urllib.parse.urlencode()`` on Python 3).

        Skipped: inputs without a name, disabled inputs, unchecked
        checkable inputs, inputs of type submit/image/reset/file and
        fields whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # one (name, option) pair per selected option
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset', 'file'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results
1034 assert tag == 'input', (
1035 "Unexpected tag: %r" % el)
1036 if el.checkable and not el.checked:
1037 continue
1038 if el.type in ('submit', 'image', 'reset', 'file'):
1039 continue
1040 value = el.value
1041 if value is not None:
1042 results.append((name, el.value))
1043 return results
1044
1045 @property
1046 def action(self):
1047 """
1048 Get/set the form's ``action`` attribute.
1049 """
1050 base_url = self.base_url
1051 action = self.get('action')
1052 if base_url and action is not None:
1053 return urljoin(base_url, action)
1054 else:
1055 return action
1056
1057 @action.setter
1058 def action(self, value):
1059 self.set('action', value)
1060
1061 @action.deleter
1062 def action(self):
1063 attrib = self.attrib
1064 if 'action' in attrib:
1065 del attrib['action']
1066
1067 @property
1068 def method(self):
1069 """
1070 Get/set the form's method. Always returns a capitalized
1071 string, and defaults to ``'GET'``
1072 """
1073 return self.get('method', 'GET').upper()
1074
1075 @method.setter
1076 def method(self, value):
1077 self.set('method', value.upper())
1078
1079
1080 HtmlElementClassLookup._default_element_classes['form'] = FormElement
1081
1082
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever ``open_http``
    returns; with the default opener that is a file-like object, as
    from ``urllib.urlopen()``, which also has a ``.geturl()`` function
    showing the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values read from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    payload = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        payload.extend(extra_values)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's base URL when the form has no action.
    target = form.action if form.action else form.base_url
    return opener(form.method, target, payload)
1119
1120
def open_http_urllib(method, url, values):
    """
    Default opener used by ``submit_form()``: send ``values`` to ``url``
    with urllib and return the result of ``urlopen()``.

    For ``'GET'`` the URL-encoded values are appended to the URL as a
    query string; for any other method they are sent as the request
    body.  Raises ValueError if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        joiner = '&' if '?' in url else '?'
        return urlopen(url + joiner + encoded, None)
    if not isinstance(encoded, bytes):
        encoded = encoded.encode('ASCII')
    return urlopen(url, encoded)
1142
1143
class FieldsDict(MutableMapping):
    """
    Mapping view over the fields of a form.

    Keys are the field names reported by the wrapped input accessor;
    reading or writing an item reads or writes the ``.value`` of the
    object the accessor returns for that name.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        # ``inputs`` is the accessor (e.g. an InputGetter) all lookups
        # are delegated to.
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the field names rather than calling len() on the
        # accessor: InputGetter does not implement __len__, and counting
        # names keeps len() in agreement with __iter__().
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
1168
1169
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    When several checkboxes share that name they come back as a list
    (a `CheckboxGroup`, which also allows value setting); radio inputs
    sharing a name are returned as a `RadioGroup` in the same way.

    Iterating yields every input element.  That is not the same as
    looking up every name, because checkboxes and radio elements are
    yielded individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        label = self.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, label)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The kind of group is decided by the first match only.
        input_type = matches[0].get('type')
        if len(matches) > 1 and input_type in ('radio', 'checkbox'):
            if input_type == 'radio':
                group = RadioGroup(matches)
            else:
                group = CheckboxGroup(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        """Return a list of the distinct names of the inputs, without None."""
        names = set(field.name for field in self)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
1235
1236
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element; deleting it
        removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only some subclasses have a ``type``; mention it when it is set.
        kind = getattr(self, 'type', None)
        suffix = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
1266
1267
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and written through ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)

        Child elements, if any, are serialised and appended after the
        element's own text.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        method = 'xml' if in_xhtml else 'html'
        parts = [self.text or '']
        # it's rare that there are any children at all
        for child in self:
            parts.append(etree.tostring(
                child, method=method, encoding='unicode'))
        return ''.join(parts)

    @value.setter
    def value(self, value):
        # Drop any child elements, then replace the text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1301
1302
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  Either way ``.value_options`` lists the
    possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select the last option carrying ``selected`` wins;
        without one, the first option that is not ``disabled`` is used,
        and None is returned when there is no such option.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (opt for opt in reversed(options)
             if opt.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (opt for opt in options if opt.get('disabled') is None),
                None)
        if chosen is None:
            return None
        explicit = chosen.get('value')
        if explicit is not None:
            return explicit
        # No value attribute: fall back to the option's stripped text.
        return (chosen.text or '').strip()

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the requested option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        text of an option that has no ``value`` attribute).
        """
        collected = []
        for option in _options_xpath(self):
            explicit = option.get('value')
            if explicit is None:
                explicit = (option.text or '').strip()
            collected.append(explicit)
        return collected

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1404
1405
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        # The value of an <option> is its ``value`` attribute, or its
        # stripped text when that attribute is absent.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        """Yield the value of every option that has ``selected`` set."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not currently selected, or
        if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1465
1466
class RadioGroup(list):
    """
    A list of several ``<input type=radio>`` elements that share one
    name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks inputs, and ``.value_options`` gives the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the value of the first checked radio, or None.
        Setting None unchecks everything; setting a value no radio has
        raises ValueError.
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # Uncheck everything first so that at most one radio ends up checked.
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1518
1519
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something that is not iterable raises ValueError and
        leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the current selection as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1562
1563
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list up front so the caller iterates over a snapshot.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.  Raises KeyError if
        it is already unchecked or if no checkbox has the value.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1605
1606
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``).

    The value can be read and written through ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox or radio and it has no value, this
        defaults to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs a truthy value checks the input; the
        # attribute itself is only updated when a string was given.
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type both reading and writing raise AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702
1703
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if the
        label has no ``for`` attribute or the element can't be found.

        Setting requires the other element to have an ``id`` attribute
        (TypeError otherwise); deleting removes the ``for`` attribute.
        """
        id = self.get('for')
        if not id:
            return None
        # Pass an explicit default so that a dangling reference gives
        # None, as documented, instead of raising KeyError.
        return self.body.get_element_by_id(id, None)

    @for_element.setter
    def for_element(self, other):
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in ``for``; the label's own ``id`` must
        # not be touched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1738
1739
1740 ############################################################
1741 ## Serialization
1742 ############################################################
1743
def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be a tree (anything with ``getroot()``) or an element.
    The tree is modified in place; tags that already carry a namespace
    are left alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    namespace = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        name = node.tag
        if name[0] != '{':
            node.tag = namespace + name
1757
1758
def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be a tree (anything with ``getroot()``) or an
    element.  The tree is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    namespace = "{%s}" % XHTML_NAMESPACE
    skip = len(namespace)
    for node in root.iter(namespace + "*"):
        node.tag = node.tag[skip:]
1771
1772
# Substitution helpers for ``<meta http-equiv="Content-Type" ...>`` tags:
# the bound ``sub`` methods of one text pattern and one bytes pattern.
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1779
1780
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: with the ``html`` method, any
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialised output unless ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type meta tag, using the helper that matches
    # the type of the serialised result (text or bytes).
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856
1857
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This
    is mainly meant for debugging.

    ``encoding`` defaults to the document's own encoding, falling back
    to UTF-8.  The ``file://`` URL of the temporary file is printed.
    """
    import os
    import webbrowser
    import tempfile
    if isinstance(doc, etree._ElementTree):
        tree = doc
    else:
        tree = etree.ElementTree(doc)
    handle, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        tree.write(
            out, method="html",
            encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1879
1880
1881 ################################################################################
1882 # configure Element class lookup
1883 ################################################################################
1884
class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed through unchanged to
    ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lxml.html class lookup on this parser instance.
        self.set_element_class_lookup(HtmlElementClassLookup())
1892
1893
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed through unchanged to
    ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lxml.html class lookup on this parser instance.
        self.set_element_class_lookup(HtmlElementClassLookup())
1914
1915
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    forwarded to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)
1923
1924
# Shared default parser instances; ``html_parser`` is the fallback used
# by ``parse()`` and ``Element()`` in this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()