Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/lxml/html/__init__.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 # Copyright (c) 2004 Ian Bicking. All rights reserved. | |
2 # | |
3 # Redistribution and use in source and binary forms, with or without | |
4 # modification, are permitted provided that the following conditions are | |
5 # met: | |
6 # | |
7 # 1. Redistributions of source code must retain the above copyright | |
8 # notice, this list of conditions and the following disclaimer. | |
9 # | |
10 # 2. Redistributions in binary form must reproduce the above copyright | |
11 # notice, this list of conditions and the following disclaimer in | |
12 # the documentation and/or other materials provided with the | |
13 # distribution. | |
14 # | |
15 # 3. Neither the name of Ian Bicking nor the names of its contributors may | |
16 # be used to endorse or promote products derived from this software | |
17 # without specific prior written permission. | |
18 # | |
19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR | |
23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
30 | |
31 """The ``lxml.html`` tool set for HTML handling. | |
32 """ | |
33 | |
34 from __future__ import absolute_import | |
35 | |
# Names exported by ``from lxml.html import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41 | |
42 | |
43 import copy | |
44 import sys | |
45 import re | |
46 from functools import partial | |
47 | |
48 try: | |
49 from collections.abc import MutableMapping, MutableSet | |
50 except ImportError: | |
51 from collections import MutableMapping, MutableSet | |
52 | |
53 from .. import etree | |
54 from . import defs | |
55 from ._setmixin import SetMixin | |
56 | |
57 try: | |
58 from urlparse import urljoin | |
59 except ImportError: | |
60 # Python 3 | |
61 from urllib.parse import urljoin | |
62 | |
# Python 2/3 compatibility aliases: keep the builtin when the interpreter
# still provides it, otherwise fall back to the Python 3 equivalent.
try:
    unicode
except NameError:
    unicode = str  # Python 3

try:
    basestring
except NameError:
    basestring = (str, bytes)  # Python 3
73 | |
74 | |
def __fix_docstring(s):
    """Strip a string-literal prefix from the start of each line of *s*.

    On Python 3 a leading ``u'`` (after optional whitespace) is rewritten to
    a plain ``'``; on Python 2 the same is done for ``b'``.  Falsy input
    (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    prefix = 'u' if sys.version_info[0] >= 3 else 'b'
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)
83 | |
84 | |
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Where an element name is matched, both
# the plain HTML name and its XHTML-namespaced form ('x:' prefix) are tried.
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x': XHTML_NAMESPACE})

# Regular expressions used to locate links inside CSS text and attributes.
# url(...) with a double-quoted, single-quoted or unquoted argument:
_iter_css_urls = re.compile(
    r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# @import "..." (double-quoted form only):
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# One space-separated entry of an <object archive="..."> attribute:
_archive_re = re.compile(r'[^ ]+')
# The URL part of a <meta http-equiv="refresh" content="N; url=..."> value:
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
104 | |
105 | |
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s*.

    Returns ``(text, position)``.  If *s* starts and ends with the same
    quote character (``"`` or ``'``), the quotes are removed and *pos* is
    advanced by one to point past the opening quote; otherwise both values
    are returned unchanged.
    """
    first, last = s[:1], s[-1:]
    if first == last and first in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
111 | |
112 | |
def _transform_result(typ, result):
    """Convert *result* back into the kind of object the caller passed in.

    *typ* is the type of the original input: byte strings get the result
    serialised as UTF-8 bytes, unicode strings get it serialised as unicode
    text, and for any other type *result* is returned as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
122 | |
123 | |
def _nons(tag):
    """Return *tag* without the XHTML namespace.

    A string tag of the form ``{http://www.w3.org/1999/xhtml}name`` is
    reduced to ``name``.  Any other value, including non-string tags, is
    returned unchanged.
    """
    if isinstance(tag, basestring):
        # Slice rather than index, so that an empty tag string is simply
        # passed through instead of raising IndexError.
        if tag[:1] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
            return tag.split('}')[-1]
    return tag
129 | |
130 | |
class Classes(MutableSet):
    """Set-like view of the whitespace-separated names in a 'class' attribute.

    The object wraps an attribute mapping and reads/writes its 'class' entry
    on every operation, so changes are immediately visible on the element.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        self._attributes = attributes
        # Bound getter that yields the current 'class' value ('' if unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.  The 'class' attribute is
        deleted altogether once no names are left.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        remaining = ' '.join(
            name for name in self._get_class_value().split() if name != value)
        if remaining:
            self._attributes['class'] = remaining
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        value = self._get_class_value()
        # Cheap substring test first; only split when it can possibly match.
        if name not in value:
            return False
        return name in value.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only write the attribute back if something was actually added.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
237 | |
238 | |
class HtmlMixin(object):
    """HTML-specific helper methods that are mixed into the node classes."""

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE: names HtmlElement (not HtmlMixin) on purpose: HtmlElement
        # re-binds this function as its own ``set``, so the lookup has to
        # continue with the class that follows HtmlElement in the MRO.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the label's ``for`` attribute, so
        an element without an ``id`` never has a label (None is returned).
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its own children, in place.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build (and thereby validate) the replacement function before the
        # document is touched, so that an invalid ``handle_failures`` value
        # cannot leave the tree partially rewritten.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Apply the same failure policy while resolving <base href>.
            self.resolve_base_href(handle_failures)

        # <base href> has either been resolved just above or must be
        # ignored, so rewrite_links() is told not to resolve it (again).
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base href> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                elif attrib in el.attrib:
                    # One attribute can hold several links (e.g. 'style');
                    # an earlier link may already have removed it.
                    del el.attrib[attrib]
                continue

            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if cur is None:
                    # The attribute was discarded while handling an earlier
                    # link in the same value; there is nothing to rewrite.
                    continue
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
642 | |
643 | |
class _MethodFunc(object):
    """
    Module-level function wrapper around a method of an element.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  A string is parsed first; an element is optionally
    deep-copied (see ``copy``).  The call returns whatever the method
    returns, or -- when the method works in place and returns None -- the
    processed document, converted back to the type of the input.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Present the wrapped method's documentation as our own.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # An explicit copy=... keyword overrides the per-function default.
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
678 | |
679 | |
# Function forms of the HtmlMixin methods.  The functions that modify the
# document work on a copy of an element passed in (copy=True); the purely
# querying ones operate on the element directly.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
686 | |
687 | |
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class with the HtmlMixin methods mixed in."""
690 | |
691 | |
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class with the HtmlMixin methods mixed in."""

    # etree.ElementBase precedes HtmlMixin in the MRO, so its cssselect()
    # and set() would win; bind the mixin versions explicitly instead.
    # (FIXME: change base order?)
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
696 | |
697 | |
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction node class with the HtmlMixin methods mixed in."""
700 | |
701 | |
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class with the HtmlMixin methods mixed in."""
704 | |
705 | |
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins
    for the same tag name.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a dict directly yields only its keys, which cannot
            # be unpacked into (name, mixin).  Go through items() for
            # mappings, and keep accepting a plain iterable of pairs.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so their methods take precedence over
                # those of the element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node of the given type and name.

        Element names are looked up case-insensitively and default to
        HtmlElement.  None is returned for unknown node types, which
        leaves the decision to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
746 | |
747 | |
748 ################################################################################ | |
749 # parsing | |
750 ################################################################################ | |
751 | |
# Matchers that tell whether a string starts (after optional whitespace)
# with ``<html`` or ``<!doctype``, i.e. looks like a complete document
# rather than a fragment.  One variant each for text and for byte strings.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
756 | |
757 | |
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse *html* as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; additional keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, an empty <head> and/or <body> element is
    added to the root when the parsed document lacks one.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
770 | |
771 | |
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a whole document is wrapped in an
    # artificial <html><body> so that it can be parsed as one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = ('<html><body>'.encode('ascii') + html +
                    '</body></html>'.encode('ascii'))
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    # Whitespace-only text in front of the first element does not count.
    has_leading_text = bool(body.text and body.text.strip())
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
809 | |
810 | |
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent element will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        # Any true non-string value means "use the default tag".
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % result.tail)
    # Drop the remaining (whitespace-only) tail.
    result.tail = None
    return result
858 | |
859 | |
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The input is always parsed with ``document_fromstring()``.  What is
    returned depends on what the input looked like:

    * input that looks like a full HTML document, or that produced a
      ``<head>``: the whole parsed document;
    * no ``<body>`` found: the whole parsed document;
    * a body holding exactly one element and no surrounding
      non-whitespace text: that element;
    * anything else: the ``<body>`` element itself, renamed to ``div``
      (if it contains a block level tag) or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Separate sniffing helpers exist for bytes and for text input.
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, let's parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # Fall back to the namespaced (XHTML) tag name.
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # Leading text of the extra body is appended after the
                    # last child of the first body, or to its text if the
                    # first body has no children yet.
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so let's keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
926 | |
927 | |
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Read HTML from a filename, URL, or file-like object.

    The result of ``etree.parse()`` is returned unchanged, i.e. a tree
    rather than an element; call ``.getroot()`` on it to reach the
    document root.

    ``parser`` defaults to the module's ``html_parser``.  ``base_url``
    overrides the base URL, which is mostly useful when the source is a
    file-like object.  Further keyword arguments are passed through to
    ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
940 | |
941 | |
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.iter(etree.Element)`` has a
    (namespace-stripped) tag name listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
949 | |
950 | |
def _element_name(el):
    """
    Return a short label for a parse result, for use in error messages:
    ``'comment'`` for comments, ``'string'`` for text, and the
    namespace-stripped tag name for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958 | |
959 | |
960 ################################################################################ | |
961 # form handling | |
962 ################################################################################ | |
963 | |
class FormElement(HtmlElement):
    """
    Element class for ``<form>`` tags.
    """

    @property
    def inputs(self):
        """
        An `InputGetter` wrapping this form, giving access to all of its
        input elements.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        A `FieldsDict` over this form's inputs.  Assigning to a key of
        that object changes the value of the corresponding input.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        untouched = current.keys()
        for name, new_value in value.items():
            if name in untouched:
                untouched.remove(name)
            current[name] = new_value
        # Every named field that the new mapping did not mention is reset.
        for name in untouched:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if name is not None:
                current[name] = None

    def _name(self):
        # Used by the repr() of the accessor objects: prefer the name,
        # then '#id', then the position of this form among the forms
        # found below self.body.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Collect the form's field values as a list of ``(name, value)``
        tuples, in a shape suitable for ``urllib.urlencode()``.

        Unnamed and disabled inputs are skipped, as are unchecked
        checkboxes/radios, inputs of type submit/image/reset/file, and
        inputs whose value is None.  A multi-select contributes one tuple
        per selected value.
        """
        pairs = []
        for field in self.inputs:
            name = field.name
            if not name or 'disabled' in field.attrib:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((name, field.value))
                continue
            if tag == 'select':
                selected = field.value
                if field.multiple:
                    pairs.extend((name, item) for item in selected)
                elif selected is not None:
                    pairs.append((name, selected))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset', 'file'):
                continue
            current = field.value
            if current is not None:
                pairs.append((name, current))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an action, the getter returns
        the action joined onto the base URL; otherwise the raw attribute
        value (possibly None).
        """
        base = self.base_url
        target = self.get('action')
        if not base or target is None:
            return target
        return urljoin(base, target)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  The getter always returns an
        upper-cased string and falls back to ``'GET'``; the setter stores
        the upper-cased value.
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
1081 | |
1082 | |
def submit_form(form, extra_values=None, open_http=None):
    """
    Submit ``form`` and return whatever the HTTP opener returns.

    The values sent are ``form.form_values()`` followed by
    ``extra_values`` (a mapping, or a sequence of ``(name, value)``
    tuples).  The target is ``form.action`` or, if that is empty,
    ``form.base_url``.

    Typical use::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    By default ``open_http_urllib`` performs the request.  To use another
    HTTP client, pass a callable as ``open_http``; it is called as::

        open_http(method, URL, values)

    where ``method`` is ``form.method``, ``URL`` is the target URL and
    ``values`` is the list of ``(name, value)`` tuples described above.
    """
    payload = form.form_values()
    if extra_values:
        # Mappings are flattened to their (name, value) pairs.
        extras = extra_values.items() if hasattr(extra_values, 'items') else extra_values
        payload.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    target = form.action or form.base_url
    return opener(form.method, target, payload)
1119 | |
1120 | |
def open_http_urllib(method, url, values):
    """
    Default HTTP opener for ``submit_form()``, built on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the result of ``urlopen()``.  Raises ``ValueError`` if ``url``
    is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        joiner = '&' if '?' in url else '?'
        return urlopen(url + joiner + encoded, None)
    if not isinstance(encoded, bytes):
        encoded = encoded.encode('ASCII')
    return urlopen(url, encoded)
1142 | |
1143 | |
class FieldsDict(MutableMapping):
    """
    Mapping view of a form's fields: keys are input names, values are the
    ``.value`` of whatever ``inputs[name]`` returns.

    ``inputs`` is the accessor object the lookups are delegated to; it
    must support ``inputs[name]``, ``name in inputs`` and ``keys()``.
    Assigning to a key sets the ``.value`` of the matching input.  Keys
    cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys that __iter__ yields.  Calling len() on the
        # accessor itself is wrong here: it is not guaranteed to define
        # __len__, and a mapping's length must match its key count.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
1168 | |
1169 | |
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    ``len()`` counts the same elements that iteration yields.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input named ``name``.

        Several radio inputs sharing the name come back as a
        `RadioGroup`, several checkboxes as a `CheckboxGroup`; in every
        other case the first matching element is returned.  Raises
        ``KeyError`` if no input has that name.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how duplicates are grouped.
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        if input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return a list of the distinct names of all inputs; inputs without
        a name are left out.
        """
        names = set()
        for el in self:
            names.add(el.name)
        if None in names:
            names.remove(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Same element set as __iter__, so len() and iteration agree.
        return len(self._all_xpath(self.form))
1235 | |
1236 | |
class InputMixin(object):
    """
    Behaviour shared by the input-like element classes (input, select
    and textarea): access to the ``name`` attribute and a descriptive
    ``repr()``.
    """
    @property
    def name(self):
        """
        Get/set/delete the element's ``name`` attribute.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting an absent name is a no-op.
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # Only some subclasses define ``type``; show it when it is set.
        kind = getattr(self, 'type', None)
        suffix = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
1266 | |
1267 | |
class TextareaElement(InputMixin, HtmlElement):
    """
    Element class for ``<textarea>``.  ``.name`` gives the field name and
    ``.value`` reads or replaces the content.
    """
    @property
    def value(self):
        """
        Get/set the value, i.e. the content of this element.

        The getter returns the element's text followed by the serialised
        form of any child elements (serialised as XML when the tag is in
        the XHTML namespace, as HTML otherwise).  The setter removes all
        children and stores the new text.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            how = 'xml'
        else:
            how = 'html'
        pieces = [self.text or '']
        # Children are rare here; when present they are serialised in place.
        pieces.extend(
            etree.tostring(child, method=how, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1301 | |
1302 | |
class SelectElement(InputMixin, HtmlElement):
    """
    Element class for ``<select>``.  ``.name`` gives the field name.

    For a plain select ``.value`` is the value of the selected option;
    for ``<select multiple>`` it is a set-like `MultipleSelectOptions`
    object.  ``.value_options`` lists the possible values either way, and
    the boolean ``.multiple`` tells the two kinds apart.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like object representing all the
        selected options.  Otherwise the getter picks the last option
        carrying a ``selected`` attribute, falling back to the first
        option that is not disabled, and returns None if there is no such
        option.  An option without a ``value`` attribute is represented
        by its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (opt for opt in reversed(options)
             if opt.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (opt for opt in options if opt.get('disabled') is None),
                None)
        if chosen is None:
            return None
        result = chosen.get('value')
        if result is None:
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Deselect everything, then select the one requested option.
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        List of every value this select can take: each option's ``value``
        attribute, or its stripped text when the attribute is missing.
        """
        possible = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            possible.append(candidate)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: whether the element has a ``multiple``
        attribute.  Setting it adds or removes that attribute.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1404 | |
1405 | |
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.  An option without a ``value``
    attribute is identified by its stripped text.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yields the value of every option that carries ``selected``.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if that option is not currently selected,
        or if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Same wording as add(); the message used to read "not option".
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1465 | |
1466 | |
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides behaving like a list, the group has a ``.value`` property to
    check/uncheck its inputs and a ``.value_options`` property listing
    the values that can be chosen.
    """
    @property
    def value(self):
        """
        Get/set the value.  The getter returns the ``value`` attribute of
        the first checked radio, or None if none is checked.  The setter
        checks the radio with the given value and unchecks all others;
        setting None unchecks everything.
        """
        checked = [radio for radio in self if 'checked' in radio.attrib]
        if not checked:
            return None
        return checked[0].get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (radio for radio in self if radio.get('value') == value),
                None)
            # Fail before touching any state if the value is unknown.
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        The ``value`` attribute of every radio in the group, as a list.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1518 | |
1519 | |
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection.
        Assigning something that is not iterable raises ``ValueError``
        and leaves the checkboxes as they were.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does not
        # uncheck every box as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1562 | |
1563 | |
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes within a group
    of same-named checkboxes.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values first, then hand out an iterator.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value equals ``value``; raises
        ``KeyError`` if there is none.
        """
        match = next(
            (box for box in self.group if box.get('value') == value),
            None)
        if match is None:
            raise KeyError("No checkbox with value %r" % value)
        match.set('checked', '')

    def remove(self, value):
        """
        Uncheck the first checkbox whose value equals ``value``; raises
        ``KeyError`` if it is already unchecked or does not exist.
        """
        match = next(
            (box for box in self.group if box.get('value') == value),
            None)
        if match is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in match.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del match.attrib['checked']

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1605 | |
1606 | |
class InputElement(InputMixin, HtmlElement):
    """
    Element class for ``<input>``.

    ``.type`` gives the lower-cased input type (``'text'`` when the
    attribute is missing) and ``.value`` reads or writes the value.

    For checkboxes and radios ``.checkable`` is True (False for every
    other type) and the boolean ``.checked`` reflects the ``checked``
    attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``; an
        unchecked one reports None.  Setting a false value on a checkable
        input unchecks it; setting a true value checks it and, if the
        value is a string, also stores it in the ``value`` attribute.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Non-string truthy values (e.g. True) only toggle the check mark.
        if isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        The lower-cased ``type`` attribute, defaulting to ``'text'``.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: True for checkbox and radio inputs.
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only available on checkable input types; other types raise
        ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        if 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702 | |
1703 | |
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        The getter returns None when the label has no ``for`` attribute;
        otherwise it looks the id up with
        ``self.body.get_element_by_id()``.  The setter stores the ``id``
        of the given element in ``for`` and raises ``TypeError`` if that
        element has no id.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in ``for``; the label's own ``id`` is unrelated
        # and must be left alone.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1738 | |
1739 | |
1740 ############################################################ | |
1741 ## Serialization | |
1742 ############################################################ | |
1743 | |
def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    Accepts either an element or an object with a ``getroot()`` method.
    Tags that already carry a namespace are left untouched.
    """
    try:
        html = html.getroot()
    except AttributeError:
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter(etree.Element):
        plain = node.tag
        if plain[0] == '{':
            # Already namespaced.
            continue
        node.tag = xhtml_prefix + plain
1757 | |
1758 | |
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        pass
    namespace = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace)
    # The "{ns}*" filter restricts iteration to XHTML-namespaced elements.
    for node in xhtml.iter(namespace + "*"):
        node.tag = node.tag[cut:]
1771 | |
1772 | |
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Each name below is the bound ``sub`` method of a compiled pattern (one
# for text, one for bytes), so it is called as ``name(replacement, string)``
# and replaces every ``<meta http-equiv="Content-Type" ...>`` tag it finds.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1779 | |
1780 | |
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Serialise the document (or element) to an HTML string.

    Note: with ``method='html'`` and a false
    ``include_meta_content_type`` (the default), any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    serialised output.  In all other cases the output of
    ``etree.tostring()`` is returned as is.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Pick the substitution helper that matches the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856 | |
1857 | |
def open_in_browser(doc, encoding=None):
    """
    Serialise the HTML document to a temporary ``.html`` file and open
    that file in a web browser.

    The document is written with ``encoding`` if given, otherwise with
    the encoding recorded in the document's docinfo, falling back to
    UTF-8.  The temporary file is not deleted afterwards.  This is
    mainly meant for debugging.
    """
    import os
    import tempfile
    import webbrowser

    # Bare elements are wrapped so that ``write()`` and ``docinfo`` exist.
    tree = doc if isinstance(doc, etree._ElementTree) else etree.ElementTree(doc)
    fd, path = tempfile.mkstemp(suffix='.html')
    # The file itself is intentionally left on disk, but its handle is
    # always closed, even if serialisation fails.
    with os.fdopen(fd, 'wb') as out:
        out_encoding = encoding or tree.docinfo.encoding or "UTF-8"
        tree.write(out, method="html", encoding=out_encoding)
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1879 | |
1880 | |
1881 ################################################################################ | |
1882 # configure Element class lookup | |
1883 ################################################################################ | |
1884 | |
class HTMLParser(etree.HTMLParser):
    """HTML parser set up to produce lxml.html Element objects.

    Keyword arguments are forwarded unchanged to ``etree.HTMLParser``;
    afterwards a fresh ``HtmlElementClassLookup`` is installed as the
    parser's element class lookup.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1892 | |
1893 | |
class XHTMLParser(etree.XMLParser):
    """XML parser set up to produce lxml.html Element objects.

    Keyword arguments are forwarded unchanged to ``etree.XMLParser``;
    afterwards a fresh ``HtmlElementClassLookup`` is installed as the
    parser's element class lookup.

    Be aware that this parser is not really XHTML aware unless it loads
    a DTD that declares the HTML entities.  For that, the XHTML DTDs
    must be installed in your catalogs and the parser created like
    this::

        >>> parser = XHTMLParser(load_dtd=True)

    To validate the document as well, use this instead::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1914 | |
1915 | |
def Element(*args, **kw):
    """Build and return a new HTML Element.

    All positional and keyword arguments are handed straight to
    ``html_parser.makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level parser instances; ``Element()`` creates its elements
# through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()