Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/lxml/html/__init__.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d30785e31577 | 1:56ad4e20f292 |
|---|---|
| 1 # Copyright (c) 2004 Ian Bicking. All rights reserved. | |
| 2 # | |
| 3 # Redistribution and use in source and binary forms, with or without | |
| 4 # modification, are permitted provided that the following conditions are | |
| 5 # met: | |
| 6 # | |
| 7 # 1. Redistributions of source code must retain the above copyright | |
| 8 # notice, this list of conditions and the following disclaimer. | |
| 9 # | |
| 10 # 2. Redistributions in binary form must reproduce the above copyright | |
| 11 # notice, this list of conditions and the following disclaimer in | |
| 12 # the documentation and/or other materials provided with the | |
| 13 # distribution. | |
| 14 # | |
| 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may | |
| 16 # be used to endorse or promote products derived from this software | |
| 17 # without specific prior written permission. | |
| 18 # | |
| 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR | |
| 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
| 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 30 | |
| 31 """The ``lxml.html`` tool set for HTML handling. | |
| 32 """ | |
| 33 | |
| 34 from __future__ import absolute_import | |
| 35 | |
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
| 41 | |
| 42 | |
| 43 import copy | |
| 44 import sys | |
| 45 import re | |
| 46 from functools import partial | |
| 47 | |
| 48 try: | |
| 49 from collections.abc import MutableMapping, MutableSet | |
| 50 except ImportError: | |
| 51 from collections import MutableMapping, MutableSet | |
| 52 | |
| 53 from .. import etree | |
| 54 from . import defs | |
| 55 from ._setmixin import SetMixin | |
| 56 | |
| 57 try: | |
| 58 from urlparse import urljoin | |
| 59 except ImportError: | |
| 60 # Python 3 | |
| 61 from urllib.parse import urljoin | |
| 62 | |
| 63 try: | |
| 64 unicode | |
| 65 except NameError: | |
| 66 # Python 3 | |
| 67 unicode = str | |
| 68 try: | |
| 69 basestring | |
| 70 except NameError: | |
| 71 # Python 3 | |
| 72 basestring = (str, bytes) | |
| 73 | |
| 74 | |
def __fix_docstring(s):
    """Strip the string-prefix letter from quoted lines of a docstring.

    On Python 3 every line that starts (after optional whitespace) with
    ``u'`` has the ``u`` removed; on Python 2 the same is done for ``b'``.
    A falsy ``s`` (``None`` or the empty string) is returned unchanged.
    """
    if not s:
        return s
    # Only the prefix letter differs between the two interpreter lines.
    prefix = 'u' if sys.version_info[0] >= 3 else 'b'
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)
| 83 | |
| 84 | |
# Namespace URI used to recognise XHTML-namespaced tags (bound to the
# prefix ``x`` in the XPath expressions below).
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and its XHTML-namespaced counterpart.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated ``class`` attribute contains
# the token passed as the ``$class_name`` XPath variable.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Matches elements whose ``id`` attribute equals the ``$id`` XPath variable.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Evaluates the XPath ``string()`` function on the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over CSS ``url(...)`` references; group 1 is the argument, which
# may still be wrapped in single or double quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over CSS ``@import "..."`` statements; group 1 is the quoted text.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# Finds <label> elements anywhere in the document whose ``for`` attribute
# equals the ``$id`` XPath variable.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters (used on the ``archive``
# attribute of <object>, which is split on spaces).
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part (named group ``url``) from the ``content`` value of
# a <meta http-equiv="refresh"> tag; the ``url=`` prefix is optional.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
| 104 | |
| 105 | |
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from a matched URL string.

    ``s`` is the matched text and ``pos`` its start offset in the source
    string.  If ``s`` is wrapped in a pair of single or double quotes, the
    unquoted text is returned together with ``pos + 1`` (the offset of the
    first character after the opening quote).  Otherwise ``(s, pos)`` is
    returned unchanged.

    A string consisting of a single quote character is not a quoted pair:
    the same character would be counted as both the opening and the closing
    quote, so it is returned unchanged.
    """
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
| 111 | |
| 112 | |
def _transform_result(typ, result):
    """Serialise ``result`` back into the kind of object the caller passed in.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass the
    result is serialised with ``tostring(..., encoding='utf-8')``; for a
    ``unicode`` subclass with ``tostring(..., encoding='unicode')``.  For
    any other type ``result`` is returned as is.
    """
    # Order matters: ``bytes`` is tested before ``unicode``.
    for string_type, encoding in ((bytes, 'utf-8'), (unicode, 'unicode')):
        if issubclass(typ, string_type):
            return tostring(result, encoding=encoding)
    return result
| 122 | |
| 123 | |
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    Only string tags that start with ``{`` followed by ``XHTML_NAMESPACE``
    are shortened (to the text after the last ``}``).  Every other value,
    including non-string tags, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    in_xhtml = (tag[0] == '{'
                and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE)
    return tag.split('}')[-1] if in_xhtml else tag
| 129 | |
| 130 | |
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is the mutable attribute mapping that is read and
        # written on every operation; no class list is cached here.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError if ``value`` is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError if ``value`` is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # Removing the last class drops the attribute entirely.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError if ``value`` is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; the split() confirms a whole-token match.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated like in ``add()``: a ValueError is raised
        for a name that is empty or contains whitespace.  The attribute is
        only written after all names have been checked, so a failing call
        leaves it untouched.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            if not value or re.search(r'\s', value):
                raise ValueError("Invalid class name: %r" % value)
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError if ``value`` is empty or contains whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after removing the last class, so the
            # attribute is known to exist.
            del self._attributes['class']
        return enabled
| 237 | |
| 238 | |
class HtmlMixin(object):
    """Mixin with the HTML-specific convenience API of ``lxml.html`` elements.

    It provides attribute helpers (``set``, ``classes``, ``label``),
    document navigation (``base_url``, ``forms``, ``body``, ``head``),
    tree surgery (``drop_tree``, ``drop_tag``), search helpers and the
    link functions (``iterlinks``, ``rewrite_links``, ...).
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE: deliberately starts the lookup after ``HtmlElement`` (defined
        # later in this module), not after this mixin, so that the base
        # element class's ``set()`` is reached.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body> element.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head> element.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns None if this element has no id or if no
        <label for="..."> refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children in the parent's child list.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and against the whole attribute
        value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value (before the document is modified).
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build (and thereby validate) the replacement function first, so
        # that an invalid ``handle_failures`` cannot leave the document
        # half-modified.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            self.resolve_base_href(handle_failures=handle_failures)

        # <base href> has been handled (or was explicitly declined) above;
        # rewrite_links() must not resolve it again on its own.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base href> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # The link lives in the element text (e.g. a <style> body).
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
| 642 | |
| 643 | |
class _MethodFunc(object):
    """
    Callable that exposes an element method as a module-level function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the method returns; if the method
    returns None (i.e. it works in-place), the processed document is
    returned instead, converted back to the input type.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Reuse the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit ``copy=`` keyword overrides the instance default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
| 678 | |
| 679 | |
# Module-level function versions of the HtmlMixin link/search methods.
# ``copy=True`` makes the wrapper deep-copy an element argument before
# calling the method, unless the caller passes an explicit ``copy=`` keyword.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
| 686 | |
| 687 | |
class HtmlComment(etree.CommentBase, HtmlMixin):
    # Comment node class that combines etree.CommentBase with HtmlMixin.
    pass
| 690 | |
| 691 | |
class HtmlElement(etree.ElementBase, HtmlMixin):
    # Element class that combines etree.ElementBase with HtmlMixin.
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # (etree.ElementBase comes first in the bases, so without these explicit
    # assignments its versions would shadow the mixin's.)
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
| 696 | |
| 697 | |
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    # Processing-instruction node class that combines etree.PIBase with HtmlMixin.
    pass
| 700 | |
| 701 | |
class HtmlEntity(etree.EntityBase, HtmlMixin):
    # Entity node class that combines etree.EntityBase with HtmlMixin.
    pass
| 704 | |
| 705 | |
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, Mixin class)`` pairs.
    """
    # Tag name -> Element class mapping that is copied when the caller
    # passes no ``classes`` of its own.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented form is a mapping, but pairs were the only form
            # that actually worked before; support both.
            mixin_items = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by lower-cased tag name and fall back to
        ``HtmlElement``; comments, processing instructions and entities map
        to their ``Html*`` classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
| 746 | |
| 747 | |
| 748 ################################################################################ | |
| 749 # parsing | |
| 750 ################################################################################ | |
| 751 | |
# Case-insensitive matchers that test whether the input starts (after
# optional whitespace) with ``<html`` or ``<!doctype``.  One variant is
# compiled from a unicode pattern, the other from a bytes pattern, so that
# each can be applied to input of the matching string type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
| 756 | |
| 757 | |
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module-level ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` is inserted as first
    child and a missing ``body`` is appended.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    active_parser = html_parser if parser is None else parser
    root = etree.fromstring(html, active_parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
| 770 | |
| 771 | |
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped in an
    # artificial <html><body> shell of the same string type.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            prefix = '<html><body>'.encode('ascii')
            suffix = '</body></html>'.encode('ascii')
            html = prefix + html + suffix
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading_text = body.text
    if leading_text and leading_text.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading_text)
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
| 809 | |
| 810 | |
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    reject_leading_text = not bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=reject_leading_text,
        base_url=base_url, **kw)

    if create_parent:
        # Any true non-string value means "use the default wrapper tag".
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    only = elements[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    only.tail = None
    return only
| 858 | |
| 859 | |
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    The input is always parsed as a document.  If it looked like a full
    HTML document, or the parsed tree has a ``head``, the document root
    is returned.  Otherwise the content of the body is returned: the
    single child element when there is just one (with no surrounding
    text), or else the body itself renamed to ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's
    docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_full = _looks_like_full_html_bytes
    else:
        looks_full = _looks_like_full_html_unicode
    is_full_html = looks_full(html)

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for extra_body in bodies[1:]:
        extra_text = extra_body.text
        if extra_text:
            if len(body):
                last_child = body[-1]
                last_child.tail = (last_child.tail or '') + extra_text
            else:
                body.text = (body.text or '') + extra_text
        body.extend(extra_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        extra_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    if len(body) == 1:
        body_text = body.text
        trailing = body[-1].tail
        if (not (body_text and body_text.strip())
                and not (trailing and trailing.strip())):
            # The body has just one element, so it was probably a single
            # element passed in
            return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
| 926 | |
| 927 | |
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree, delegating to ``etree.parse()`` with the module's HTML parser
    unless another ``parser`` is given.

    Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
| 940 | |
| 941 | |
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element it iterates over has a tag
    (namespace stripped) listed in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
| 949 | |
| 950 | |
def _element_name(el):
    """Return a short label for ``el`` for use in error messages:
    ``'comment'`` for comments, ``'string'`` for text, otherwise the
    element's tag name without its namespace."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
| 958 | |
| 959 | |
| 960 ################################################################################ | |
| 961 # form handling | |
| 962 ################################################################################ | |
| 963 | |
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.

        Assigning a mapping to ``fields`` sets every field named in the
        mapping and sets every other named field to ``None``.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        unset_names = current.keys()
        for name, new_value in value.items():
            if name in unset_names:
                unset_names.remove(name)
            current[name] = new_value
        # Whatever was not mentioned in the new mapping gets cleared.
        for name in unset_names:
            if name is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            current[name] = None

    def _name(self):
        """Return a label for this form: its ``name``, else ``#`` plus its
        ``id``, else its index among the forms found under ``self.body``."""
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        find_all = self.body.iter
        forms = (list(find_all('form'))
                 or list(find_all('{%s}form' % XHTML_NAMESPACE)))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed and disabled fields are skipped, as are unchecked
        checkboxes/radios and inputs of type submit, image, reset or
        file.  A multi-select contributes one tuple per selected value.
        """
        pairs = []
        for field in self.inputs:
            name = field.name
            if not name or 'disabled' in field.attrib:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((name, field.value))
                continue
            if tag == 'select':
                selected = field.value
                if field.multiple:
                    for item in selected:
                        pairs.append((name, item))
                elif selected is not None:
                    pairs.append((name, selected))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset', 'file'):
                continue
            current = field.value
            if current is not None:
                pairs.append((name, current))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a ``base_url`` the action is returned joined
        to it; deleting the property removes the attribute if present.
        """
        base_url = self.base_url
        raw_action = self.get('action')
        if not base_url or raw_action is None:
            return raw_action
        return urljoin(base_url, raw_action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
| 1081 | |
| 1082 | |
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The action is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.  The URL is the form's action, or its ``base_url`` when the
    action is empty.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_pairs = extra_values.items()
        else:
            extra_pairs = extra_values
        values.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    target_url = form.action or form.base_url
    return opener(form.method, target_url, values)
| 1119 | |
| 1120 | |
def open_http_urllib(method, url, values):
    """Default opener for ``submit_form()``, built on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request
    body.  Raises ``ValueError`` when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode

    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        url = url + separator + encoded
        payload = None
    elif isinstance(encoded, bytes):
        payload = encoded
    else:
        payload = encoded.encode('ASCII')
    return urlopen(url, payload)
| 1142 | |
| 1143 | |
class FieldsDict(MutableMapping):
    """Mapping view over a form's inputs, keyed by field name.

    Reading ``fields[name]`` returns the ``.value`` of the matching
    input (or input group); assigning sets that ``.value``.  Keys cannot
    be deleted.
    """

    def __init__(self, inputs):
        # ``inputs`` is the accessor that resolves names to input objects.
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys rather than calling len() on the accessor: the
        # accessor is not required to implement __len__, and the number
        # of keys is what agrees with __iter__.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
| 1168 | |
| 1169 | |
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """Return the input named ``name``.

        Several radio inputs (or several checkboxes) sharing the name
        come back as a `RadioGroup` (or `CheckboxGroup`) whose ``name``
        attribute is set; in every other case the first matching element
        is returned.  Raises ``KeyError`` when nothing matches.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = results[0].get('type')
        if len(results) > 1 and input_type == 'radio':
            group = RadioGroup(results)
            group.name = name
            return group
        if len(results) > 1 and input_type == 'checkbox':
            group = CheckboxGroup(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """Return the unique field names as a list.

        Names appear in the order their first element is yielded by
        iterating this object, so repeated calls on an unchanged form
        give the same order.  Elements without a name are left out.
        """
        names = []
        seen = set()
        for el in self:
            name = el.name
            if name is None or name in seen:
                continue
            seen.add(name)
            names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))
| 1235 | |
| 1236 | |
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """

    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting the property removes the attribute if it is present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only mention the type when the element exposes a non-empty one.
        kind = getattr(self, 'type', None)
        type_suffix = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)
| 1266 | |
| 1267 | |
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Child elements, if any, are serialised and appended to the text,
        using the ``xml`` method for XHTML-namespaced elements and
        ``html`` otherwise.  Setting or deleting the value drops all
        children.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        pieces = [self.text or '']
        # it's rare that there are any children to serialise here
        for child in self:
            pieces.append(etree.tostring(
                child, method=method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
| 1301 | |
| 1302 | |
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select the last option carrying ``selected`` wins;
        without one, the first option that is not ``disabled`` is used,
        and ``None`` is returned when there is no such option.  An option
        without a ``value`` attribute contributes its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = None
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        if chosen is None:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
        if chosen is None:
            return None

        result = chosen.get('value')
        if result is None:
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return

        target = None
        if value is not None:
            # Find the option to select before touching any attribute.
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)

        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        text of an option that has no ``value`` attribute).
        """
        values = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            if option_value is None:
                option_value = (option.text or '').strip()
            values.append(option_value)
        return values

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
| 1404 | |
| 1405 | |
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.  An option without a ``value`` attribute is
    identified by its stripped text.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            current = option.get('value')
            if current is None:
                current = (option.text or '').strip()
            yield current

    def add(self, item):
        """Select the first option whose value equals ``item``; raise
        ``ValueError`` if there is none."""
        for option in self.options:
            current = option.get('value')
            if current is None:
                current = (option.text or '').strip()
            if current == item:
                option.set('selected', '')
                return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        """Unselect the first option whose value equals ``item``; raise
        ``ValueError`` if it is not selected or does not exist."""
        for option in self.options:
            current = option.get('value')
            if current is None:
                current = (option.text or '').strip()
            if current != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        shown = ', '.join([repr(entry) for entry in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            shown,
            self.select.name)
| 1465 | |
| 1466 | |
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the ``value`` of the first checked radio, or
        ``None`` when none is checked.  Setting ``None`` (or deleting)
        unchecks everything; an unknown value raises ``ValueError``.
        """
        checked = next(
            (radio for radio in self if 'checked' in radio.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Resolve the radio first so a bad value changes nothing.
            target = next(
                (radio for radio in self if radio.get('value') == value),
                None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)
| 1518 | |
| 1519 | |
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable replaces the checked set with its items;
        deleting the property unchecks every checkbox.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        checked = self.value
        checked.clear()
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [box.get('value') for box in self]

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)
| 1562 | |
| 1563 | |
class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values in a list before iterating.
        checked_values = [
            box.get('value')
            for box in self.group
            if 'checked' in box.attrib]
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox whose value equals ``value``; raise
        ``KeyError`` if there is none."""
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """Uncheck the first checkbox whose value equals ``value``; raise
        ``KeyError`` if it is already unchecked or does not exist."""
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(entry) for entry in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
| 1605 | |
| 1606 | |
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox or radio that is checked and has no
        value, this defaults to ``'on'``.  If it is a checkbox or radio
        that is not checked, this returns None.

        Setting a false value on a checkable input unchecks it; a true
        value checks it and, when it is a string, stores it as the
        ``value`` attribute.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        if isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; any other type
        raises ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
| 1702 | |
| 1703 | |
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Returns None when the
        label has no (or an empty) ``for`` attribute.

        Setting stores the ``id`` of the given element in the label's
        ``for`` attribute and raises ``TypeError`` if that element has no
        id.  Deleting removes the ``for`` attribute.

        NOTE(review): the lookup is delegated to
        ``self.body.get_element_by_id()`` without a default -- confirm
        what that does for an id that is not in the document.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute (see getter/setter);
        # the label's own ``id`` must be left untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
| 1738 | |
| 1739 | |
| 1740 ############################################################ | |
| 1741 ## Serialization | |
| 1742 ############################################################ | |
| 1743 | |
def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either an element or an object with ``getroot()``; the tree
    is modified in place.  Tags that already carry a namespace are left
    alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter(etree.Element):
        name = node.tag
        if name[0] != '{':
            node.tag = xhtml_prefix + name
| 1757 | |
| 1758 | |
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to HTML in place.

    ``xhtml`` may be an element or anything providing ``getroot()``.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        root = xhtml
    namespace = "{%s}" % XHTML_NAMESPACE
    skip = len(namespace)
    # Only elements in the XHTML namespace are visited.
    for element in root.iter(namespace + "*"):
        element.tag = element.tag[skip:]
| 1771 | |
| 1772 | |
# Not a general-purpose matcher: it only targets the form in which
# libxml2 specifically serialises the tag.  One substitution function
# is provided for text output and one for bytes output.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub
| 1779 | |
| 1780 | |
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Serialise the document and return its HTML string representation.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.
    With the 'html' method and include_meta_content_type left false,
    the tag is stripped from the serialised output.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Pick the substitution helper matching the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
| 1856 | |
| 1857 | |
def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` is wrapped in an ElementTree if it is not one already.
    ``encoding`` selects the encoding of the written file; when it is
    not given, the document's own encoding is used, falling back to
    UTF-8.  The ``file://`` URL of the temporary file is printed
    before the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The file itself is deliberately left on disk, but its handle is
    # always closed, even if writing fails.
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html",
                  encoding=encoding or doc.docinfo.encoding or "UTF-8")
    path = fn.replace(os.path.sep, '/')
    if not path.startswith('/'):
        # Windows drive paths ("C:/...") need a leading slash, otherwise
        # the drive letter ends up in the host part of the file URL.
        path = '/' + path
    url = 'file://' + path
    print(url)
    webbrowser.open(url)
| 1879 | |
| 1880 | |
| 1881 ################################################################################ | |
| 1882 # configure Element class lookup | |
| 1883 ################################################################################ | |
| 1884 | |
class HTMLParser(etree.HTMLParser):
    """HTML parser preconfigured with the lxml.html element class
    lookup, so that it returns lxml.html Element objects.
    """
    def __init__(self, **kwargs):
        # Keyword arguments are forwarded unchanged to etree.HTMLParser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
| 1892 | |
| 1893 | |
class XHTMLParser(etree.XMLParser):
    """XML parser preconfigured with the lxml.html element class
    lookup, so that it returns lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        # Keyword arguments are forwarded unchanged to etree.XMLParser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
| 1914 | |
| 1915 | |
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are passed on to ``html_parser.makeelement()``.
    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)
| 1923 | |
| 1924 | |
# Module-level parser instances.  ``Element()`` creates its elements
# through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
