Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/bs4/builder/__init__.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d30785e31577 |
|---|---|
| 1 # Use of this source code is governed by the MIT license. | |
| 2 __license__ = "MIT" | |
| 3 | |
| 4 from collections import defaultdict | |
| 5 import itertools | |
| 6 import sys | |
| 7 from bs4.element import ( | |
| 8 CharsetMetaAttributeValue, | |
| 9 ContentMetaAttributeValue, | |
| 10 Stylesheet, | |
| 11 Script, | |
| 12 TemplateString, | |
| 13 nonwhitespace_re | |
| 14 ) | |
| 15 | |
# Names exported by this module. register_treebuilders_from() appends
# the name of every TreeBuilder subclass it copies into this module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]

# Some useful features for a TreeBuilder to have. A TreeBuilder
# subclass lists the ones it supports in its `features` attribute, and
# TreeBuilderRegistry.lookup() matches requested features against
# those lists.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
| 30 | |
| 31 | |
class TreeBuilderRegistry(object):
    """An index of TreeBuilder subclasses, searchable by advertised feature.

    Builders are kept most-recently-registered first, so a later
    registration takes precedence over an earlier one.
    """

    def __init__(self):
        # feature name -> builders advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Add a builder class to the registry.

        :param treebuilder_class: A subclass of TreeBuilder. Its
         .features attribute should list the features it supports.
        """
        for advertised in treebuilder_class.features:
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the preferred builder class for the requested features.

        :param features: Feature names to look for. If none are
         given, the most recently registered builder is returned.
        :return: A TreeBuilder subclass, or None when nothing matches.
         A requested feature that no registered builder advertises is
         skipped rather than treated as a failure; the result must
         support every requested feature that at least one builder
         advertises. None is also returned when the registry is empty
         or when none of the requested features is advertised.
        """
        if not self.builders:
            # Nothing has been registered yet.
            return None

        if not features:
            # No constraints: hand back the newest registration.
            return self.builders[0]

        # `ordered` remembers the priority order of the builders
        # supporting the first recognized feature; `survivors` is
        # narrowed by each further recognized feature.
        ordered = None
        survivors = None
        for wanted in features:
            # .get() rather than indexing, so that a lookup never adds
            # an empty entry to the defaultdict.
            supporting = self.builders_for_feature.get(wanted, [])
            if not supporting:
                continue
            if ordered is None:
                ordered = supporting
                survivors = set(supporting)
            else:
                survivors = survivors & set(supporting)

        if survivors is None:
            # None of the requested features is known to the registry.
            return None

        # Return the highest-priority builder that survived the narrowing.
        return next(
            (builder for builder in ordered if builder in survivors), None)
| 96 | |
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry. This is the module-wide instance
# that register_treebuilders_from() registers builders with.
builder_registry = TreeBuilderRegistry()
| 100 | |
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False

    # A tag will be considered an empty-element tag when and only when
    # it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It is
    # always compared by identity, so that None, False, empty
    # containers and objects with an unusual __eq__ remain valid
    # explicit values.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # Every sentinel check uses `is`: an equality test would call
        # the caller's __eq__ and could mistake a legitimate argument
        # for "not provided".
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None (any tag might
         be an empty-element tag); otherwise whether `tag_name` is in
         `empty_element_tags`.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
         in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
         these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        # The encoding arguments are ignored here: the markup is
        # passed through untouched.
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document. This base
         implementation returns the fragment unchanged.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
         Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes that are multi-valued on every tag...
            universal = self.cdata_list_attributes.get('*', [])
            # ...and those that are multi-valued only on this tag.
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Iterate over a snapshot of the keys because values are
            # reassigned inside the loop.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
| 321 | |
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder driven by SAX-style callbacks.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work: element and character events
    are forwarded to the BeautifulSoup object in self.soup.
    """

    def feed(self, markup):
        """Not implemented for this builder; always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward a start-tag event to the soup.

        :param name: The tag name.
        :param attrs: A mapping whose keys are indexable; element [1]
         of each key is used as the attribute name.
        """
        plain_attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Forward an end-tag event to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag; nsTuple is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag; nsTuple is discarded for now."""
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the prefix mapping for now."""
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of the prefix mapping for now."""
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass
| 370 | |
| 371 | |
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer',
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {
        "address", "article", "aside", "blockquote", "canvas", "dd",
        "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer",
        "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li",
        "main", "nav", "noscript", "ol", "output", "p", "pre", "section",
        "table", "tfoot", "ul", "video",
    }

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed. True
         for both the HTML 5 (`charset`) and the HTML 4
         (`http-equiv`/`content`) forms; False for every other tag.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch performs a substitution too, so report it.
            substituted = True

        return substituted
| 476 | |
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name listed in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of the 'bs4.builder' module, added
    to that module's `__all__`, and registered with its
    `builder_registry`. Names that refer to anything else (functions,
    constants, unrelated classes) are skipped.

    :param module: A module object with an `__all__` list.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so rule out non-class exports first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        # Don't list the same name twice if a module is registered again.
        if name not in this_module.__all__:
            this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
| 489 | |
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser simply refuses to parse the
    given markup.
    """

    def __init__(self, message_or_exception):
        """Record why the parser rejected the markup.

        :param message_or_exception: Either a textual explanation, or
         an Exception, in which case the message becomes
         "<ExceptionClassName>: <str(exception)>".
        """
        explanation = message_or_exception
        if isinstance(explanation, Exception):
            # Flatten the wrapped exception into a readable message.
            explanation = "%s: %s" % (
                explanation.__class__.__name__, str(explanation))
        super().__init__(explanation)
| 502 | |
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# TreeBuilderRegistry.register() inserts each builder at the front of
# its lists and lookup() returns the first match, so the builder
# registered last here is the one found first.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
