Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/bs4/builder/__init__.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 # Use of this source code is governed by the MIT license. | |
2 __license__ = "MIT" | |
3 | |
4 from collections import defaultdict | |
5 import itertools | |
6 import sys | |
7 from bs4.element import ( | |
8 CharsetMetaAttributeValue, | |
9 ContentMetaAttributeValue, | |
10 Stylesheet, | |
11 Script, | |
12 TemplateString, | |
13 nonwhitespace_re | |
14 ) | |
15 | |
# Names exported by `from bs4.builder import *`. This is only the initial
# list: register_treebuilders_from() appends the name of every builder
# class it copies into this module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]
22 | |
# Some useful features for a TreeBuilder to have.
#
# TreeBuilderRegistry indexes builders by the entries of their `features`
# attribute and matches them against the arguments of lookup(); these
# constants are the shared spellings for the common ones.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
30 | |
31 | |
class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.

    Builders are stored most-recently-registered first, so a later
    registration takes priority over an earlier one that advertises the
    same feature.
    """

    def __init__(self):
        # Maps a feature string to the builder classes that advertise it,
        # most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of TreeBuilder. Its .features
            attribute should list its features.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Look up a TreeBuilder subclass with the desired features.

        :param features: A list of features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            will be used.
        :return: A TreeBuilder subclass, or None if no registered
            subclass satisfies the request.

        A requested feature that no registered builder advertises is
        skipped rather than treated as unsatisfiable; None is returned
        only when nothing is registered, when none of the requested
        features is known, or when no single builder has all of the
        known ones.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any
        # builders that don't match every (known) feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # Use .get() so that asking about an unknown feature does
            # not create an empty entry in the defaultdict.
            we_have_the_feature = self.builders_for_feature.get(feature)
            if not we_have_the_feature:
                continue
            if candidates is None:
                # The first known feature fixes the priority order.
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set.intersection_update(we_have_the_feature)

        if candidates is None:
            # None of the requested features is known to the registry.
            return None

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first
        # one that's in candidate_set, to respect registration priority.
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None
96 | |
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
#
# This is the single module-level instance that
# register_treebuilders_from() registers builder classes with.
builder_registry = TreeBuilderRegistry()
100 | |
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    # None means "no opinion": a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It is
    # needed because None is itself a meaningful value for some of the
    # constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
                 ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # The sentinel is always tested by identity. An equality test
        # would let an argument with a permissive __eq__ be mistaken for
        # "not provided" and silently replaced by the class default.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None (no opinion) or
            contains `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document. The base
            implementation returns the fragment unchanged.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes listed under '*' apply to every tag; the
            # tag-specific entry is looked up by lower-cased tag name.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
321 | |
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.

    The methods below follow the naming of the SAX content-handler
    callbacks; element and character events are forwarded to
    self.soup, everything else is ignored.
    """

    def feed(self, markup):
        """Not implemented; this class only receives SAX events.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()

    def close(self):
        """Do nothing; there is no parser state to release."""
        pass

    def startElement(self, name, attrs):
        """Forward the start of an element to the soup.

        :param name: The element name.
        :param attrs: A mapping of attribute names to values. Keys may
            be plain strings, or (namespace, localname) tuples as
            passed on by startElementNS; for tuples only the local
            name is kept.
        """
        # Indexing every key with [1] is only right for namespace
        # tuples; on a plain string it would pick the second character
        # (or raise IndexError for a one-character name).
        attrs = {
            (key[1] if isinstance(key, tuple) else key): value
            for key, value in attrs.items()
        }
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward the end of an element to the soup.

        :param name: The element name.
        """
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespace-aware element start.

        The namespace tuple is thrown away for now; only `nodeName`
        and the attributes are forwarded to startElement.
        """
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespace-aware element end.

        The namespace tuple is thrown away for now.
        """
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the start of a namespace prefix mapping, for now."""
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of a namespace prefix mapping, for now."""
        pass

    def characters(self, content):
        """Forward character data to the soup.

        :param content: The text that was encountered.
        """
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing at the start of a document."""
        pass

    def endDocument(self):
        """Do nothing at the end of a document."""
        pass
370 | |
371 | |
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer',
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {
        "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main",
        "nav", "noscript", "ol", "output", "p", "pre", "section", "table",
        "tfoot", "ul", "video",
    }

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        # NOTE(review): "icon" does not look like an HTML element name;
        # kept as-is to preserve behavior -- confirm the intended tag.
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            #
            # This branch replaces an attribute value too, so it must
            # report the substitution just like the HTML 5 branch.
            tag['content'] = ContentMetaAttributeValue(content)
            return True

        return False
476 | |
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, added to this
    module's `__all__`, and registered with `builder_registry`. Names
    that refer to anything else are skipped.

    :param module: A module object that defines `__all__`.
    :return: None.
    """
    # Look this module up under the name it was actually imported as,
    # rather than assuming it is always 'bs4.builder'.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when handed something that is
        # not a class, so rule out functions and constants first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        # Don't list the same name twice if a module is registered
        # more than once.
        if name not in this_module.__all__:
            this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
489 | |
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser flatly refuses to parse the
    markup it was given.
    """

    def __init__(self, message_or_exception):
        """Record why the markup was rejected.

        :param message_or_exception: Either a textual explanation, or
            another exception; an exception is rendered as
            "<ClassName>: <message>".
        """
        reason = message_or_exception
        if isinstance(reason, Exception):
            # Fold the wrapped exception into a single readable string.
            reason = "%s: %s" % (type(reason).__name__, str(reason))
        super().__init__(reason)
502 | |
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# TreeBuilderRegistry.register() inserts at the front of its lists, so
# whichever module is registered last here is found first by lookup().
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed. This builder is optional, so
    # its absence is deliberately ignored.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed. This builder is optional, so its
    # absence is deliberately ignored.
    pass