comparison planemo/lib/python3.7/site-packages/bs4/__init__.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400 (2020-07-31)
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 """Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
2
3 http://www.crummy.com/software/BeautifulSoup/
4
5 Beautiful Soup uses a pluggable XML or HTML parser to parse a
6 (possibly invalid) document into a tree representation. Beautiful Soup
7 provides methods and Pythonic idioms that make it easy to navigate,
8 search, and modify the parse tree.
9
10 Beautiful Soup works with Python 2.7 and up. It works better if lxml
11 and/or html5lib is installed.
12
13 For more than you ever wanted to know about Beautiful Soup, see the
14 documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
15 """
16
# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.9.1"
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# The only name exported by a star-import of this module.
__all__ = ["BeautifulSoup"]
24
25 import os
26 import re
27 import sys
28 import traceback
29 import warnings
30
31 from .builder import builder_registry, ParserRejectedMarkup
32 from .dammit import UnicodeDammit
33 from .element import (
34 CData,
35 Comment,
36 DEFAULT_OUTPUT_ENCODING,
37 Declaration,
38 Doctype,
39 NavigableString,
40 PageElement,
41 ProcessingInstruction,
42 PYTHON_SPECIFIC_ENCODINGS,
43 ResultSet,
44 Script,
45 Stylesheet,
46 SoupStrainer,
47 Tag,
48 TemplateString,
49 )
50
51 # The very first thing we do is give a useful error if someone is
52 # running this code under Python 3 without converting it.
53 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
54
55 # Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to pick a parser itself.

    This usually means no parser was named in the constructor call.
    """
60
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' looks like a resource locator.

    That is, the string handed to BeautifulSoup appears to be a URL or a
    path to a file on disk rather than a document.
    """
66
67
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are
    inherited from PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Template for the GuessedAtParserWarning message; it is filled in
    # with the markup_type, parser, line_number and filename keys.
    NO_PARSER_SPECIFIED_WARNING = (
        "No parser was explicitly specified, so I'm using the best "
        "available %(markup_type)s parser for this system "
        "(\"%(parser)s\"). This usually isn't a problem, but if you run "
        "this code on another system, or in a different virtual "
        "environment, it may use a different parser and behave "
        "differently.\n\n"
        "The code that caused this warning is on line %(line_number)s of "
        "the file %(filename)s. To get rid of this warning, pass the "
        "additional argument 'features=\"%(parser)s\"' to the "
        "BeautifulSoup constructor.\n"
    )
113
def __init__(self, markup="", features=None, builder=None,
             parse_only=None, from_encoding=None, exclude_encodings=None,
             element_classes=None, **kwargs):
    """Constructor.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raise FeatureNotFound: If no tree builder offers the requested
     features.
    :raise ParserRejectedMarkup: If the tree builder rejected every
     version of the markup that it prepared.
    """
    # Beautiful Soup 3 arguments that are accepted but ignored. Each one
    # is removed from kwargs (so it never reaches the TreeBuilder) and
    # reported with the warning paired with it, in this order.
    ignored_bs3_arguments = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )
    for bs3_name, message in ignored_bs3_arguments:
        if bs3_name in kwargs:
            del kwargs[bs3_name]
            warnings.warn(message)

    def deprecated_argument(old_name, new_name):
        """Pop a renamed keyword argument from kwargs, warning about it."""
        if old_name not in kwargs:
            return None
        warnings.warn(
            'The "%s" argument to the BeautifulSoup constructor '
            'has been renamed to "%s."' % (old_name, new_name))
        return kwargs.pop(old_name)

    parse_only = parse_only or deprecated_argument(
        "parseOnlyThese", "parse_only")

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding")

    if from_encoding and isinstance(markup, str):
        warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features))

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        builder = builder_class(**kwargs)
        if not original_builder and not (
                original_features == builder.NAME or
                original_features in builder.ALTERNATE_NAMES
        ):
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            # sys._getframe(1) is the frame that called this constructor, so
            # this lookup has to stay directly inside __init__.
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            if caller:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                caller_globals = sys.__dict__
                line_number = 1
            filename = caller_globals.get('__file__')
            if filename:
                fnl = filename.lower()
                if fnl.endswith((".pyc", ".pyo")):
                    # Drop the trailing "c"/"o" to name the .py file.
                    filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type
                )
                warnings.warn(
                    self.NO_PARSER_SPECIFIED_WARNING % values,
                    GuessedAtParserWarning, stacklevel=2
                )
    else:
        if kwargs:
            warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    self.builder.initialize_soup(self)

    if hasattr(markup, 'read'):  # It's a file-type object.
        markup = markup.read()
    elif len(markup) <= 256 and (
            (isinstance(markup, bytes) and b'<' not in markup)
            or (isinstance(markup, str) and '<' not in markup)
    ):
        # Print out warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # just in case that's what the user really wants.
        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        is_file = False
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            pass
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should'
                ' probably open this file and pass the filehandle into'
                ' Beautiful Soup.' % self._decode_markup(markup),
                MarkupResemblesLocatorWarning
            )
        self._check_markup_is_url(markup)

    # Try each version of the markup that the builder prepares until one
    # of them is parsed without being rejected.
    rejections = []
    success = False
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
        self.reset()
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            rejections.append(e)

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
362
def __copy__(self):
    """Copy a BeautifulSoup object by serializing the document to UTF-8
    and parsing the result again with the same builder.
    """
    serialized = self.encode('utf-8')
    duplicate = type(self)(
        serialized, builder=self.builder, from_encoding='utf-8'
    )

    # The tree was re-encoded as UTF-8 for the round trip, which may not
    # have been the encoding of the original markup, so carry the
    # original object's .original_encoding over to the copy.
    duplicate.original_encoding = self.original_encoding
    return duplicate
375
def __getstate__(self):
    """Return a copy of the instance dictionary for pickling.

    The 'builder' entry is replaced with None when the builder reports
    that it is not picklable.
    """
    state = self.__dict__.copy()
    # Frequently a tree builder can't be pickled.
    if 'builder' in state and not self.builder.picklable:
        state['builder'] = None
    return state
382
@classmethod
def _decode_markup(cls, markup):
    """Return `markup` as text so it's safe to send into warnings.warn.

    Bytes are decoded as UTF-8, with undecodable sequences replaced;
    any other value is returned unchanged.

    TODO: warnings.warn had this problem back in 2010 but it might not
    anymore.
    """
    if not isinstance(markup, bytes):
        return markup
    return markup.decode('utf-8', 'replace')
395
@classmethod
def _check_markup_is_url(cls, markup):
    """Error-handling method to raise a warning if incoming markup looks
    like a URL.

    Markup "looks like a URL" when it starts with "http:" or "https:"
    and contains no space character. Values that are neither bytes nor
    str are ignored.

    :param markup: A string.
    """
    if isinstance(markup, bytes):
        space, url_prefixes = b' ', (b"http:", b"https:")
    elif isinstance(markup, str):
        space, url_prefixes = ' ', ("http:", "https:")
    else:
        return

    # startswith() accepts a tuple and matches any of its members.
    if not markup.startswith(url_prefixes):
        return
    if space in markup:
        return

    warnings.warn(
        '"%s" looks like a URL. Beautiful Soup is not an'
        ' HTTP client. You should probably use an HTTP client like'
        ' requests to get the document behind the URL, and feed'
        ' that document to Beautiful Soup.' % cls._decode_markup(
            markup
        ),
        MarkupResemblesLocatorWarning
    )
423
def _feed(self):
    """Internal method that parses previously set markup, creating a large
    number of Tag and NavigableString objects.
    """
    tree_builder = self.builder
    # Start the builder from a clean state, then hand it self.markup.
    tree_builder.reset()
    tree_builder.feed(self.markup)

    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
436
def reset(self):
    """Reset this object to a state as though it had never parsed any
    markup.
    """
    # Re-run the Tag initializer so this object becomes the root tag.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1
    self.builder.reset()

    # Fresh parsing bookkeeping.
    self.currentTag = None
    self.current_data = []
    self.tagStack = []
    self.string_container_stack = []
    self.preserve_whitespace_tag_stack = []

    # The document itself is the bottom entry of the tag stack.
    self.pushTag(self)
450
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
            sourceline=None, sourcepos=None, **kwattrs):
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
     be used instead of `kwattrs` for attributes like 'class'
     that are reserved words in Python. Its entries override
     same-named entries in `kwattrs`.
    :param sourceline: The line number where this tag was
     (purportedly) found in its source document.
    :param sourcepos: The character position within `sourceline` where this
     tag was (purportedly) found.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.

    """
    # `attrs` is only read here, never modified.
    kwattrs.update(attrs)
    # Honor a replacement class for Tag registered in element_classes.
    tag_class = self.element_classes.get(Tag, Tag)
    return tag_class(
        None, self.builder, name, namespace, nsprefix, kwattrs,
        sourceline=sourceline, sourcepos=sourcepos
    )
473
def string_container(self, base_class=None):
    """Pick the class that should hold the next string.

    :param base_class: The requested string class; NavigableString is
     used when this is not given.
    :return: The class to instantiate for the string.
    """
    requested = base_class or NavigableString

    # There may be a general override of NavigableString.
    container = self.element_classes.get(requested, requested)

    if not self.string_container_stack:
        return container

    # On top of that, we may be inside a tag that needs a special
    # container class.
    enclosing_tag = self.string_container_stack[-1]
    return self.builder.string_containers.get(enclosing_tag.name, container)
489
def new_string(self, s, subclass=None):
    """Create a new NavigableString associated with this BeautifulSoup
    object.

    :param s: The text of the new string.
    :param subclass: Passed to string_container() to choose the class
     that is instantiated.
    """
    return self.string_container(subclass)(s)
496
def insert_before(self, successor):
    """This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it in
    the parse tree.

    :raise NotImplementedError: Always.
    """
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)
502
def insert_after(self, successor):
    """This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it in
    the parse tree.

    :raise NotImplementedError: Always.
    """
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)
508
def popTag(self):
    """Internal method called by _popToTag when a tag is closed.

    :return: The tag that is current once the pop is done. If the tag
     stack is now empty, `currentTag` is left unchanged.
    """
    closed = self.tagStack.pop()

    # Keep the two auxiliary stacks in step with the tag stack.
    whitespace_stack = self.preserve_whitespace_tag_stack
    if whitespace_stack and closed == whitespace_stack[-1]:
        whitespace_stack.pop()

    container_stack = self.string_container_stack
    if container_stack and closed == container_stack[-1]:
        container_stack.pop()

    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
520
def pushTag(self, tag):
    """Internal method called by handle_starttag when a tag is opened.

    :param tag: The tag to add to the current tag's contents and to
     make the new current tag.
    """
    parent = self.currentTag
    if parent is not None:
        parent.contents.append(tag)

    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]

    # Remember tags that change how the strings inside them are handled.
    tree_builder = self.builder
    if tag.name in tree_builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag.name in tree_builder.string_containers:
        self.string_container_stack.append(tag)
532
def endData(self, containerClass=None):
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    :param containerClass: The string class requested for the collected
     data; string_container() makes the final choice.
    """
    containerClass = self.string_container(containerClass)

    if not self.current_data:
        return

    text = ''.join(self.current_data)

    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space
    # or newline.
    if not self.preserve_whitespace_tag_stack:
        if all(char in self.ASCII_SPACES for char in text):
            text = '\n' if '\n' in text else ' '

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all?
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1 and (
            not strainer.text or not strainer.search(text)):
        return

    self.object_was_parsed(containerClass(text))
567
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The object to add.
    :param parent: The element that receives `o`; defaults to the
     current tag.
    :param most_recent_element: The element to link `o` after; defaults
     to the most recently parsed element.
    """
    if parent is None:
        parent = self.currentTag

    if most_recent_element is None:
        previous_element = self._most_recent_element
    else:
        previous_element = most_recent_element

    next_element = previous_sibling = next_sibling = None
    if isinstance(o, Tag):
        # A Tag may already carry links; keep them.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element

    # Decide this before `o` is linked in: a parent that already has a
    # next_element is a node that was already parsed.
    needs_linkage_fix = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if needs_linkage_fix:
        self._linkage_fixer(parent)
595
def _linkage_fixer(self, el):
    """Make sure linkage of this fragment is sound.

    :param el: The element whose last child was just appended.
    """
    first_child = el.contents[0]
    last_child = el.contents[-1]
    tail = last_child

    if last_child is first_child and el.parent is not None:
        # Parent should be linked to first child
        el.next_element = last_child
        # We are no longer linked to whatever this element is
        former_previous = last_child.previous_element
        if former_previous is not None and former_previous is not el:
            former_previous.next_element = None
        # First child should be linked to the parent, and no previous siblings.
        last_child.previous_element = el
        last_child.previous_sibling = None

    # We have no sibling as we've been appended as the last.
    last_child.next_sibling = None

    # This index is a tag, dig deeper for a "last descendant"
    if isinstance(last_child, Tag) and last_child.contents:
        tail = last_child._last_descendant(False)

    # As the final step, link last descendant. It should be linked
    # to the parent's next sibling (if found), else walk up the chain
    # and find a parent with a sibling. It should have no next sibling.
    tail.next_element = None
    tail.next_sibling = None
    target = el
    while target is not None:
        following = target.next_sibling
        if following is not None:
            tail.next_element = following
            # NOTE(review): the back-link points at the appended child,
            # not at its last descendant -- confirm this is intended.
            following.previous_element = last_child
            break
        target = target.parent
635
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
    """Pops the tag stack up to and including the most recent
    instance of the given tag.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with `name`.
    :param inclusivePop: If this is false, pops the tag stack up
     to but *not* including the most recent instance of the
     given tag.
    :return: The value returned by the last popTag() call, or None if
     nothing was popped.
    """
    if name == self.ROOT_TAG_NAME:
        # The BeautifulSoup object itself can never be popped.
        return

    most_recently_popped = None

    # Walk from the top of the stack down to index 1; index 0 is the
    # BeautifulSoup object and is never examined or popped.
    depth = len(self.tagStack)
    for index in reversed(range(1, depth)):
        candidate = self.tagStack[index]
        if name == candidate.name and nsprefix == candidate.prefix:
            if inclusivePop:
                most_recently_popped = self.popTag()
            break
        most_recently_popped = self.popTag()

    return most_recently_popped
663
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                    sourcepos=None):
    """Called by the tree builder when a new tag is encountered.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag; passed on to the Tag
     constructor.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values.
    :param sourceline: The line number where this tag was found in its
     source document.
    :param sourcepos: The character position within `sourceline` where this
     tag was found.

    If this method returns None, the tag was rejected by an active
    SoupStrainer. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.
    """
    # Finish any string that was being collected before this tag.
    self.endData()

    # At the top level of the document an active SoupStrainer decides
    # whether this tag is kept.
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1:
        if strainer.text or not strainer.search_tag(name, attrs):
            return None

    tag_class = self.element_classes.get(Tag, Tag)
    tag = tag_class(
        self, self.builder, name, namespace, nsprefix, attrs,
        self.currentTag, self._most_recent_element,
        sourceline=sourceline, sourcepos=sourcepos
    )
    if tag is None:
        return tag

    previous = self._most_recent_element
    if previous is not None:
        previous.next_element = tag
    self._most_recent_element = tag
    self.pushTag(tag)
    return tag
701
def handle_endtag(self, name, nsprefix=None):
    """Called by the tree builder when an ending tag is encountered.

    Ends the current data node, then pops the tag stack through the
    most recent tag matching `name` and `nsprefix`.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.
    """
    self.endData()
    self._popToTag(name, nsprefix)
711
def handle_data(self, data):
    """Called by the tree builder when a chunk of textual data is
    encountered.

    :param data: The chunk to add to the pending data, which endData()
     later joins into a single string.
    """
    self.current_data.append(data)
715
def decode(self, pretty_print=False,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a string or Unicode representation of the parse tree
    as an HTML or XML document.

    :param pretty_print: If this is True, indentation will be used to
     make the document more readable.
    :param eventual_encoding: The encoding of the final document.
     If this is None, the document will be a Unicode string.
    :param formatter: Passed through unchanged to the parent class's
     decode().
    """
    if self.is_xml:
        # Print the XML declaration
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            eventual_encoding = None
        encoding_part = ''
        if eventual_encoding is not None:
            encoding_part = ' encoding="%s"' % eventual_encoding
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ''

    # The parent class takes an indent level: None turns pretty-printing
    # off, 0 starts it at the left margin.
    indent_level = 0 if pretty_print else None
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter)
746
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = _soup = BeautifulSoup
750
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, and delegate
        to the BeautifulSoup constructor.
        """
        kwargs['features'] = 'xml'
        deprecation_message = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_message)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
760
761
class StopParsing(Exception):
    """Exception raised by a TreeBuilder if it's unable to continue
    parsing.
    """
765
class FeatureNotFound(ValueError):
    """Exception raised by the BeautifulSoup constructor when no parser
    with the requested features is found.
    """
771
772
# If this file is run as a script, act as an HTML pretty-printer:
# read a document from standard input and print it prettified.
if __name__ == '__main__':
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())