Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/bs4/__init__.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/bs4/__init__.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,736 +0,0 @@ -"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". - -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup uses a pluggable XML or HTML parser to parse a -(possibly invalid) document into a tree representation. Beautiful Soup -provides methods and Pythonic idioms that make it easy to navigate, -search, and modify the parse tree. - -Beautiful Soup works with Python 2.7 and up. It works better if lxml -and/or html5lib is installed. - -For more than you ever wanted to know about Beautiful Soup, see the -documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ -""" - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.9.0" -__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = ['BeautifulSoup'] - -import os -import re -import sys -import traceback -import warnings - -from .builder import builder_registry, ParserRejectedMarkup -from .dammit import UnicodeDammit -from .element import ( - CData, - Comment, - DEFAULT_OUTPUT_ENCODING, - Declaration, - Doctype, - NavigableString, - PageElement, - ProcessingInstruction, - ResultSet, - SoupStrainer, - Tag, - ) - -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' - -class BeautifulSoup(Tag): - """A data structure representing a parsed HTML or XML document. - - Most of the methods you'll call on a BeautifulSoup object are inherited from - PageElement or Tag. - - Internally, this class defines the basic interface called by the - tree builders when converting an HTML/XML document into a data - structure. The interface abstracts away the differences between - parsers. To write a new tree builder, you'll need to understand - these methods as a whole. - - These methods will be called by the BeautifulSoup constructor: - * reset() - * feed(markup) - - The tree builder may call these methods from its feed() implementation: - * handle_starttag(name, attrs) # See note about return value - * handle_endtag(name) - * handle_data(data) # Appends to the current data node - * endData(containerClass) # Ends the current data node - - No matter how complicated the underlying parser is, you should be - able to build a tree using 'start tag' events, 'end tag' events, - 'data' events, and "done with data" events. - - If you encounter an empty-element tag (aka a self-closing tag, - like HTML's <br> tag), call handle_starttag and then - handle_endtag. - """ - - # Since BeautifulSoup subclasses Tag, it's possible to treat it as - # a Tag with a .name. This name makes it clear the BeautifulSoup - # object isn't a real markup tag. - ROOT_TAG_NAME = '[document]' - - # If the end-user gives no indication which tree builder they - # want, look for one with these features. - DEFAULT_BUILDER_FEATURES = ['html', 'fast'] - - # A string containing all ASCII whitespace characters, used in - # endData() to detect data chunks that seem 'empty'. - ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - - def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, exclude_encodings=None, - element_classes=None, **kwargs): - """Constructor. - - :param markup: A string or a file-like object representing - markup to be parsed. - - :param features: Desirable features of the parser to be - used. This may be the name of a specific parser ("lxml", - "lxml-xml", "html.parser", or "html5lib") or it may be the - type of markup to be used ("html", "html5", "xml"). It's - recommended that you name a specific parser, so that - Beautiful Soup gives you the same results across platforms - and virtual environments. - - :param builder: A TreeBuilder subclass to instantiate (or - instance to use) instead of looking one up based on - `features`. You only need to use this if you've implemented a - custom TreeBuilder. - - :param parse_only: A SoupStrainer. Only parts of the document - matching the SoupStrainer will be considered. This is useful - when parsing part of a document that would otherwise be too - large to fit into memory. - - :param from_encoding: A string indicating the encoding of the - document to be parsed. Pass this in if Beautiful Soup is - guessing wrongly about the document's encoding. - - :param exclude_encodings: A list of strings indicating - encodings known to be wrong. Pass this in if you don't know - the document's encoding but you know Beautiful Soup's guess is - wrong. - - :param element_classes: A dictionary mapping BeautifulSoup - classes like Tag and NavigableString, to other classes you'd - like to be instantiated instead as the parse tree is - built. This is useful for subclassing Tag or NavigableString - to modify default behavior. - - :param kwargs: For backwards compatibility purposes, the - constructor accepts certain keyword arguments used in - Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4; they will result in a warning and then be - ignored. - - Apart from this, any keyword arguments passed into the - BeautifulSoup constructor are propagated to the TreeBuilder - constructor. This makes it possible to configure a - TreeBuilder by passing in arguments, not just by saying which - one to use. - """ - if 'convertEntities' in kwargs: - del kwargs['convertEntities'] - warnings.warn( - "BS4 does not respect the convertEntities argument to the " - "BeautifulSoup constructor. Entities are always converted " - "to Unicode characters.") - - if 'markupMassage' in kwargs: - del kwargs['markupMassage'] - warnings.warn( - "BS4 does not respect the markupMassage argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for any necessary markup massage.") - - if 'smartQuotesTo' in kwargs: - del kwargs['smartQuotesTo'] - warnings.warn( - "BS4 does not respect the smartQuotesTo argument to the " - "BeautifulSoup constructor. Smart quotes are always converted " - "to Unicode characters.") - - if 'selfClosingTags' in kwargs: - del kwargs['selfClosingTags'] - warnings.warn( - "BS4 does not respect the selfClosingTags argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for understanding self-closing tags.") - - if 'isHTML' in kwargs: - del kwargs['isHTML'] - warnings.warn( - "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. Suggest you use " - "features='lxml' for HTML and features='lxml-xml' for " - "XML.") - - def deprecated_argument(old_name, new_name): - if old_name in kwargs: - warnings.warn( - 'The "%s" argument to the BeautifulSoup constructor ' - 'has been renamed to "%s."' % (old_name, new_name)) - value = kwargs[old_name] - del kwargs[old_name] - return value - return None - - parse_only = parse_only or deprecated_argument( - "parseOnlyThese", "parse_only") - - from_encoding = from_encoding or deprecated_argument( - "fromEncoding", "from_encoding") - - if from_encoding and isinstance(markup, str): - warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") - from_encoding = None - - self.element_classes = element_classes or dict() - - # We need this information to track whether or not the builder - # was specified well enough that we can omit the 'you need to - # specify a parser' warning. - original_builder = builder - original_features = features - - if isinstance(builder, type): - # A builder class was passed in; it needs to be instantiated. - builder_class = builder - builder = None - elif builder is None: - if isinstance(features, str): - features = [features] - if features is None or len(features) == 0: - features = self.DEFAULT_BUILDER_FEATURES - builder_class = builder_registry.lookup(*features) - if builder_class is None: - raise FeatureNotFound( - "Couldn't find a tree builder with the features you " - "requested: %s. Do you need to install a parser library?" - % ",".join(features)) - - # At this point either we have a TreeBuilder instance in - # builder, or we have a builder_class that we can instantiate - # with the remaining **kwargs. - if builder is None: - builder = builder_class(**kwargs) - if not original_builder and not ( - original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES - ): - if builder.is_xml: - markup_type = "XML" - else: - markup_type = "HTML" - - # This code adapted from warnings.py so that we get the same line - # of code as our warnings.warn() call gets, even if the answer is wrong - # (as it may be in a multithreading situation). - caller = None - try: - caller = sys._getframe(1) - except ValueError: - pass - if caller: - globals = caller.f_globals - line_number = caller.f_lineno - else: - globals = sys.__dict__ - line_number= 1 - filename = globals.get('__file__') - if filename: - fnl = filename.lower() - if fnl.endswith((".pyc", ".pyo")): - filename = filename[:-1] - if filename: - # If there is no filename at all, the user is most likely in a REPL, - # and the warning is not necessary. - values = dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type - ) - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) - else: - if kwargs: - warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") - - self.builder = builder - self.is_xml = builder.is_xml - self.known_xml = self.is_xml - self._namespaces = dict() - self.parse_only = parse_only - - self.builder.initialize_soup(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - elif len(markup) <= 256 and ( - (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, str) and not '<' in markup) - ): - # Print out warnings for a couple beginner problems - # involving passing non-markup to Beautiful Soup. - # Beautiful Soup will still parse the input as markup, - # just in case that's what the user really wants. - if (isinstance(markup, str) - and not os.path.supports_unicode_filenames): - possible_filename = markup.encode("utf8") - else: - possible_filename = markup - is_file = False - try: - is_file = os.path.exists(possible_filename) - except Exception as e: - # This is almost certainly a problem involving - # characters not valid in filenames on this - # system. Just let it go. - pass - if is_file: - warnings.warn( - '"%s" looks like a filename, not markup. You should' - ' probably open this file and pass the filehandle into' - ' Beautiful Soup.' % self._decode_markup(markup) - ) - self._check_markup_is_url(markup) - - rejections = [] - success = False - for (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) in ( - self.builder.prepare_markup( - markup, from_encoding, exclude_encodings=exclude_encodings)): - self.reset() - try: - self._feed() - success = True - break - except ParserRejectedMarkup as e: - rejections.append(e) - pass - - if not success: - other_exceptions = [str(e) for e in rejections] - raise ParserRejectedMarkup( - "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) - ) - - # Clear out the markup and remove the builder's circular - # reference to this object. - self.markup = None - self.builder.soup = None - - def __copy__(self): - """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" - copy = type(self)( - self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' - ) - - # Although we encoded the tree to UTF-8, that may not have - # been the encoding of the original markup. Set the copy's - # .original_encoding to reflect the original object's - # .original_encoding. - copy.original_encoding = self.original_encoding - return copy - - def __getstate__(self): - # Frequently a tree builder can't be pickled. - d = dict(self.__dict__) - if 'builder' in d and not self.builder.picklable: - d['builder'] = None - return d - - @classmethod - def _decode_markup(cls, markup): - """Ensure `markup` is bytes so it's safe to send into warnings.warn. - - TODO: warnings.warn had this problem back in 2010 but it might not - anymore. - """ - if isinstance(markup, bytes): - decoded = markup.decode('utf-8', 'replace') - else: - decoded = markup - return decoded - - @classmethod - def _check_markup_is_url(cls, markup): - """Error-handling method to raise a warning if incoming markup looks - like a URL. - - :param markup: A string. - """ - if isinstance(markup, bytes): - space = b' ' - cant_start_with = (b"http:", b"https:") - elif isinstance(markup, str): - space = ' ' - cant_start_with = ("http:", "https:") - else: - return - - if any(markup.startswith(prefix) for prefix in cant_start_with): - if not space in markup: - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an' - ' HTTP client. You should probably use an HTTP client like' - ' requests to get the document behind the URL, and feed' - ' that document to Beautiful Soup.' % cls._decode_markup( - markup - ) - ) - - def _feed(self): - """Internal method that parses previously set markup, creating a large - number of Tag and NavigableString objects. - """ - # Convert the document to Unicode. - self.builder.reset() - - self.builder.feed(self.markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def reset(self): - """Reset this object to a state as though it had never parsed any - markup. - """ - Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) - self.hidden = 1 - self.builder.reset() - self.current_data = [] - self.currentTag = None - self.tagStack = [] - self.preserve_whitespace_tag_stack = [] - self.string_container_stack = [] - self.pushTag(self) - - def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, - sourceline=None, sourcepos=None, **kwattrs): - """Create a new Tag associated with this BeautifulSoup object.""" - kwattrs.update(attrs) - return self.element_classes.get(Tag, Tag)( - None, self.builder, name, namespace, nsprefix, kwattrs, - sourceline=sourceline, sourcepos=sourcepos - ) - - def string_container(self, base_class=None): - container = base_class or NavigableString - - # There may be a general override of NavigableString. - container = self.element_classes.get( - container, container - ) - - # On top of that, we may be inside a tag that needs a special - # container class. - if self.string_container_stack: - container = self.builder.string_containers.get( - self.string_container_stack[-1].name, container - ) - return container - - def new_string(self, s, subclass=None): - """Create a new NavigableString associated with this BeautifulSoup - object. - """ - container = self.string_container(subclass) - return container(s) - - def insert_before(self, successor): - """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement - it because there is nothing before or after it in the parse tree. - """ - raise NotImplementedError("BeautifulSoup objects don't support insert_before().") - - def insert_after(self, successor): - """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement - it because there is nothing before or after it in the parse tree. - """ - raise NotImplementedError("BeautifulSoup objects don't support insert_after().") - - def popTag(self): - """Internal method called by _popToTag when a tag is closed.""" - tag = self.tagStack.pop() - if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: - self.preserve_whitespace_tag_stack.pop() - if self.string_container_stack and tag == self.string_container_stack[-1]: - self.string_container_stack.pop() - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - """Internal method called by handle_starttag when a tag is opened.""" - #print "Push", tag.name - if self.currentTag is not None: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - if tag.name in self.builder.preserve_whitespace_tags: - self.preserve_whitespace_tag_stack.append(tag) - if tag.name in self.builder.string_containers: - self.string_container_stack.append(tag) - - def endData(self, containerClass=None): - """Method called by the TreeBuilder when the end of a data segment - occurs. - """ - containerClass = self.string_container(containerClass) - - if self.current_data: - current_data = ''.join(self.current_data) - # If whitespace is not preserved, and this string contains - # nothing but ASCII spaces, replace it with a single space - # or newline. - if not self.preserve_whitespace_tag_stack: - strippable = True - for i in current_data: - if i not in self.ASCII_SPACES: - strippable = False - break - if strippable: - if '\n' in current_data: - current_data = '\n' - else: - current_data = ' ' - - # Reset the data collector. - self.current_data = [] - - # Should we add this string to the tree at all? - if self.parse_only and len(self.tagStack) <= 1 and \ - (not self.parse_only.text or \ - not self.parse_only.search(current_data)): - return - - o = containerClass(current_data) - self.object_was_parsed(o) - - def object_was_parsed(self, o, parent=None, most_recent_element=None): - """Method called by the TreeBuilder to integrate an object into the parse tree.""" - if parent is None: - parent = self.currentTag - if most_recent_element is not None: - previous_element = most_recent_element - else: - previous_element = self._most_recent_element - - next_element = previous_sibling = next_sibling = None - if isinstance(o, Tag): - next_element = o.next_element - next_sibling = o.next_sibling - previous_sibling = o.previous_sibling - if previous_element is None: - previous_element = o.previous_element - - fix = parent.next_element is not None - - o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) - - self._most_recent_element = o - parent.contents.append(o) - - # Check if we are inserting into an already parsed node. - if fix: - self._linkage_fixer(parent) - - def _linkage_fixer(self, el): - """Make sure linkage of this fragment is sound.""" - - first = el.contents[0] - child = el.contents[-1] - descendant = child - - if child is first and el.parent is not None: - # Parent should be linked to first child - el.next_element = child - # We are no longer linked to whatever this element is - prev_el = child.previous_element - if prev_el is not None and prev_el is not el: - prev_el.next_element = None - # First child should be linked to the parent, and no previous siblings. - child.previous_element = el - child.previous_sibling = None - - # We have no sibling as we've been appended as the last. - child.next_sibling = None - - # This index is a tag, dig deeper for a "last descendant" - if isinstance(child, Tag) and child.contents: - descendant = child._last_descendant(False) - - # As the final step, link last descendant. It should be linked - # to the parent's next sibling (if found), else walk up the chain - # and find a parent with a sibling. It should have no next sibling. - descendant.next_element = None - descendant.next_sibling = None - target = el - while True: - if target is None: - break - elif target.next_sibling is not None: - descendant.next_element = target.next_sibling - target.next_sibling.previous_element = child - break - target = target.parent - - def _popToTag(self, name, nsprefix=None, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. - - :param name: Pop up to the most recent tag with this name. - :param nsprefix: The namespace prefix that goes with `name`. - :param inclusivePop: It this is false, pops the tag stack up - to but *not* including the most recent instqance of the - given tag. - """ - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - # The BeautifulSoup object itself can never be popped. - return - - most_recently_popped = None - - stack_size = len(self.tagStack) - for i in range(stack_size - 1, 0, -1): - t = self.tagStack[i] - if (name == t.name and nsprefix == t.prefix): - if inclusivePop: - most_recently_popped = self.popTag() - break - most_recently_popped = self.popTag() - - return most_recently_popped - - def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, - sourcepos=None): - """Called by the tree builder when a new tag is encountered. - - :param name: Name of the tag. - :param nsprefix: Namespace prefix for the tag. - :param attrs: A dictionary of attribute values. - :param sourceline: The line number where this tag was found in its - source document. - :param sourcepos: The character position within `sourceline` where this - tag was found. - - If this method returns None, the tag was rejected by an active - SoupStrainer. You should proceed as if the tag had not occurred - in the document. For instance, if this was a self-closing tag, - don't call handle_endtag. - """ - # print "Start tag %s: %s" % (name, attrs) - self.endData() - - if (self.parse_only and len(self.tagStack) <= 1 - and (self.parse_only.text - or not self.parse_only.search_tag(name, attrs))): - return None - - tag = self.element_classes.get(Tag, Tag)( - self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element, - sourceline=sourceline, sourcepos=sourcepos - ) - if tag is None: - return tag - if self._most_recent_element is not None: - self._most_recent_element.next_element = tag - self._most_recent_element = tag - self.pushTag(tag) - return tag - - def handle_endtag(self, name, nsprefix=None): - """Called by the tree builder when an ending tag is encountered. - - :param name: Name of the tag. - :param nsprefix: Namespace prefix for the tag. - """ - #print "End tag: " + name - self.endData() - self._popToTag(name, nsprefix) - - def handle_data(self, data): - """Called by the tree builder when a chunk of textual data is encountered.""" - self.current_data.append(data) - - def decode(self, pretty_print=False, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Returns a string or Unicode representation of the parse tree - as an HTML or XML document. - - :param pretty_print: If this is True, indentation will be used to - make the document more readable. - :param eventual_encoding: The encoding of the final document. - If this is None, the document will be a Unicode string. - """ - if self.is_xml: - # Print the XML declaration - encoding_part = '' - if eventual_encoding != None: - encoding_part = ' encoding="%s"' % eventual_encoding - prefix = '<?xml version="1.0"%s?>\n' % encoding_part - else: - prefix = '' - if not pretty_print: - indent_level = None - else: - indent_level = 0 - return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) - -# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' -_s = BeautifulSoup -_soup = BeautifulSoup - -class BeautifulStoneSoup(BeautifulSoup): - """Deprecated interface to an XML parser.""" - - def __init__(self, *args, **kwargs): - kwargs['features'] = 'xml' - warnings.warn( - 'The BeautifulStoneSoup class is deprecated. Instead of using ' - 'it, pass features="xml" into the BeautifulSoup constructor.') - super(BeautifulStoneSoup, self).__init__(*args, **kwargs) - - -class StopParsing(Exception): - """Exception raised by a TreeBuilder if it's unable to continue parsing.""" - pass - -class FeatureNotFound(ValueError): - """Exception raised by the BeautifulSoup constructor if no parser with the - requested features is found. - """ - pass - - -#If this file is run as a script, act as an HTML pretty-printer. -if __name__ == '__main__': - import sys - soup = BeautifulSoup(sys.stdin) - print(soup.prettify())