Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/bs4/builder/_htmlparser.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/bs4/builder/_htmlparser.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,439 +0,0 @@ -# encoding: utf-8 -"""Use the HTMLParser library to parse HTML files that aren't too bad.""" - -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'HTMLParserTreeBuilder', - ] - -from html.parser import HTMLParser - -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - -import sys -import warnings - -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - -from bs4.element import ( - CData, - Comment, - Declaration, - Doctype, - ProcessingInstruction, - ) -from bs4.dammit import EntitySubstitution, UnicodeDammit - -from bs4.builder import ( - HTML, - HTMLTreeBuilder, - STRICT, - ) - - -HTMLPARSER = 'html.parser' - -class BeautifulSoupHTMLParser(HTMLParser): - """A subclass of the Python standard library's HTMLParser class, which - listens for HTMLParser events and translates them into calls - to Beautiful Soup's tree construction API. - """ - - def __init__(self, *args, **kwargs): - HTMLParser.__init__(self, *args, **kwargs) - - # Keep a list of empty-element tags that were encountered - # without an explicit closing tag. 
If we encounter a closing tag - # of this type, we'll associate it with one of those entries. - # - # This isn't a stack because we don't care about the - # order. It's a list of closing tags we've already handled and - # will ignore, assuming they ever show up. - self.already_closed_empty_element = [] - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although - this requirement doesn't appear to be documented. - - In Python 2, HTMLParser implements error() by raising an exception, - which we don't want to do. - - In any event, this method is called only on very strange - markup and our best strategy is to pretend it didn't happen - and keep going. - """ - warnings.warn(msg) - - def handle_startendtag(self, name, attrs): - """Handle an incoming empty-element tag. - - This is only called when the markup looks like <tag/>. - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - """ - # is_startend() tells handle_starttag not to close the tag - # just because its name matches a known empty-element tag. We - # know that this is an empty-element tag and we want to call - # handle_endtag ourselves. - tag = self.handle_starttag(name, attrs, handle_empty_element=False) - self.handle_endtag(name) - - def handle_starttag(self, name, attrs, handle_empty_element=True): - """Handle an opening tag, e.g. '<tag>' - - :param name: Name of the tag. - :param attrs: Dictionary of the tag's attributes. - :param handle_empty_element: True if this tag is known to be - an empty-element tag (i.e. there is not expected to be any - closing tag). - """ - # XXX namespace - attr_dict = {} - for key, value in attrs: - # Change None attribute values to the empty string - # for consistency with the other tree builders. 
- if value is None: - value = '' - attr_dict[key] = value - attrvalue = '""' - #print "START", name - sourceline, sourcepos = self.getpos() - tag = self.soup.handle_starttag( - name, None, None, attr_dict, sourceline=sourceline, - sourcepos=sourcepos - ) - if tag and tag.is_empty_element and handle_empty_element: - # Unlike other parsers, html.parser doesn't send separate end tag - # events for empty-element tags. (It's handled in - # handle_startendtag, but only if the original markup looked like - # <tag/>.) - # - # So we need to call handle_endtag() ourselves. Since we - # know the start event is identical to the end event, we - # don't want handle_endtag() to cross off any previous end - # events for tags of this name. - self.handle_endtag(name, check_already_closed=False) - - # But we might encounter an explicit closing tag for this tag - # later on. If so, we want to ignore it. - self.already_closed_empty_element.append(name) - - def handle_endtag(self, name, check_already_closed=True): - """Handle a closing tag, e.g. '</tag>' - - :param name: A tag name. - :param check_already_closed: True if this tag is expected to - be the closing portion of an empty-element tag, - e.g. '<tag></tag>'. - """ - #print "END", name - if check_already_closed and name in self.already_closed_empty_element: - # This is a redundant end tag for an empty-element tag. - # We've already called handle_endtag() for it, so just - # check it off the list. - # print "ALREADY CLOSED", name - self.already_closed_empty_element.remove(name) - else: - self.soup.handle_endtag(name) - - def handle_data(self, data): - """Handle some textual data that shows up between tags.""" - self.soup.handle_data(data) - - def handle_charref(self, name): - """Handle a numeric character reference by converting it to the - corresponding Unicode character and treating it as textual - data. - - :param name: Character number, possibly in hexadecimal. - """ - # XXX workaround for a bug in HTMLParser. 
Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 - if name.startswith('x'): - real_name = int(name.lstrip('x'), 16) - elif name.startswith('X'): - real_name = int(name.lstrip('X'), 16) - else: - real_name = int(name) - - data = None - if real_name < 256: - # HTML numeric entities are supposed to reference Unicode - # code points, but sometimes they reference code points in - # some other encoding (ahem, Windows-1252). E.g. &#147; - # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This - # code tries to detect this situation and compensate. - for encoding in (self.soup.original_encoding, 'windows-1252'): - if not encoding: - continue - try: - data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError as e: - pass - if not data: - try: - data = chr(real_name) - except (ValueError, OverflowError) as e: - pass - data = data or "\N{REPLACEMENT CHARACTER}" - self.handle_data(data) - - def handle_entityref(self, name): - """Handle a named entity reference by converting it to the - corresponding Unicode character and treating it as textual - data. - - :param name: Name of the entity reference. - """ - character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) - if character is not None: - data = character - else: - # If this were XML, it would be ambiguous whether "&foo" - # was an character entity reference with a missing - # semicolon or the literal string "&foo". Since this is - # HTML, we have a complete list of all character entity references, - # and this one wasn't found, so assume it's the literal string "&foo". - data = "&%s" % name - self.handle_data(data) - - def handle_comment(self, data): - """Handle an HTML comment. - - :param data: The text of the comment. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(Comment) - - def handle_decl(self, data): - """Handle a DOCTYPE declaration. - - :param data: The text of the declaration.
- """ - self.soup.endData() - data = data[len("DOCTYPE "):] - self.soup.handle_data(data) - self.soup.endData(Doctype) - - def unknown_decl(self, data): - """Handle a declaration of unknown type -- probably a CDATA block. - - :param data: The text of the declaration. - """ - if data.upper().startswith('CDATA['): - cls = CData - data = data[len('CDATA['):] - else: - cls = Declaration - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(cls) - - def handle_pi(self, data): - """Handle a processing instruction. - - :param data: The text of the instruction. - """ - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(ProcessingInstruction) - - -class HTMLParserTreeBuilder(HTMLTreeBuilder): - """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, - found in the Python standard library. - """ - is_xml = False - picklable = True - NAME = HTMLPARSER - features = [NAME, HTML, STRICT] - - # The html.parser knows which line number and position in the - # original file is the source of an element. - TRACKS_LINE_NUMBERS = True - - def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): - """Constructor. - - :param parser_args: Positional arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param parser_kwargs: Keyword arguments to pass into - the BeautifulSoupHTMLParser constructor, once it's - invoked. - :param kwargs: Keyword arguments for the superclass constructor. 
- """ - super(HTMLParserTreeBuilder, self).__init__(**kwargs) - parser_args = parser_args or [] - parser_kwargs = parser_kwargs or {} - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - parser_kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - parser_kwargs['convert_charrefs'] = False - self.parser_args = (parser_args, parser_kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - """ - if isinstance(markup, str): - # Parse Unicode as-is. - yield (markup, None, None, False) - return - - # Ask UnicodeDammit to sniff the most likely encoding. - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True, - exclude_encodings=exclude_encodings) - yield (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. 
- """ - args, kwargs = self.parser_args - parser = BeautifulSoupHTMLParser(*args, **kwargs) - parser.soup = self.soup - try: - parser.feed(markup) - parser.close() - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e - parser.already_closed_empty_element = [] - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like <p></p> as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? 
- ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: <span attr="value" /> - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - 
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True