diff planemo/lib/python3.7/site-packages/lxml/html/html5parser.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author   | guerler
date     | Fri, 31 Jul 2020 00:32:28 -0400
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/lxml/html/html5parser.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,260 @@
+"""
+An interface to html5lib that mimics the lxml.html interface.
+"""
+import sys
+import string
+
+from html5lib import HTMLParser as _HTMLParser
+from html5lib.treebuilders.etree_lxml import TreeBuilder
+from lxml import etree
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
+
+# python3 compatibility
+try:
+    _strings = basestring
+except NameError:
+    _strings = (bytes, str)
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
+
+
+class HTMLParser(_HTMLParser):
+    """An html5lib HTML parser with lxml as tree."""
+
+    def __init__(self, strict=False, **kwargs):
+        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+
+try:
+    from html5lib import XHTMLParser as _XHTMLParser
+except ImportError:
+    pass
+else:
+    class XHTMLParser(_XHTMLParser):
+        """An html5lib XHTML Parser with lxml as tree."""
+
+        def __init__(self, strict=False, **kwargs):
+            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+    xhtml_parser = XHTMLParser()
+
+
+def _find_tag(tree, tag):
+    elem = tree.find(tag)
+    if elem is not None:
+        return elem
+    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
+
+
+def document_fromstring(html, guess_charset=None, parser=None):
+    """
+    Parse a whole document into a string.
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+
+    if parser is None:
+        parser = html_parser
+
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = True
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+                         guess_charset=None, parser=None):
+    """Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string. If no_leading_text is true,
+    then it will be an error if there is leading text, and it will always be
+    a list of only elements.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+
+    if parser is None:
+        parser = html_parser
+
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = False
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    children = parser.parseFragment(html, 'div', **options)
+    if children and isinstance(children[0], _strings):
+        if no_leading_text:
+            if children[0].strip():
+                raise etree.ParserError('There is leading text: %r' %
+                                        children[0])
+            del children[0]
+    return children
+
+
+def fragment_fromstring(html, create_parent=False,
+                        guess_charset=None, parser=None):
+    """Parses a single HTML element; it is an error if there is more than
+    one element, or if anything but whitespace precedes or follows the
+    element.
+
+    If 'create_parent' is true (or is a tag name) then a parent node
+    will be created to encapsulate the HTML in a single element. In
+    this case, leading or trailing text is allowed.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+
+    accept_leading_text = bool(create_parent)
+
+    elements = fragments_fromstring(
+        html, guess_charset=guess_charset, parser=parser,
+        no_leading_text=not accept_leading_text)
+
+    if create_parent:
+        if not isinstance(create_parent, _strings):
+            create_parent = 'div'
+        new_root = Element(create_parent)
+        if elements:
+            if isinstance(elements[0], _strings):
+                new_root.text = elements[0]
+                del elements[0]
+            new_root.extend(elements)
+        return new_root
+
+    if not elements:
+        raise etree.ParserError('No elements found')
+    if len(elements) > 1:
+        raise etree.ParserError('Multiple elements found')
+    result = elements[0]
+    if result.tail and result.tail.strip():
+        raise etree.ParserError('Element followed by text: %r' % result.tail)
+    result.tail = None
+    return result
+
+
+def fromstring(html, guess_charset=None, parser=None):
+    """Parse the html, returning a single element/document.
+
+    This tries to minimally parse the chunk of text, without knowing if it
+    is a fragment or a document.
+
+    'base_url' will set the document's base_url attribute (and the tree's
+    docinfo.URL)
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+    doc = document_fromstring(html, parser=parser,
+                              guess_charset=guess_charset)
+
+    # document starts with doctype or <html>, full document!
+    start = html[:50]
+    if isinstance(start, bytes):
+        # Allow text comparison in python3.
+        # Decode as ascii, that also covers latin-1 and utf-8 for the
+        # characters we need.
+        start = start.decode('ascii', 'replace')
+
+    start = start.lstrip().lower()
+    if start.startswith('<html') or start.startswith('<!doctype'):
+        return doc
+
+    head = _find_tag(doc, 'head')
+
+    # if the head is not empty we have a full document
+    if len(head):
+        return doc
+
+    body = _find_tag(doc, 'body')
+
+    # The body has just one element, so it was probably a single
+    # element passed in
+    if (len(body) == 1 and (not body.text or not body.text.strip())
+            and (not body[-1].tail or not body[-1].tail.strip())):
+        return body[0]
+
+    # Now we have a body which represents a bunch of tags which have the
+    # content that was passed in. We will create a fake container, which
+    # is the body tag, except <body> implies too much structure.
+    if _contains_block_level_tag(body):
+        body.tag = 'div'
+    else:
+        body.tag = 'span'
+    return body
+
+
+def parse(filename_url_or_file, guess_charset=None, parser=None):
+    """Parse a filename, URL, or file-like object into an HTML document
+    tree. Note: this returns a tree, not an element. Use
+    ``parse(...).getroot()`` to get the document root.
+
+    If ``guess_charset`` is true, the ``useChardet`` option is passed into
+    html5lib to enable character detection. This option is on by default
+    when parsing from URLs, off by default when parsing from file(-like)
+    objects (which tend to return Unicode more often than not), and on by
+    default when parsing from a file path (which is read in binary mode).
+ """ + if parser is None: + parser = html_parser + if not isinstance(filename_url_or_file, _strings): + fp = filename_url_or_file + if guess_charset is None: + # assume that file-like objects return Unicode more often than bytes + guess_charset = False + elif _looks_like_url(filename_url_or_file): + fp = urlopen(filename_url_or_file) + if guess_charset is None: + # assume that URLs return bytes + guess_charset = True + else: + fp = open(filename_url_or_file, 'rb') + if guess_charset is None: + guess_charset = True + + options = {} + # html5lib does not accept useChardet as an argument, if it + # detected the html argument would produce unicode objects. + if guess_charset: + options['useChardet'] = guess_charset + return parser.parse(fp, **options) + + +def _looks_like_url(str): + scheme = urlparse(str)[0] + if not scheme: + return False + elif (sys.platform == 'win32' and + scheme in string.ascii_letters + and len(scheme) == 1): + # looks like a 'normal' absolute path + return False + else: + return True + + +html_parser = HTMLParser()