Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/lxml/html/html5parser.py @ 2:6af9afd405e9 draft
"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
| author | shellac |
|---|---|
| date | Thu, 14 May 2020 14:56:58 -0400 |
| parents | 26e78fe6e8c4 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:75ca89e9b81c | 2:6af9afd405e9 |
|---|---|
| 1 """ | |
| 2 An interface to html5lib that mimics the lxml.html interface. | |
| 3 """ | |
| 4 import sys | |
| 5 import string | |
| 6 | |
| 7 from html5lib import HTMLParser as _HTMLParser | |
| 8 from html5lib.treebuilders.etree_lxml import TreeBuilder | |
| 9 from lxml import etree | |
| 10 from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag | |
| 11 | |
# Python 2 / Python 3 compatibility shims.
# Each name is first looked up under its Python 2 spelling and falls back
# to the Python 3 equivalent when that lookup fails.
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse
try:
    # Types accepted wherever this module requires "a string".
    _strings = basestring
except NameError:
    _strings = (bytes, str)
| 25 | |
| 26 | |
class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, configured to build its tree with lxml.

    ``strict`` and any extra keyword arguments are forwarded unchanged to
    the html5lib parser; the tree builder is always ``TreeBuilder`` from
    ``html5lib.treebuilders.etree_lxml``.
    """

    def __init__(self, strict=False, **kwargs):
        super(HTMLParser, self).__init__(
            strict=strict, tree=TreeBuilder, **kwargs)
| 32 | |
| 33 | |
# XHTMLParser is optional: when html5lib cannot provide it, neither the
# class nor the ``xhtml_parser`` instance below is defined.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, configured to build its tree with lxml.

        ``strict`` and any extra keyword arguments are forwarded unchanged
        to the html5lib parser.
        """

        def __init__(self, strict=False, **kwargs):
            super(XHTMLParser, self).__init__(
                strict=strict, tree=TreeBuilder, **kwargs)

    # Ready-made module-level XHTML parser instance.
    xhtml_parser = XHTMLParser()
| 46 | |
| 47 | |
def _find_tag(tree, tag):
    """Return the first `tag` child of `tree`, with or without namespace.

    The plain tag name is tried first; if nothing matches, the lookup is
    repeated with the tag qualified by the XHTML namespace.  Returns
    ``None`` when neither form is found.
    """
    match = tree.find(tag)
    if match is None:
        match = tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return match
| 53 | |
| 54 | |
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string containing a whole document and return its root element.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises TypeError if `html` is not a string.  When `parser` is None
    the module-level ``html_parser`` is used.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    if guess_charset is None and isinstance(html, bytes):
        # Byte input with no explicit preference: turn charset guessing on.
        guess_charset = True

    # 'useChardet' is only forwarded once a value has been decided on;
    # html5lib does not accept that argument when the input would
    # produce unicode objects.
    parse_kwargs = {}
    if guess_charset is not None:
        parse_kwargs['useChardet'] = guess_charset

    document = active_parser.parse(html, **parse_kwargs)
    return document.getroot()
| 77 | |
| 78 | |
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements and return them as a list.

    The first item of the list may be a string (the leading text).  With
    `no_leading_text` set, non-whitespace leading text raises
    ``etree.ParserError`` and whitespace-only leading text is dropped, so
    the result then contains elements only.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises TypeError if `html` is not a string.  When `parser` is None
    the module-level ``html_parser`` is used.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    if guess_charset is None and isinstance(html, bytes):
        # Byte input with no explicit preference: charset guessing is
        # explicitly switched off for fragments.
        guess_charset = False

    # 'useChardet' is only forwarded once a value has been decided on;
    # html5lib does not accept that argument when the input would
    # produce unicode objects.
    parse_kwargs = {}
    if guess_charset is not None:
        parse_kwargs['useChardet'] = guess_charset

    # Fragments are parsed as if they were the content of a <div>.
    children = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(children) and isinstance(children[0], _strings)
    if starts_with_text and no_leading_text:
        leading = children[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' %
                                    leading)
        del children[0]
    return children
| 111 | |
| 112 | |
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse exactly one HTML element out of `html`.

    Without `create_parent`, ``etree.ParserError`` is raised when no
    element or more than one element is found, or when non-whitespace
    text precedes or follows the element.

    If `create_parent` is true (or is a tag name), a parent element is
    created to hold everything that was parsed ('div' unless a tag name
    was given).  In that case leading and trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Leading text is only acceptable when a wrapper element can hold it.
    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not create_parent)

    if create_parent:
        parent_tag = create_parent
        if not isinstance(parent_tag, _strings):
            parent_tag = 'div'
        wrapper = Element(parent_tag)
        if elements and isinstance(elements[0], _strings):
            # A leading string becomes the text of the wrapper element.
            wrapper.text = elements[0]
            del elements[0]
        wrapper.extend(elements)
        return wrapper

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')

    result = elements[0]
    trailing = result.tail
    if trailing and trailing.strip():
        raise etree.ParserError('Element followed by text: %r' % trailing)
    # Whatever tail remains is whitespace only; discard it.
    result.tail = None
    return result
| 155 | |
| 156 | |
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The input is always parsed as a document
    first; the result is then narrowed down:

    * input starting with ``<html`` or a doctype, or producing a non-empty
      head, yields the document root;
    * a body holding a single element and no surrounding text yields that
      element;
    * otherwise the body itself is returned, retagged as 'div' or 'span'.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, guess_charset=guess_charset,
                              parser=parser)

    # Only the first 50 characters are inspected for a document marker.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()

    # document starts with doctype or <html>, full document!
    if prefix.startswith(('<html', '<!doctype')):
        return doc

    # if the head is not empty we have a full document
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if len(body) == 1:
        only_child = body[0]
        text_before = body.text and body.text.strip()
        if not text_before:
            text_after = only_child.tail and only_child.tail.strip()
            if not text_after:
                return only_child

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
| 209 | |
| 210 | |
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A stream that this function opens itself (from a file path or a URL)
    is closed again once parsing has finished or failed.  A file-like
    object supplied by the caller is left open; closing it remains the
    caller's responsibility.

    When `parser` is None the module-level ``html_parser`` is used.
    """
    if parser is None:
        parser = html_parser

    # Only streams opened here are closed here.
    owns_stream = isinstance(filename_url_or_file, _strings)
    if not owns_stream:
        fp = filename_url_or_file
        # assume that file-like objects return Unicode more often than bytes
        default_guess = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        # assume that URLs return bytes
        default_guess = True
    else:
        fp = open(filename_url_or_file, 'rb')
        default_guess = True

    try:
        if guess_charset is None:
            guess_charset = default_guess

        options = {}
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        if guess_charset:
            options['useChardet'] = guess_charset
        return parser.parse(fp, **options)
    finally:
        if owns_stream:
            # try/finally rather than ``with``: the Python 2 urlopen
            # result may not support the context manager protocol.
            fp.close()
| 245 | |
| 246 | |
def _looks_like_url(str):
    """Return True if `str` should be opened as a URL, not a file path.

    A value counts as a URL when ``urlparse`` finds a scheme in it.  The
    one exception is a single-letter scheme on Windows, which is treated
    as the drive letter of an ordinary absolute path.
    """
    scheme = urlparse(str).scheme
    if not scheme:
        return False
    # looks like a 'normal' absolute path
    is_drive_letter = (sys.platform == 'win32'
                       and scheme in string.ascii_letters
                       and len(scheme) == 1)
    return not is_drive_letter
| 258 | |
| 259 | |
# Default parser instance; the parsing functions in this module fall back
# to it whenever they are called with ``parser=None``.
html_parser = HTMLParser()
