Mercurial repository: guerler / springsuite

Comparison of planemo/lib/python3.7/site-packages/lxml/html/soupparser.py @ 1:56ad4e20f292 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| field | value |
|---|---|
| author | guerler |
| date | Fri, 31 Jul 2020 00:32:28 -0400 |
| parents | |
| children | |
Comparing revisions 0:d30785e31577 and 1:56ad4e20f292; the resulting content of soupparser.py is shown below.
| 1 """External interface to the BeautifulSoup HTML parser. | |
| 2 """ | |
| 3 | |
| 4 __all__ = ["fromstring", "parse", "convert_tree"] | |
| 5 | |
| 6 import re | |
| 7 from lxml import etree, html | |
| 8 | |
| 9 try: | |
| 10 from bs4 import ( | |
| 11 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, | |
| 12 Declaration, Doctype) | |
| 13 _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) | |
| 14 except ImportError: | |
| 15 from BeautifulSoup import ( | |
| 16 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, | |
| 17 Declaration) | |
| 18 _DECLARATION_OR_DOCTYPE = Declaration | |
| 19 | |
| 20 | |
| 21 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): | |
| 22 """Parse a string of HTML data into an Element tree using the | |
| 23 BeautifulSoup parser. | |
| 24 | |
| 25 Returns the root ``<html>`` Element of the tree. | |
| 26 | |
| 27 You can pass a different BeautifulSoup parser through the | |
| 28 `beautifulsoup` keyword, and a diffent Element factory function | |
| 29 through the `makeelement` keyword. By default, the standard | |
| 30 ``BeautifulSoup`` class and the default factory of `lxml.html` are | |
| 31 used. | |
| 32 """ | |
| 33 return _parse(data, beautifulsoup, makeelement, **bsargs) | |
| 34 | |
| 35 | |
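The `fromstring()` entry point above can be exercised directly; a minimal usage sketch, assuming lxml and BeautifulSoup 4 are installed in the environment:

```python
from lxml import etree
from lxml.html import soupparser

# Soup without an <html> wrapper still comes back rooted in one.
root = soupparser.fromstring("<p>Hello <b>soup</b>!")
print(root.tag)              # 'html'
print(etree.tostring(root))  # e.g. b'<html><p>Hello <b>soup</b>!</p></html>'
```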
```python
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        file = open(file)
    root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
```
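`parse()` accepts either a file name or an open file-like object, as the `hasattr(file, 'read')` check above shows. A small sketch using an in-memory file:

```python
import io
from lxml.html import soupparser

tree = soupparser.parse(io.StringIO("<html><body><p>text</p></body></html>"))
print(tree.getroot().tag)  # 'html' -- parse() returns an ElementTree
```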
```python
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    children = root.getchildren()
    for child in children:
        root.remove(child)
    return children
```
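`convert_tree()` works on an already-built BeautifulSoup tree and returns one Element per top-level node; a sketch, assuming bs4 is installed:

```python
from bs4 import BeautifulSoup
from lxml.html import soupparser

soup = BeautifulSoup("<h1>Title</h1><p>Body</p>", "html.parser")
elements = soupparser.convert_tree(soup)
print([el.tag for el in elements])  # ['h1', 'p']
```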
```python
# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if hasattr(beautifulsoup, "HTML_ENTITIES"):  # bs3
        if 'convertEntities' not in bsargs:
            bsargs['convertEntities'] = 'html'
    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
        if 'features' not in bsargs:
            bsargs['features'] = 'html.parser'  # use Python html parser
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root
```
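The wrapping logic at the end of `_parse()` means that soup with several top-level elements and no `<html>` tag is still returned under a single synthetic root. A sketch, where the printed output assumes the default bs4 `html.parser` builder:

```python
from lxml.html import soupparser

root = soupparser.fromstring("<p>one</p><p>two</p>")
print(root.tag)                       # 'html'
print([child.tag for child in root])  # ['p', 'p']
```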
```python
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match
```
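A quick look at what `_parse_doctype_declaration` captures; the pattern is reproduced inline so the sketch stays self-contained:

```python
import re

# Same pattern as _parse_doctype_declaration above.
match = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match

m = match('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"'
          ' "http://www.w3.org/TR/html4/strict.dtd">')
print(m.groups())
# ('"-//W3C//DTD HTML 4.01//EN"', '"http://www.w3.org/TR/html4/strict.dtd"')
# The quotes are kept by the pattern; _convert_tree() strips them with [1:-1].
```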
```python
class _PseudoTag:
    # Minimal imitation of BeautifulSoup.Tag
    def __init__(self, contents):
        self.name = 'html'
        self.attrs = []
        self.contents = contents

    def __iter__(self):
        return self.contents.__iter__()


def _convert_tree(beautiful_soup_tree, makeelement):
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all</p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root
```
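`_convert_tree()` attaches pre-root nodes as preceding siblings of the result root and copies a recognized DOCTYPE into the tree's `docinfo`. A sketch of that behaviour (exact values may vary with the installed bs4/lxml versions):

```python
from lxml.html import soupparser

root = soupparser.fromstring(
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">'
    '<!-- leading comment --><html><body><p>hi</p></body></html>')

print(root.getprevious().text)               # ' leading comment '
print(root.getroottree().docinfo.public_id)  # '-//W3C//DTD HTML 4.01//EN'
```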
```python
def _init_node_converters(makeelement):
    converters = {}
    ordered_node_types = []

    def converter(*types):
        def add(handler):
            for t in types:
                converters[t] = handler
                ordered_node_types.append(t)
            return handler
        return add

    def find_best_converter(node):
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                if isinstance(v, list):
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            attribs = dict((k, unescape(v)) for k, v in bs_attrs)
        return attribs

    def append_text(parent, text):
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node
```
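The `NavigableString` converter and `append_text()` above merge text nodes into the `.text`/`.tail` slots of the resulting lxml Elements; a sketch of the structure this produces:

```python
from lxml.html import soupparser

root = soupparser.fromstring("<p>Hello <b>bold</b> tail &amp; more</p>")
p = root[0]
print(p.text)     # 'Hello '
print(p[0].tag)   # 'b'
print(p[0].tail)  # ' tail & more'
```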
```python
# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint


handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # Python 3
    unichr = chr


def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)
```
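`unescape()` only rewrites named entities it knows from `name2codepoint` and leaves everything else untouched; a sketch importing the helper (it is module-internal, so the import is for illustration only):

```python
from lxml.html.soupparser import unescape

print(unescape("&hellip;"))     # '\u2026' (horizontal ellipsis)
print(unescape("&bogusname;"))  # '&bogusname;' -- unknown names pass through
print(unescape(""))             # '' -- falsy input yields an empty string
```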
