comparison env/lib/python3.7/site-packages/bs4/builder/_lxml.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # Use of this source code is governed by the MIT license.
2 __license__ = "MIT"
3
4 __all__ = [
5 'LXMLTreeBuilderForXML',
6 'LXMLTreeBuilder',
7 ]
8
9 try:
10 from collections.abc import Callable # Python 3.6
11 except ImportError as e:
12 from collections import Callable
13
14 from io import BytesIO
15 from io import StringIO
16 from lxml import etree
17 from bs4.element import (
18 Comment,
19 Doctype,
20 NamespacedAttribute,
21 ProcessingInstruction,
22 XMLProcessingInstruction,
23 )
24 from bs4.builder import (
25 FAST,
26 HTML,
27 HTMLTreeBuilder,
28 PERMISSIVE,
29 ParserRejectedMarkup,
30 TreeBuilder,
31 XML)
32 from bs4.dammit import EncodingDetector
33
# Feature name shared by both builders below; advertised in their
# 'features' lists so BeautifulSoup(..., features='lxml') finds them.
LXML = 'lxml'
35
36 def _invert(d):
37 "Invert a dictionary."
38 return dict((v,k) for k, v in list(d.items()))
39
class LXMLTreeBuilderForXML(TreeBuilder):
    """Build a BeautifulSoup tree using lxml's XML parser.

    An instance of this class acts as the ``target`` of an
    ``etree.XMLParser``, so lxml drives the build by invoking the event
    callbacks defined below (``start``, ``end``, ``pi``, ``data``,
    ``doctype``, ``comment``, ``close``).
    """

    # Parser class used when the caller does not supply one.
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in pieces of this many bytes/characters.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object, or a callable that creates
            one; when None, a default XMLParser is built per encoding by
            default_parser().
        :param empty_element_tags: Tag names to treat as empty-element
            tags; stored as a set when provided.
        :param kwargs: Passed through to the TreeBuilder constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted namespace mappings (URI -> prefix). One
        # entry is pushed per tag in start() and popped in end() so we
        # know when a declared prefix goes out of scope.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        """Split a Clark-notation tag name '{uri}local' into the tuple
        (namespace URI, local name); namespace is None when absent."""
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Run incoming markup through the lxml parser, chunk by chunk.

        :param markup: A bytestring, string, or open filehandle.
        :raise ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self):
        """lxml 'close' event: parsing is finished, so reset the
        namespace stack to its initial single-entry state."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        """lxml 'start' event: an opening tag was seen.

        :param name: Tag name, possibly in Clark notation '{uri}local'.
        :param attrs: The tag's attributes (may be an immutable proxy).
        :param nsmap: Namespace prefixes newly declared on this tag.
            (The default {} is never mutated, only read and copied.)
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search the namespace stack innermost-first; None entries are
        # placeholders pushed by tags that declared no namespaces.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """lxml 'end' event: a closing tag was seen."""
        self.soup.endData()
        # NOTE(review): completed_tag is assigned but never used here.
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Find the currently active prefix for this namespace.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """lxml event: a processing instruction was seen; store it as a
        single node containing 'target data'."""
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """lxml event: textual content was seen."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """lxml event: a DOCTYPE declaration was seen."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
306
307
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML flavor of the lxml tree builder.

    Inherits all of the event-handling machinery from
    LXMLTreeBuilderForXML, but parses markup with lxml's HTML parser
    and feeds it in a single shot rather than in chunks.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """HTML is always parsed with etree.HTMLParser. Returned as a
        class; parser_for() instantiates it with default arguments."""
        return etree.HTMLParser

    def feed(self, markup):
        """Hand the entire markup to a newly created HTML parser.

        :param markup: Some markup to parse.
        :raise ParserRejectedMarkup: If lxml cannot handle the markup.
        """
        try:
            parser = self.parser_for(self.soup.original_encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return f'<html><body>{fragment}</body></html>'