Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/future/backports/_markupbase.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 """Shared support for scanning document type declarations in HTML and XHTML. | |
2 | |
3 Backported for python-future from Python 3.3. Reason: ParserBase is an | |
4 old-style class in the Python 2.7 source of markupbase.py, which I suspect | |
5 might be the cause of sporadic unit-test failures on travis-ci.org with | |
6 test_htmlparser.py. The test failures look like this: | |
7 | |
8 ====================================================================== | |
9 | |
10 ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase) | |
11 | |
12 ---------------------------------------------------------------------- | |
13 | |
14 Traceback (most recent call last): | |
15 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement | |
16 [("starttag", "a", [("b", "&><\"'")])]) | |
17 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check | |
18 collector = self.get_collector() | |
19 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector | |
20 return EventCollector(strict=True) | |
21 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__ | |
22 html.parser.HTMLParser.__init__(self, *args, **kw) | |
23 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__ | |
24 self.reset() | |
25 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset | |
26 _markupbase.ParserBase.reset(self) | |
27 | |
28 TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead) | |
29 | |
30 This module is used as a foundation for the html.parser module. It has no | |
31 documented public API and should not be used directly. | |
32 | |
33 """ | |
34 | |
import re

# Bound ``match`` of a pattern for a declaration name token: a letter
# followed by letters, digits, '-', '_' or '.', plus trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Bound ``match`` of a pattern for a single- or double-quoted string
# literal, plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Comment terminator: "--", optional whitespace, then ">".
_commentclose = re.compile(r'--\s*>')
# Standard marked-section terminator: "]", "]", ">" with optional
# whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS Office marked-section terminator: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled patterns are kept; the ``re`` name itself is removed
# from the module namespace.
del re
48 | |
49 | |
class ParserBase(object):
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The scanning methods read the input buffer from ``self.rawdata`` and
    report results through ``self.handle_decl()``, ``self.handle_comment()``
    and ``self.unknown_decl()``.  Only ``unknown_decl()`` is defined here;
    the other attributes are expected to come from the subclass.

    Scanning methods return an index into ``rawdata`` on success and -1
    when the construct cannot be scanned (typically because the buffer ends
    before the construct does).  Problems are reported through
    ``self.error()``; if an overriding ``error()`` returns instead of
    raising, the scanning method stops and returns -1.
    """

    def __init__(self):
        """Refuse direct instantiation; ParserBase must be subclassed."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse problem; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over ``rawdata[i:j]`` and return ``j``."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` declaration starting at ``rawdata[i]``.

        Returns the index just past the closing ``>``, or -1 if the
        declaration could not be scanned to its end.  A ``doctype``
        declaration is passed to ``handle_decl()``; any other named
        declaration is passed to ``unknown_decl()``.  Comments and marked
        sections are delegated to ``parse_comment()`` and
        ``parse_marked_section()``.

        Raises AssertionError if ``rawdata[i:i+2]`` is not ``"<!"``.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        if rawdata[i:j] != "<!":
            # Raised explicitly (rather than via ``assert``) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("unexpected call to parse_declaration")
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  The two-character slice only
            # equals "-" when the buffer ends right after the dash, so a
            # complete "<!--" still reaches the comment branch below.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # error() returned: stop instead of spinning on the same char.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # error() returned: j was not advanced, so bail out.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section ``<![...`` starting at ``rawdata[i]``.

        Returns the index just past the section terminator, or -1 if the
        terminator is not in the buffer or the status keyword is unknown
        and ``error()`` returned.  When ``report`` is true, the text
        between ``<![`` and the terminator is passed to ``unknown_decl()``.

        Raises AssertionError if ``rawdata[i:i+3]`` is not ``"<!["``.
        """
        rawdata = self.rawdata
        if rawdata[i:i+3] != '<![':
            # Explicit raise so the check is not stripped under -O.
            raise AssertionError("unexpected call to parse_marked_section()")
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # error() returned: there is no terminator pattern to search for.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment
    def parse_comment(self, i, report=1):
        """Parse the comment ``<!--...-->`` starting at ``rawdata[i]``.

        Returns the index just past the comment terminator, or -1 if the
        comment is not terminated in the buffer.  When ``report`` is true,
        the comment body is passed to ``handle_comment()``.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
            # error() returned: do not scan from a position that is not a comment.
            return -1
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset starting at ``rawdata[i]``.

        Returns the index of the ``>`` that follows the closing ``]``
        (after optional whitespace), or -1 if the subset could not be
        scanned to that point.  ``declstartpos`` is the start of the
        enclosing declaration and is used for position/error reporting.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (also covers j + 2 == n)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # error() returned: there is no handler for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
                return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # error() returned: j was not advanced, so bail out.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Return the index just past the ``>`` ending an ELEMENT
        declaration whose name starts at ``rawdata[i]``, or -1."""
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Return the index just past the ``>`` ending an ATTLIST
        declaration whose name starts at ``rawdata[i]``, or -1."""
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if not c:
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Return the index just past the ``>`` ending a NOTATION
        declaration whose name starts at ``rawdata[i]``, or -1."""
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Return the index just past the ``>`` ending an ENTITY
        declaration whose body starts at ``rawdata[i]``, or -1."""
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # skip the '%' and any whitespace after it
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token
    def _scan_name(self, i, declstartpos):
        """Scan a name token starting at ``rawdata[i]``.

        Returns ``(name, end)`` where ``name`` is the lower-cased token and
        ``end`` is the index just past the token and its trailing
        whitespace.  Returns ``(None, -1)`` if the buffer ends at ``i`` or
        at the end of the token, or if no name is found there and
        ``error()`` returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # error() returned: keep the (name, index) contract so callers
        # that unpack the result do not fail with a TypeError.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook for declarations with no dedicated handler; no-op here."""
        pass