comparison env/lib/python3.7/site-packages/soupsieve/css_parser.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
comparison
equal deleted inserted replaced
4:79f47841a781 5:9b1c78e6ba9c
1 """CSS selector parser."""
2 import re
3 from functools import lru_cache
4 from . import util
5 from . import css_match as cm
6 from . import css_types as ct
7 from .util import SelectorSyntaxError
8
# Code point of the Unicode replacement character (U+FFFD)
UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Pseudo-classes written without parentheses that the parser knows how to translate
PSEUDO_SIMPLE = {
    ':any-link',
    ':checked',
    ':default',
    ':defined',
    ':disabled',
    ':empty',
    ':enabled',
    ':first-child',
    ':first-of-type',
    ':in-range',
    ':indeterminate',
    ':last-child',
    ':last-of-type',
    ':link',
    ':only-child',
    ':only-of-type',
    ':optional',
    ':out-of-range',
    ':placeholder-shown',
    ':read-only',
    ':read-write',
    ':required',
    ':root',
    ':scope'
}

# Pseudo-classes written without parentheses that are accepted but can never match here
PSEUDO_SIMPLE_NO_MATCH = {
    ':active',
    ':current',
    ':focus',
    ':focus-visible',
    ':focus-within',
    ':future',
    ':host',
    ':hover',
    ':local-link',
    ':past',
    ':paused',
    ':playing',
    ':target',
    ':target-within',
    ':user-invalid',
    ':visited'
}

# Functional pseudo-classes whose argument is a selector list
PSEUDO_COMPLEX = {
    ':contains',
    ':has',
    ':is',
    ':matches',
    ':not',
    ':where'
}

# Functional pseudo-classes that are accepted but can never match here
PSEUDO_COMPLEX_NO_MATCH = {
    ':current',
    ':host',
    ':host-context'
}

# Functional pseudo-classes with their own argument grammar (tokenized by dedicated patterns)
PSEUDO_SPECIAL = {
    ':dir',
    ':lang',
    ':nth-child',
    ':nth-last-child',
    ':nth-last-of-type',
    ':nth-of-type'
}

# Every pseudo-class name recognized in any form
PSEUDO_SUPPORTED = PSEUDO_SIMPLE.union(
    PSEUDO_SIMPLE_NO_MATCH,
    PSEUDO_COMPLEX,
    PSEUDO_COMPLEX_NO_MATCH,
    PSEUDO_SPECIAL
)
86
# Sub-patterns parts
# These are plain string fragments that get spliced into the larger patterns below with
# `str.format`, which is why literal regex braces are doubled (`{{1,6}}`).
# Whitespace
# A newline is `\r\n` taken as one unit, or a single `\n`, `\f` or `\r`.
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
# One whitespace character: space, tab, or a newline as defined above.
WS = r'(?:[ \t]|{})'.format(NEWLINE)
# Comments
# A complete `/* ... */` block.
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS)
# CSS escapes
# A backslash followed by 1-6 hex digits (plus one optional whitespace), by any single
# character that is not a newline, or by the end of the input. Only lowercase `a-f` is
# listed, so uppercase hex digits are matched only when the final pattern is compiled
# with `re.I`.
CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS)
# Same as `CSS_ESCAPES`, but additionally accepts a backslash followed by a newline.
# NOTE(review): not referenced anywhere else in the visible part of this file.
CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE)
# CSS Identifier
# First line: the start of the identifier (optional `-` then at least one start character
# or escape, or a literal `--`). Second line: any number of continuation characters or
# escapes. The fragment spans several lines, so it must be compiled with `re.X`.
IDENTIFIER = r'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*)
'''.format(esc=CSS_ESCAPES)
# `nth` content
# Optional sign, then `<digits>`, `<digits>n` or `n`. The `(?<=n)` look-behind only allows
# the trailing `+ b` / `- b` offset when the first part ended in `n`.
NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC)
# Value: quoted string or identifier
# Double quoted string, single quoted string (both allow backslash escapes, including an
# escaped newline), or one or more identifiers.
VALUE = r'''
(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+)
'''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison. `!=` is handled special as it is non-standard.
# Matches the tail of an attribute selector after the attribute name: an optional
# `<cmp> <value> [i|s]` section (named groups `cmp`, `value`, `case`) followed by `]`.
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)
113
# Selector patterns
# Each pattern below is the source for one tokenizer entry in `CSSParser.css_tokens`.
# They embed `IDENTIFIER`/`VALUE`, which contain newlines, so they are meant to be compiled
# in verbose mode (`SelectorPattern` compiles with `re.I | re.X | re.U`).
# IDs (`#id`)
# `#` is escaped because it would otherwise start a comment in verbose mode.
PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`)
# `tag_ns` captures the optional namespace prefix including the trailing `|`;
# `tag_name` captures the element name or `*`.
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
# `attr_ns` captures the optional namespace prefix including the trailing `|`;
# `attr_name` captures the attribute name; the remainder is described by `ATTR`.
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
# `open` is set only when an opening parenthesis follows the name.
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
# Unlike `PAT_PSEUDO_CLASS`, the opening parenthesis is mandatory here.
PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER)
# Custom pseudo class (`:--custom-pseudo`)
# The look-ahead requires the identifier to begin with `--`.
PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER)
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC)
# Pseudo element (`::pseudo-element`)
# Built by prefixing one more `:` to the pseudo-class pattern.
PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS)
# At rule (`@page`, etc.) (not supported)
# NOTE(review): the literal `P` after `@` means only at-rules whose name starts with `p`
# (any case when compiled with `re.I`) are tokenized as "at_rule"; something like `@media`
# falls through to the generic "Invalid character" error instead. This looks unintended,
# but changing it alters which exception callers receive -- confirm before fixing.
PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER)
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
# After the `an+b`/`even`/`odd` part, the pattern either consumes the closing `)` or
# captures the ` of ` keyword in group `of`, leaving the selector list that follows to
# be parsed separately.
PAT_PSEUDO_NTH_CHILD = r'''
(?P<pseudo_nth_child>{name}
(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*))
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH)
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
# No `of S` form here: the closing `)` is always consumed.
PAT_PSEUDO_NTH_TYPE = r'''
(?P<pseudo_nth_type>{name}
(?P<nth_type>{nth}|even|odd)){ws}*\)
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`)
# `values` captures the whole comma separated list; it is split later with `RE_VALUES`.
PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`)
# `relation` is an explicit combinator, or a single whitespace that is not itself
# followed by an explicit combinator (the descendant combinator).
PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
# Extra: Contains (`:contains(text)`)
# Textually identical to `PAT_PSEUDO_LANG`; which one applies is decided by the
# pseudo-class name in `SpecialPseudoPattern`.
PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
159
# Regular expressions
# CSS escape pattern
# Group 1: hex escape (with optional trailing whitespace/comment), group 2: a backslash
# followed by any single non-newline character, group 3: a backslash at the end of input.
RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WSC), re.I)
# Variant used for quoted strings: same first three groups (hex escapes may only be
# followed by plain whitespace here), plus group 4: a backslash followed by a newline.
RE_CSS_STR_ESC = re.compile(
    r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
)
# Pattern to break up `nth` specifiers
# Same grammar as `NTH`, with named groups: `s1` (sign of `a`), `a` (`<digits>`,
# `<digits>n` or `n`), `s2` (sign of `b`) and `b` (digits of the offset).
RE_NTH = re.compile(
    r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
    re.I
)
# Pattern to iterate multiple values.
# Each match is either one `value` or one `split` (a comma with surrounding whitespace).
RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
# Whitespace checks
# A single whitespace character (no comments).
RE_WS = re.compile(WS)
# Leading / trailing run of whitespace and comments; both can match the empty string.
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
RE_WS_END = re.compile('{}*$'.format(WSC))
# Whole-string check for a custom pseudo-class name (`:--name`). Compiled without `re.I`,
# so callers are expected to lowercase the name first (as `process_custom` does).
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
178
# Constants
# Token that separates the entries of a selector list
COMMA_COMBINATOR = ','
# Token that stands for the descendant relation
WS_COMBINATOR = " "

# Parse flags (one bit each, combined with `|` and tested with `&`)
FLG_PSEUDO = 1 << 0
FLG_NOT = 1 << 1
FLG_RELATIVE = 1 << 2
FLG_DEFAULT = 1 << 3
FLG_HTML = 1 << 4
FLG_INDETERMINATE = 1 << 5
FLG_OPEN = 1 << 6
FLG_IN_RANGE = 1 << 7
FLG_OUT_OF_RANGE = 1 << 8
FLG_PLACEHOLDER_SHOWN = 1 << 9

# Upper bound on the number of compiled patterns kept by the compile cache
_MAXCACHE = 500
199
200
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags):
    """
    Compile a selector pattern, memoizing the result.

    The custom selectors are normalized with `process_custom`, the pattern is parsed
    with `CSSParser`, and everything is wrapped in a `cm.SoupSieve` object. Results are
    kept in an LRU cache keyed on the arguments, so all of them must be hashable.
    """

    normalized_custom = process_custom(custom)
    parser = CSSParser(pattern, custom=normalized_custom, flags=flags)
    selector_list = parser.process_selectors()
    return cm.SoupSieve(pattern, selector_list, namespaces, custom, flags)
213
214
def _purge_cache():
    """Drop every compiled pattern held by the `_cached_css_compile` LRU cache."""

    _cached_css_compile.cache_clear()
219
220
def process_custom(custom):
    """
    Validate and normalize user supplied custom selectors.

    `custom` is either `None` or a mapping of custom pseudo-class names (`:--name`) to
    their selector definitions. The returned dictionary is keyed by the lowercased,
    unescaped name, which is the same normalized form that
    `CSSParser.parse_pseudo_class_custom` uses when it looks a name up.

    Raises `SelectorSyntaxError` if a name is not a valid custom pseudo-class name, and
    `KeyError` if two names normalize to the same key.
    """

    custom_selectors = {}
    if custom is None:
        return custom_selectors

    for key, value in custom.items():
        # `RE_CUSTOM` is case sensitive, so validate the lowercased spelling.
        name = util.lower(key)
        if RE_CUSTOM.match(name) is None:
            raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name))
        # Duplicates must be detected on the normalized key. Checking the escaped name
        # against a dictionary keyed by unescaped names would make detection depend on
        # iteration order (e.g. `:--\66oo` vs `:--foo`). Lowercase again after
        # unescaping, because an escape can yield an uppercase character.
        alias = util.lower(css_unescape(name))
        if alias in custom_selectors:
            raise KeyError("The custom selector '{}' has already been registered".format(name))
        custom_selectors[alias] = value
    return custom_selectors
234
235
def css_unescape(content, string=False):
    """
    Unescape CSS value.

    `content` is the raw text of an identifier or, when `string` is true, the inside of
    a quoted string. Strings additionally allow the value to span multiple lines by
    escaping a new line; such an escaped new line is removed from the result.

    Hex escapes that encode NULL, a surrogate, or a code point above U+10FFFF are
    replaced with U+FFFD, as the CSS Syntax specification requires, instead of raising
    `ValueError` or producing a lone surrogate.
    """

    def replace(m):
        """Replace with the appropriate substitute."""

        if m.group(1):
            # Hex escape; `int` tolerates the optional trailing whitespace.
            codepoint = int(m.group(1)[1:], 16)
            if codepoint == 0 or codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
                codepoint = UNICODE_REPLACEMENT_CHAR
            value = chr(codepoint)
        elif m.group(2):
            # Escaped literal character: drop the backslash.
            value = m.group(2)[1:]
        elif m.group(3):
            # Backslash at the very end of the input.
            value = '\ufffd'
        else:
            # Escaped new line inside a string: line continuation.
            value = ''

        return value

    return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
261
262
def escape(ident):
    """
    Escape a string so it can be used as a CSS identifier.

    NULL becomes U+FFFD, control characters and a leading digit (optionally after a
    leading `-`) become hex escapes, ASCII alphanumerics, `-`, `_` and anything at or
    above U+0080 are kept, and every other character is prefixed with a backslash.
    """

    # An identifier consisting of nothing but `-` must have that dash escaped.
    if len(ident) == 1 and ident[:1] == '-':
        return '\\{}'.format(ident)

    leading_dash = ident[:1] == '-'
    pieces = []
    for pos, char in enumerate(ident):
        cp = ord(char)
        is_digit = 0x30 <= cp <= 0x39
        if cp == 0x00:
            piece = '\ufffd'
        elif cp <= 0x1F or cp == 0x7F:
            piece = '\\{:x} '.format(cp)
        elif is_digit and (pos == 0 or (pos == 1 and leading_dash)):
            # A digit may not start an identifier, nor directly follow a leading dash.
            piece = '\\{:x} '.format(cp)
        elif (
            is_digit or cp >= 0x80 or cp in (0x2D, 0x5F) or
            0x41 <= cp <= 0x5A or 0x61 <= cp <= 0x7A
        ):
            piece = char
        else:
            piece = '\\{}'.format(char)
        pieces.append(piece)
    return ''.join(pieces)
289
290
class SelectorPattern(object):
    """A named token: a label paired with the compiled regular expression that finds it."""

    def __init__(self, name, pattern):
        """Store the token name and compile `pattern` (case-insensitive, verbose, Unicode)."""

        regex_flags = re.IGNORECASE | re.VERBOSE | re.UNICODE
        self.name = name
        self.re_pattern = re.compile(pattern, regex_flags)

    def get_name(self):
        """Return the token name."""

        return self.name

    def match(self, selector, index, flags):
        """
        Try to match the token in `selector` starting exactly at `index`.

        `flags` is accepted for interface compatibility and is not used here.
        Returns the match object, or `None`.
        """

        return self.re_pattern.match(selector, index)
309
310
class SpecialPseudoPattern(SelectorPattern):
    """
    Dispatching token for functional pseudo-classes with their own grammar.

    The pseudo-class name is read first, and only the pattern registered for that
    name is then tried at the same position.
    """

    def __init__(self, patterns):
        """
        Build the name -> pattern table.

        Each entry of `patterns` is `(token name, pseudo-class names, regex, factory)`;
        `factory(token name, regex)` creates the pattern object shared by all of the
        listed pseudo-class names.
        """

        self.patterns = {}
        for entry in patterns:
            token_name, pseudo_names, regex, factory = entry[:4]
            compiled = factory(token_name, regex)
            self.patterns.update((pseudo_name, compiled) for pseudo_name in pseudo_names)

        # Pattern object behind the most recent successful match
        self.matched_name = None
        self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)

    def get_name(self):
        """Return the token name of the pattern that matched last."""

        return self.matched_name.get_name()

    def match(self, selector, index, flags):
        """Match the pattern registered for the pseudo-class found at `index`, if any."""

        head = self.re_pseudo_name.match(selector, index)
        if not head:
            return None

        key = util.lower(css_unescape(head.group('name')))
        handler = self.patterns.get(key)
        if not handler:
            return None

        result = handler.match(selector, index, flags)
        if result:
            self.matched_name = handler
        return result
346
347
348 class _Selector(object):
349 """
350 Intermediate selector class.
351
352 This stores selector data for a compound selector as we are acquiring them.
353 Once we are done collecting the data for a compound selector, we freeze
354 the data in an object that can be pickled and hashed.
355 """
356
357 def __init__(self, **kwargs):
358 """Initialize."""
359
360 self.tag = kwargs.get('tag', None)
361 self.ids = kwargs.get('ids', [])
362 self.classes = kwargs.get('classes', [])
363 self.attributes = kwargs.get('attributes', [])
364 self.nth = kwargs.get('nth', [])
365 self.selectors = kwargs.get('selectors', [])
366 self.relations = kwargs.get('relations', [])
367 self.rel_type = kwargs.get('rel_type', None)
368 self.contains = kwargs.get('contains', [])
369 self.lang = kwargs.get('lang', [])
370 self.flags = kwargs.get('flags', 0)
371 self.no_match = kwargs.get('no_match', False)
372
373 def _freeze_relations(self, relations):
374 """Freeze relation."""
375
376 if relations:
377 sel = relations[0]
378 sel.relations.extend(relations[1:])
379 return ct.SelectorList([sel.freeze()])
380 else:
381 return ct.SelectorList()
382
383 def freeze(self):
384 """Freeze self."""
385
386 if self.no_match:
387 return ct.SelectorNull()
388 else:
389 return ct.Selector(
390 self.tag,
391 tuple(self.ids),
392 tuple(self.classes),
393 tuple(self.attributes),
394 tuple(self.nth),
395 tuple(self.selectors),
396 self._freeze_relations(self.relations),
397 self.rel_type,
398 tuple(self.contains),
399 tuple(self.lang),
400 self.flags
401 )
402
403 def __str__(self): # pragma: no cover
404 """String representation."""
405
406 return (
407 '_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, '
408 'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})'
409 ).format(
410 self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors,
411 self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match
412 )
413
414 __repr__ = __str__
415
416
417 class CSSParser(object):
418 """Parse CSS selectors."""
419
420 css_tokens = (
421 SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
422 SpecialPseudoPattern(
423 (
424 ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS, SelectorPattern),
425 ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
426 ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
427 ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
428 ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
429 )
430 ),
431 SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
432 SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
433 SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
434 SelectorPattern("at_rule", PAT_AT_RULE),
435 SelectorPattern("id", PAT_ID),
436 SelectorPattern("class", PAT_CLASS),
437 SelectorPattern("tag", PAT_TAG),
438 SelectorPattern("attribute", PAT_ATTR),
439 SelectorPattern("combine", PAT_COMBINE)
440 )
441
442 def __init__(self, selector, custom=None, flags=0):
443 """Initialize."""
444
445 self.pattern = selector.replace('\x00', '\ufffd')
446 self.flags = flags
447 self.debug = self.flags & util.DEBUG
448 self.custom = {} if custom is None else custom
449
450 def parse_attribute_selector(self, sel, m, has_selector):
451 """Create attribute selector from the returned regex match."""
452
453 inverse = False
454 op = m.group('cmp')
455 case = util.lower(m.group('case')) if m.group('case') else None
456 ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
457 attr = css_unescape(m.group('attr_name'))
458 is_type = False
459 pattern2 = None
460
461 if case:
462 flags = re.I if case == 'i' else 0
463 elif util.lower(attr) == 'type':
464 flags = re.I
465 is_type = True
466 else:
467 flags = 0
468
469 if op:
470 if m.group('value').startswith(('"', "'")):
471 value = css_unescape(m.group('value')[1:-1], True)
472 else:
473 value = css_unescape(m.group('value'))
474 else:
475 value = None
476 if not op:
477 # Attribute name
478 pattern = None
479 elif op.startswith('^'):
480 # Value start with
481 pattern = re.compile(r'^%s.*' % re.escape(value), flags)
482 elif op.startswith('$'):
483 # Value ends with
484 pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
485 elif op.startswith('*'):
486 # Value contains
487 pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
488 elif op.startswith('~'):
489 # Value contains word within space separated list
490 # `~=` should match nothing if it is empty or contains whitespace,
491 # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
492 value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
493 pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
494 elif op.startswith('|'):
495 # Value starts with word in dash separated list
496 pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
497 else:
498 # Value matches
499 pattern = re.compile(r'^%s$' % re.escape(value), flags)
500 if op.startswith('!'):
501 # Equivalent to `:not([attr=value])`
502 inverse = True
503 if is_type and pattern:
504 pattern2 = re.compile(pattern.pattern)
505
506 # Append the attribute selector
507 sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
508 if inverse:
509 # If we are using `!=`, we need to nest the pattern under a `:not()`.
510 sub_sel = _Selector()
511 sub_sel.attributes.append(sel_attr)
512 not_list = ct.SelectorList([sub_sel.freeze()], True, False)
513 sel.selectors.append(not_list)
514 else:
515 sel.attributes.append(sel_attr)
516
517 has_selector = True
518 return has_selector
519
520 def parse_tag_pattern(self, sel, m, has_selector):
521 """Parse tag pattern from regex match."""
522
523 prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
524 tag = css_unescape(m.group('tag_name'))
525 sel.tag = ct.SelectorTag(tag, prefix)
526 has_selector = True
527 return has_selector
528
529 def parse_pseudo_class_custom(self, sel, m, has_selector):
530 """
531 Parse custom pseudo class alias.
532
533 Compile custom selectors as we need them. When compiling a custom selector,
534 set it to `None` in the dictionary so we can avoid an infinite loop.
535 """
536
537 pseudo = util.lower(css_unescape(m.group('name')))
538 selector = self.custom.get(pseudo)
539 if selector is None:
540 raise SelectorSyntaxError(
541 "Undefined custom selector '{}' found at postion {}".format(pseudo, m.end(0)),
542 self.pattern,
543 m.end(0)
544 )
545
546 if not isinstance(selector, ct.SelectorList):
547 self.custom[pseudo] = None
548 selector = CSSParser(
549 selector, custom=self.custom, flags=self.flags
550 ).process_selectors(flags=FLG_PSEUDO)
551 self.custom[pseudo] = selector
552
553 sel.selectors.append(selector)
554 has_selector = True
555 return has_selector
556
557 def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html):
558 """Parse pseudo class."""
559
560 complex_pseudo = False
561 pseudo = util.lower(css_unescape(m.group('name')))
562 if m.group('open'):
563 complex_pseudo = True
564 if complex_pseudo and pseudo in PSEUDO_COMPLEX:
565 has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
566 elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
567 if pseudo == ':root':
568 sel.flags |= ct.SEL_ROOT
569 elif pseudo == ':defined':
570 sel.flags |= ct.SEL_DEFINED
571 is_html = True
572 elif pseudo == ':scope':
573 sel.flags |= ct.SEL_SCOPE
574 elif pseudo == ':empty':
575 sel.flags |= ct.SEL_EMPTY
576 elif pseudo in (':link', ':any-link'):
577 sel.selectors.append(CSS_LINK)
578 elif pseudo == ':checked':
579 sel.selectors.append(CSS_CHECKED)
580 elif pseudo == ':default':
581 sel.selectors.append(CSS_DEFAULT)
582 elif pseudo == ':indeterminate':
583 sel.selectors.append(CSS_INDETERMINATE)
584 elif pseudo == ":disabled":
585 sel.selectors.append(CSS_DISABLED)
586 elif pseudo == ":enabled":
587 sel.selectors.append(CSS_ENABLED)
588 elif pseudo == ":required":
589 sel.selectors.append(CSS_REQUIRED)
590 elif pseudo == ":optional":
591 sel.selectors.append(CSS_OPTIONAL)
592 elif pseudo == ":read-only":
593 sel.selectors.append(CSS_READ_ONLY)
594 elif pseudo == ":read-write":
595 sel.selectors.append(CSS_READ_WRITE)
596 elif pseudo == ":in-range":
597 sel.selectors.append(CSS_IN_RANGE)
598 elif pseudo == ":out-of-range":
599 sel.selectors.append(CSS_OUT_OF_RANGE)
600 elif pseudo == ":placeholder-shown":
601 sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
602 elif pseudo == ':first-child':
603 sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
604 elif pseudo == ':last-child':
605 sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
606 elif pseudo == ':first-of-type':
607 sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
608 elif pseudo == ':last-of-type':
609 sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
610 elif pseudo == ':only-child':
611 sel.nth.extend(
612 [
613 ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
614 ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
615 ]
616 )
617 elif pseudo == ':only-of-type':
618 sel.nth.extend(
619 [
620 ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
621 ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
622 ]
623 )
624 has_selector = True
625 elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
626 self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
627 sel.no_match = True
628 has_selector = True
629 elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
630 sel.no_match = True
631 has_selector = True
632 elif pseudo in PSEUDO_SUPPORTED:
633 raise SelectorSyntaxError(
634 "Invalid syntax for pseudo class '{}'".format(pseudo),
635 self.pattern,
636 m.start(0)
637 )
638 else:
639 raise NotImplementedError(
640 "'{}' pseudo-class is not implemented at this time".format(pseudo)
641 )
642
643 return has_selector, is_html
644
645 def parse_pseudo_nth(self, sel, m, has_selector, iselector):
646 """Parse `nth` pseudo."""
647
648 mdict = m.groupdict()
649 if mdict.get('pseudo_nth_child'):
650 postfix = '_child'
651 else:
652 postfix = '_type'
653 mdict['name'] = util.lower(css_unescape(mdict['name']))
654 content = util.lower(mdict.get('nth' + postfix))
655 if content == 'even':
656 # 2n
657 s1 = 2
658 s2 = 0
659 var = True
660 elif content == 'odd':
661 # 2n+1
662 s1 = 2
663 s2 = 1
664 var = True
665 else:
666 nth_parts = RE_NTH.match(content)
667 s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
668 a = nth_parts.group('a')
669 var = a.endswith('n')
670 if a.startswith('n'):
671 s1 += '1'
672 elif var:
673 s1 += a[:-1]
674 else:
675 s1 += a
676 s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
677 if nth_parts.group('b'):
678 s2 += nth_parts.group('b')
679 else:
680 s2 = '0'
681 s1 = int(s1, 10)
682 s2 = int(s2, 10)
683
684 pseudo_sel = mdict['name']
685 if postfix == '_child':
686 if m.group('of'):
687 # Parse the rest of `of S`.
688 nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
689 else:
690 # Use default `*|*` for `of S`.
691 nth_sel = CSS_NTH_OF_S_DEFAULT
692 if pseudo_sel == ':nth-child':
693 sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
694 elif pseudo_sel == ':nth-last-child':
695 sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
696 else:
697 if pseudo_sel == ':nth-of-type':
698 sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
699 elif pseudo_sel == ':nth-last-of-type':
700 sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
701 has_selector = True
702 return has_selector
703
704 def parse_pseudo_open(self, sel, name, has_selector, iselector, index):
705 """Parse pseudo with opening bracket."""
706
707 flags = FLG_PSEUDO | FLG_OPEN
708 if name == ':not':
709 flags |= FLG_NOT
710 if name == ':has':
711 flags |= FLG_RELATIVE
712
713 sel.selectors.append(self.parse_selectors(iselector, index, flags))
714 has_selector = True
715 return has_selector
716
717 def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index):
718 """Parse combinator tokens."""
719
720 combinator = m.group('relation').strip()
721 if not combinator:
722 combinator = WS_COMBINATOR
723 if combinator == COMMA_COMBINATOR:
724 if not has_selector:
725 # If we've not captured any selector parts, the comma is either at the beginning of the pattern
726 # or following another comma, both of which are unexpected. Commas must split selectors.
727 raise SelectorSyntaxError(
728 "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
729 self.pattern,
730 index
731 )
732 sel.rel_type = rel_type
733 selectors[-1].relations.append(sel)
734 rel_type = ":" + WS_COMBINATOR
735 selectors.append(_Selector())
736 else:
737 if has_selector:
738 # End the current selector and associate the leading combinator with this selector.
739 sel.rel_type = rel_type
740 selectors[-1].relations.append(sel)
741 elif rel_type[1:] != WS_COMBINATOR:
742 # It's impossible to have two whitespace combinators after each other as the patterns
743 # will gobble up trailing whitespace. It is also impossible to have a whitespace
744 # combinator after any other kind for the same reason. But we could have
745 # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
746 # then we've hit the multiple combinator case, so we should fail.
747 raise SelectorSyntaxError(
748 'The multiple combinators at position {}'.format(index),
749 self.pattern,
750 index
751 )
752 # Set the leading combinator for the next selector.
753 rel_type = ':' + combinator
754 sel = _Selector()
755
756 has_selector = False
757 return has_selector, sel, rel_type
758
759 def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index):
760 """Parse combinator tokens."""
761
762 combinator = m.group('relation').strip()
763 if not combinator:
764 combinator = WS_COMBINATOR
765 if not has_selector:
766 raise SelectorSyntaxError(
767 "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
768 self.pattern,
769 index
770 )
771
772 if combinator == COMMA_COMBINATOR:
773 if not sel.tag and not is_pseudo:
774 # Implied `*`
775 sel.tag = ct.SelectorTag('*', None)
776 sel.relations.extend(relations)
777 selectors.append(sel)
778 del relations[:]
779 else:
780 sel.relations.extend(relations)
781 sel.rel_type = combinator
782 del relations[:]
783 relations.append(sel)
784 sel = _Selector()
785
786 has_selector = False
787 return has_selector, sel
788
789 def parse_class_id(self, sel, m, has_selector):
790 """Parse HTML classes and ids."""
791
792 selector = m.group(0)
793 if selector.startswith('.'):
794 sel.classes.append(css_unescape(selector[1:]))
795 else:
796 sel.ids.append(css_unescape(selector[1:]))
797 has_selector = True
798 return has_selector
799
800 def parse_pseudo_contains(self, sel, m, has_selector):
801 """Parse contains."""
802
803 values = m.group('values')
804 patterns = []
805 for token in RE_VALUES.finditer(values):
806 if token.group('split'):
807 continue
808 value = token.group('value')
809 if value.startswith(("'", '"')):
810 value = css_unescape(value[1:-1], True)
811 else:
812 value = css_unescape(value)
813 patterns.append(value)
814 sel.contains.append(ct.SelectorContains(tuple(patterns)))
815 has_selector = True
816 return has_selector
817
818 def parse_pseudo_lang(self, sel, m, has_selector):
819 """Parse pseudo language."""
820
821 values = m.group('values')
822 patterns = []
823 for token in RE_VALUES.finditer(values):
824 if token.group('split'):
825 continue
826 value = token.group('value')
827 if value.startswith(('"', "'")):
828 value = css_unescape(value[1:-1], True)
829 else:
830 value = css_unescape(value)
831
832 patterns.append(value)
833
834 sel.lang.append(ct.SelectorLang(patterns))
835 has_selector = True
836
837 return has_selector
838
839 def parse_pseudo_dir(self, sel, m, has_selector):
840 """Parse pseudo direction."""
841
842 value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
843 sel.flags |= value
844 has_selector = True
845 return has_selector
846
847 def parse_selectors(self, iselector, index=0, flags=0):
848 """Parse selectors."""
849
850 sel = _Selector()
851 selectors = []
852 has_selector = False
853 closed = False
854 relations = []
855 rel_type = ":" + WS_COMBINATOR
856 is_open = bool(flags & FLG_OPEN)
857 is_pseudo = bool(flags & FLG_PSEUDO)
858 is_relative = bool(flags & FLG_RELATIVE)
859 is_not = bool(flags & FLG_NOT)
860 is_html = bool(flags & FLG_HTML)
861 is_default = bool(flags & FLG_DEFAULT)
862 is_indeterminate = bool(flags & FLG_INDETERMINATE)
863 is_in_range = bool(flags & FLG_IN_RANGE)
864 is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
865 is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
866
867 if self.debug: # pragma: no cover
868 if is_pseudo:
869 print(' is_pseudo: True')
870 if is_open:
871 print(' is_open: True')
872 if is_relative:
873 print(' is_relative: True')
874 if is_not:
875 print(' is_not: True')
876 if is_html:
877 print(' is_html: True')
878 if is_default:
879 print(' is_default: True')
880 if is_indeterminate:
881 print(' is_indeterminate: True')
882 if is_in_range:
883 print(' is_in_range: True')
884 if is_out_of_range:
885 print(' is_out_of_range: True')
886 if is_placeholder_shown:
887 print(' is_placeholder_shown: True')
888
889 if is_relative:
890 selectors.append(_Selector())
891
892 try:
893 while True:
894 key, m = next(iselector)
895
896 # Handle parts
897 if key == "at_rule":
898 raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
899 elif key == 'pseudo_class_custom':
900 has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
901 elif key == 'pseudo_class':
902 has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
903 elif key == 'pseudo_element':
904 raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
905 elif key == 'pseudo_contains':
906 has_selector = self.parse_pseudo_contains(sel, m, has_selector)
907 elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
908 has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
909 elif key == 'pseudo_lang':
910 has_selector = self.parse_pseudo_lang(sel, m, has_selector)
911 elif key == 'pseudo_dir':
912 has_selector = self.parse_pseudo_dir(sel, m, has_selector)
913 # Currently only supports HTML
914 is_html = True
915 elif key == 'pseudo_close':
916 if not has_selector:
917 raise SelectorSyntaxError(
918 "Expected a selector at postion {}".format(m.start(0)),
919 self.pattern,
920 m.start(0)
921 )
922 if is_open:
923 closed = True
924 break
925 else:
926 raise SelectorSyntaxError(
927 "Unmatched pseudo-class close at postion {}".format(m.start(0)),
928 self.pattern,
929 m.start(0)
930 )
931 elif key == 'combine':
932 if is_relative:
933 has_selector, sel, rel_type = self.parse_has_combinator(
934 sel, m, has_selector, selectors, rel_type, index
935 )
936 else:
937 has_selector, sel = self.parse_combinator(
938 sel, m, has_selector, selectors, relations, is_pseudo, index
939 )
940 elif key == 'attribute':
941 has_selector = self.parse_attribute_selector(sel, m, has_selector)
942 elif key == 'tag':
943 if has_selector:
944 raise SelectorSyntaxError(
945 "Tag name found at position {} instead of at the start".format(m.start(0)),
946 self.pattern,
947 m.start(0)
948 )
949 has_selector = self.parse_tag_pattern(sel, m, has_selector)
950 elif key in ('class', 'id'):
951 has_selector = self.parse_class_id(sel, m, has_selector)
952
953 index = m.end(0)
954 except StopIteration:
955 pass
956
957 if is_open and not closed:
958 raise SelectorSyntaxError(
959 "Unclosed pseudo-class at position {}".format(index),
960 self.pattern,
961 index
962 )
963
964 if has_selector:
965 if not sel.tag and not is_pseudo:
966 # Implied `*`
967 sel.tag = ct.SelectorTag('*', None)
968 if is_relative:
969 sel.rel_type = rel_type
970 selectors[-1].relations.append(sel)
971 else:
972 sel.relations.extend(relations)
973 del relations[:]
974 selectors.append(sel)
975 else:
976 # We will always need to finish a selector when `:has()` is used as it leads with combining.
977 raise SelectorSyntaxError(
978 'Expected a selector at position {}'.format(index),
979 self.pattern,
980 index
981 )
982
983 # Some patterns require additional logic, such as default. We try to make these the
984 # last pattern, and append the appropriate flag to that selector which communicates
985 # to the matcher what additional logic is required.
986 if is_default:
987 selectors[-1].flags = ct.SEL_DEFAULT
988 if is_indeterminate:
989 selectors[-1].flags = ct.SEL_INDETERMINATE
990 if is_in_range:
991 selectors[-1].flags = ct.SEL_IN_RANGE
992 if is_out_of_range:
993 selectors[-1].flags = ct.SEL_OUT_OF_RANGE
994 if is_placeholder_shown:
995 selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
996
997 return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
998
def selector_iter(self, pattern):
    """
    Tokenize `pattern`, yielding a `(token name, match)` pair for each token.

    Raises `SelectorSyntaxError` at the first position where none of the
    patterns in `self.css_tokens` match.
    """

    # Leading and trailing whitespace/comments are excluded from the scanned range.
    leading = RE_WS_BEGIN.search(pattern)
    trailing = RE_WS_END.search(pattern)
    index = leading.end(0) if leading else 0
    end = (trailing.start(0) if trailing else len(pattern)) - 1

    # Characters that open a known selector type get an error naming that type.
    malformed = {
        '[': "Malformed attribute selector at position {}",
        '.': "Malformed class selector at position {}",
        '#': "Malformed id selector at position {}",
        ':': "Malformed pseudo-class selector at position {}"
    }

    if self.debug:  # pragma: no cover
        print('## PARSING: {!r}'.format(pattern))

    while index <= end:
        m = None
        for token in self.css_tokens:
            m = token.match(pattern, index, self.flags)
            if not m:
                continue
            name = token.get_name()
            if self.debug:  # pragma: no cover
                print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0)))
            # Advance past the token before handing it to the consumer.
            index = m.end(0)
            yield name, m
            break

        if m is not None:
            continue

        # No token pattern matched at `index`: report the offending character.
        c = pattern[index]
        template = malformed.get(c)
        if template is None:
            msg = "Invalid character {!r} position {}".format(c, index)
        else:
            msg = template.format(index)
        raise SelectorSyntaxError(msg, self.pattern, index)

    if self.debug:  # pragma: no cover
        print('## END PARSING')
1039
def process_selectors(self, index=0, flags=0):
    """Tokenize the stored pattern and parse the token stream into a selector list."""

    # The token stream is a lazy generator; `parse_selectors` drives it.
    tokens = self.selector_iter(self.pattern)
    return self.parse_selectors(tokens, index, flags)
1044
1045
1046 # Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern)
# A few patterns are order dependent as they use previously compiled patterns.
1048
# CSS pattern for `:link` and `:any-link`.
# Selects `a`, `area`, and `link` elements (under the `html` namespace prefix)
# that carry an `href` attribute.
CSS_LINK = CSSParser(
    'html|*:is(a, area, link)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`.
# Selects checkbox and radio `input` elements that carry a `checked` attribute,
# and `option` elements that carry a `selected` attribute.
CSS_CHECKED = CSSParser(
    '''
    html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first, as the pattern uses `:checked`).
# Selects anything matching `:checked`, plus `button`/`input` elements with
# `type="submit"` that descend from a `form`.
# The submit selector is deliberately last: `parse_selectors` sets `ct.SEL_DEFAULT`
# on the final selector of the list so the matcher can apply additional logic to it
# (presumably triggered by `FLG_DEFAULT` -- confirm in `parse_selectors`).
CSS_DEFAULT = CSSParser(
    '''
    :checked,

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|form html|*:is(button, input)[type="submit"]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
# CSS pattern for `:indeterminate`.
# Selects, in order:
#   - checkbox inputs carrying an `indeterminate` attribute,
#   - unchecked radio inputs with no `name` attribute or an empty one,
#   - `progress` elements without a `value` attribute,
#   - unchecked radio inputs with a non-empty `name`.
# The named-radio selector is deliberately last: `parse_selectors` sets
# `ct.SEL_INDETERMINATE` on the final selector of the list so the matcher can apply
# additional logic to it (presumably triggered by `FLG_INDETERMINATE` -- confirm).
CSS_INDETERMINATE = CSSParser(
    '''
    html|input[type="checkbox"][indeterminate],
    html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
    html|progress:not([value]),

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|input[type="radio"][name][name!='']:not([checked])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`.
# Selects, under the `html` namespace prefix:
#   1. non-hidden inputs, buttons, selects, textareas, fieldsets, optgroups, and options
#      that carry a `disabled` attribute themselves,
#   2. `option` children of a disabled `optgroup`,
#   3. controls that are direct children of a disabled `fieldset`,
#   4. controls nested deeper inside a disabled `fieldset`, except those nested under a
#      child `legend` that is the first of its type.
# Each element name appears only once in the `:is()` list; repeating a name would only
# add a redundant selector to compile and evaluate.
CSS_DISABLED = CSSParser(
    '''
    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option)[disabled],
    html|optgroup[disabled] > html|option,
    html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
    html|fieldset[disabled] >
        html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`.
# Selects non-hidden inputs, buttons, selects, textareas, fieldsets, optgroups, and options
# that do not match `:disabled`.
# NOTE(review): the pattern uses `:disabled`, so CSS_DISABLED presumably has to be
# compiled before this statement runs -- keep the definition order.
# Each element name appears only once in the `:is()` list; repeating a name would only
# add a redundant selector to compile and evaluate.
CSS_ENABLED = CSSParser(
    '''
    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option):not(:disabled)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`.
# Selects `input`, `textarea`, and `select` elements that carry a `required` attribute.
CSS_REQUIRED = CSSParser(
    'html|*:is(input, textarea, select)[required]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`.
# Selects `input`, `textarea`, and `select` elements that do not carry a `required` attribute.
CSS_OPTIONAL = CSSParser(
    'html|*:is(input, textarea, select):not([required])'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`.
# Selects:
#   - `input` elements with no `type`, an empty `type`, or one of the listed text-like
#     types, that have a non-empty `placeholder` and either no `value` or an empty one,
#   - `textarea` elements with a non-empty `placeholder`.
# `parse_selectors` sets `ct.SEL_PLACEHOLDER_SHOWN` on the final selector of the list so
# the matcher can apply additional logic to it (presumably triggered by
# `FLG_PLACEHOLDER_SHOWN` -- confirm).
CSS_PLACEHOLDER_SHOWN = CSSParser(
    '''
    html|input:is(
        :not([type]),
        [type=""],
        [type=text],
        [type=search],
        [type=url],
        [type=tel],
        [type=email],
        [type=password],
        [type=number]
    )[placeholder][placeholder!='']:is(:not([value]), [value=""]),
    html|textarea[placeholder][placeholder!='']
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature.
# `*|*` selects any element name under any namespace prefix. Unlike the other
# precompiled patterns here, it is compiled without `FLG_HTML`.
CSS_NTH_OF_S_DEFAULT = CSSParser(
    '*|*'
).process_selectors(flags=FLG_PSEUDO)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first, as the pattern uses `:disabled`).
# Selects:
#   - `textarea` elements, and `input` elements with no `type`, an empty `type`, or one of
#     the listed editable types, provided they neither carry a `readonly` attribute nor
#     match `:disabled`,
#   - any element whose `contenteditable` attribute is empty or equals "true"
#     (the `i` flag makes that comparison case-insensitive).
CSS_READ_WRITE = CSSParser(
    '''
    html|*:is(
        textarea,
        input:is(
            :not([type]),
            [type=""],
            [type=text],
            [type=search],
            [type=url],
            [type=tel],
            [type=email],
            [type=number],
            [type=password],
            [type=date],
            [type=datetime-local],
            [type=month],
            [type=time],
            [type=week]
        )
    ):not([readonly], :disabled),
    html|*:is([contenteditable=""], [contenteditable="true" i])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:read-only`.
# Selects every element that does not match `:read-write`.
# NOTE(review): the pattern uses `:read-write`, so CSS_READ_WRITE presumably has to be
# compiled before this statement runs -- keep the definition order.
CSS_READ_ONLY = CSSParser(
    '''
    html|*:not(:read-write)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`.
# Selects `input` elements of the listed date/time/numeric types that carry a `min` or
# `max` attribute. The selector text only narrows the candidates; it does not compare
# values. `parse_selectors` sets `ct.SEL_IN_RANGE` on the final selector of the list so
# the matcher can apply additional logic (presumably triggered by `FLG_IN_RANGE` -- confirm).
CSS_IN_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
# CSS pattern for `:out-of-range`.
# Selects `input` elements of the listed date/time/numeric types that carry a `min` or
# `max` attribute. The selector text only narrows the candidates; it does not compare
# values. `parse_selectors` sets `ct.SEL_OUT_OF_RANGE` on the final selector of the list
# so the matcher can apply additional logic (presumably triggered by `FLG_OUT_OF_RANGE`
# -- confirm).
CSS_OUT_OF_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)