comparison env/lib/python3.7/site-packages/soupsieve/css_match.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 """CSS matcher."""
2 from datetime import datetime
3 from . import util
4 import re
5 from .import css_types as ct
6 import unicodedata
7
# Empty tag pattern (whitespace okay)
# Matches a single character that is not a space, tab, CR, LF or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of characters that are not space, tab, CR, LF or form feed
# (used to split a whitespace separated class string into tokens).
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# Same combinators as above, prefixed with `:`.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs compared against element and attribute namespaces.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the selector flags in `css_types`.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to a direction flag (`auto` maps to 0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual values of number, time, month, week, date and
# datetime-local inputs. Years need at least four digits; every other
# date/time field needs exactly two.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Used on language ranges before they are split on `-`: every match is replaced with a single `-`.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar limits used when validating day values.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
54
55
class _FakeParent(object):
    """
    Stand-in parent for a parentless element.

    A fragment that has no `BeautifulSoup` document object above it has no
    parent to enumerate, which makes `nth` selectors impossible to evaluate.
    Wrapping the root element in this temporary container lets it be
    traversed as an only child.
    """

    def __init__(self, element):
        """Store `element` as the single entry of `contents`."""

        children = [element]
        self.contents = children

    def __len__(self):
        """Return the number of wrapped children."""

        children = self.contents
        return len(children)
74
75
class _DocumentNav(object):
    """
    Navigate a Beautiful Soup document.

    Gathers the low-level tree helpers used while matching: node type checks,
    parent/child/sibling/descendant traversal and attribute access.

    `is_iframe` and `is_root` use `self.is_html_tag`, `self.root` and
    `self.is_html`, none of which are defined here; they have to be supplied
    by the class this one is combined with.
    """

    @classmethod
    def assert_valid_input(cls, tag):
        """
        Check if valid input tag or document.

        Raises `TypeError` when `tag` is not a Beautiful Soup `Tag`.
        """

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""

        import bs4
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""

        import bs4
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""

        import bs4
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""

        import bs4
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""

        import bs4
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string (comment, declaration, CDATA, processing instruction or doctype)."""

        import bs4
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is content string (a navigable string that is not a special string)."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """
        Check if element is an `iframe`.

        The name is compared as-is for XML trees and lowercased otherwise;
        the element must also be an HTML tag.
        """

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """
        Yield the direct contents of `el` in document order.

        Nothing is yielded when `no_iframe` is set and `el` is an `iframe`.
        """
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Yield children of `el`.

        `start` is an index into `el.contents` (defaults to the first child, or
        the last one when `reverse` is set); an out of range `start` yields
        nothing. Only tags are yielded unless `tags` is false. Nothing is
        yielded when `no_iframe` is set and `el` is an `iframe`.
        """

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Yield descendants of `el`.

        With `no_iframe` set, an `iframe` descendant is itself yielded but
        its content is skipped; an `iframe` passed as `el` yields nothing.
        Only tags are yielded unless `tags` is false.
        """

        if not no_iframe or not self.is_iframe(el):
            # When set, everything is skipped until this node is reached again.
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        # No sibling: resume at whatever follows the deepest last node of the `iframe`.
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent (`None` when there is none, or when `no_iframe` is set and the parent is an `iframe`)."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling (the next sibling tag unless `tags` is false)."""

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling (the previous sibling tag unless `tags` is false)."""

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.

        Always returns a real `bool`, including for a falsy `el` or an
        element without a namespace.
        """

        ns = getattr(el, 'namespace', None) if el else None
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el, attr_name):
        """
        Return namespace and attribute name without the prefix.

        Both values are read from attributes of `attr_name` and are `None`
        when it does not carry them (e.g. a plain string).
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @staticmethod
    def get_attribute_by_name(el, name, default=None):
        """
        Get attribute by name.

        XML trees use an exact key lookup; otherwise attribute keys are
        lowercased before being compared with `name`. Returns `default` when
        the attribute is missing.
        """

        value = default
        if el._is_xml:
            try:
                value = el.attrs[name]
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = v
                    break
        return value

    @staticmethod
    def iter_attributes(el):
        """Iterate attributes as `(name, value)` pairs."""

        for k, v in el.attrs.items():
            yield k, v

    @classmethod
    def get_classes(cls, el):
        """Get classes (a string value is split on whitespace into a list)."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get text: the concatenated content strings found among the descendants of `el`."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )
330
331
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day: `day` must exist in `month` of `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            is_leap = ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0)
            max_days = FEB_LEAP_MONTH if is_leap else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week: `week` must be an existing ISO week number of `year`.

        A year has 53 ISO weeks when it ends on a Thursday, or when the
        previous year ends on a Wednesday; otherwise it has 52. The weekday
        is computed arithmetically, so any positive year works, including
        years below 1000 or above 9999.
        """

        def dec31_weekday(value):
            """Weekday of December 31 of year `value` (0 is Sunday, 4 is Thursday)."""

            return (value + value // 4 - value // 100 + value // 400) % 7

        has_long_year = dec31_weekday(year) == 4 or dec31_weekday(year - 1) == 3
        max_week = 53 if has_long_year else 52
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` selects the expected format. Returns a tuple of integers for
        `date`, `month`, `week`, `time` and `datetime-local`, a float for
        `number` and `range`, and `None` when `itype` is not handled or
        `value` is malformed or out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
431
432
433 class _Match(object):
434 """Perform CSS matching."""
435
def __init__(self, selectors, scope, namespaces, flags):
    """
    Set up the matcher for `scope`.

    Validates `scope`, stores the selectors, namespaces and flags, creates
    the empty caches, then walks up from `scope` to find the top of the tree
    and derives the root element and the XML/HTML flags from it.
    """

    self.assert_valid_input(scope)
    self.tag = scope
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []
    self.selectors = selectors
    self.namespaces = namespaces if namespaces is not None else {}
    self.flags = flags
    self.iframe_restrict = False

    # Climb to the top-most node reachable from the scope.
    top = scope
    ancestor = self.get_parent(top)
    while ancestor:
        top = ancestor
        ancestor = self.get_parent(top)

    # A document object is not an element: its first child tag (if any) is the root.
    if self.is_doc(top):
        root = next(iter(self.get_children(top)), None)
    else:
        root = top

    self.root = root
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace
470
def supports_namespaces(self):
    """Tell whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    xml = self.is_xml
    return xml if xml else self.has_html_namespace
475
def get_tag_ns(self, el):
    """
    Return the namespace to use for `el`.

    Without namespace support every element is treated as XHTML. With it,
    the element's own namespace URI is returned, or an empty string when
    it has none.
    """

    if not self.supports_namespaces():
        return NS_XHTML

    uri = self.get_uri(el)
    return uri if uri else ''
487
def is_html_tag(self, el):
    """Tell whether the namespace resolved for `el` is the XHTML namespace."""

    resolved = self.get_tag_ns(el)
    return resolved == NS_XHTML
492
def get_tag(self, el):
    """Return the tag name of `el`, lowercased unless the tree is XML or the name is `None`."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
498
def get_prefix(self, el):
    """Return the prefix of `el`, lowercased unless the tree is XML or the prefix is `None`."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
504
def find_bidi(self, el):
    """
    Derive a direction from the text found under `el`.

    Children are visited in order. Text nodes are scanned for the first
    character whose bidirectional class is `L`, `R` or `AL`; child tags are
    searched recursively unless they are excluded. Returns a direction flag,
    or `None` when no such character is found.
    """

    excluded = ('bdi', 'script', 'style', 'textarea', 'iframe')
    strong = ('AL', 'R', 'L')

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Excluded tag names, non HTML tags and tags with a recognized `dir` value are not analyzed.
            explicit = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            analyzable = (
                self.get_tag(node) not in excluded and
                self.is_html_tag(node) and
                explicit is None
            )
            if analyzable:
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested
            # Either skipped, or no direction could be determined below this tag.
            continue

        # Skip `doctype` comments, etc.
        if self.is_special_string(node):
            continue

        # First strongly typed character of a text node decides.
        for char in node:
            category = unicodedata.bidirectional(char)
            if category in strong:
                return ct.SEL_DIR_LTR if category == 'L' else ct.SEL_DIR_RTL
    return None
540
def extended_language_filter(self, lang_range, lang_tag):
    """
    Check whether `lang_tag` is matched by the language range `lang_range`.

    Both values are lowercased and split on `-`. The first subtags must be
    equal unless the range starts with `*`. Each following range subtag must
    then be found, in order, among the remaining tag subtags; subtags of the
    tag may be skipped on the way unless they are a single character long.
    """

    ranges = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    subtags = lang_tag.lower().split('-')

    # Primary tag needs to match
    if ranges[0] != '*' and ranges[0] != subtags[0]:
        return False

    range_count = len(ranges)
    subtag_count = len(subtags)
    rindex = 1
    sindex = 1

    # Match until we run out of ranges
    while rindex < range_count:
        # Ran out of subtags, but we still have ranges
        if sindex >= subtag_count:
            return False

        wanted = ranges[rindex]
        current = subtags[sindex]

        # Empty range
        if not wanted:
            return False

        if current == wanted:
            # Matched range
            rindex += 1
        elif len(current) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched or implicitly skipped, so grab next subtag
        sindex += 1

    return True
591
def match_attribute_name(self, el, attr, prefix):
    """
    Find the attribute `attr` (optionally qualified by `prefix`) on `el`.

    Returns the value of the first matching attribute, or `None` when
    nothing matches or `prefix` is neither `*` nor a registered namespace.
    Names are compared exactly for XML and case-insensitively otherwise.
    """

    if not self.supports_namespaces():
        for key, val in self.iter_attributes(el):
            if util.lower(attr) == util.lower(key):
                return val
        return None

    def same_name(candidate):
        """Compare `attr` with `candidate` using the tree's case rules."""

        if self.is_xml:
            return attr == candidate
        return util.lower(attr) == util.lower(candidate)

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    if prefix:
        ns = self.namespaces.get(prefix)
        if ns is None and prefix != '*':
            return None
    else:
        ns = None

    for key, val in self.iter_attributes(el):

        # Get attribute parts
        attr_ns, local_name = self.split_namespace(el, key)

        # Can't match a prefix attribute as we haven't specified one to match
        # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
        if ns is None:
            if same_name(key):
                return val
            continue

        # We can't match our desired prefix attribute as the attribute doesn't have a prefix
        if attr_ns is None:
            continue

        # The attribute lives in a different namespace than the one requested.
        if ns != attr_ns and prefix != '*':
            continue

        if same_name(local_name):
            return val

    return None
639
def match_namespace(self, el, tag):
    """
    Check the namespace of `el` against the prefix carried by `tag`.

    The outcome depends on `tag.prefix`:
    - `None`: a registered default namespace (key `''`), if any, must equal the element's.
    - `''` (the `|tag` form): the element must not have a namespace.
    - `*`: anything matches.
    - anything else: the prefix must be registered and map to the element's namespace.
    """

    element_ns = self.get_tag_ns(el)
    prefix = tag.prefix

    # We must match the default namespace if one is not provided
    if prefix is None:
        default_ns = self.namespaces.get('')
        return default_ns is None or element_ns == default_ns

    # If we specified `|tag`, we must not have a namespace.
    if prefix == '':
        return not element_ns

    if prefix == '*':
        return True

    # Verify prefix matches
    declared = self.namespaces.get(prefix, None)
    return declared is not None and element_ns == declared
660
def match_attributes(self, el, attributes):
    """
    Check every attribute selector in `attributes` against `el`.

    An attribute must exist and, when the selector carries a pattern, its
    value must match it (list values are joined with single spaces first).
    The XML specific pattern is preferred for XML trees when it is set.
    """

    for selector in attributes or ():
        value = self.match_attribute_name(el, selector.attribute, selector.prefix)
        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern

        if isinstance(value, list):
            value = ' '.join(value)

        # Missing attribute
        if value is None:
            return False

        # Present, but the value is not the expected one
        if pattern is not None and pattern.match(value) is None:
            return False

    return True
680
def match_tagname(self, el, tag):
    """
    Check the name of `el` against `tag.name`.

    A `None` name or `*` matches any element. The requested name is
    lowercased first unless the tree is XML.
    """

    wanted = tag.name
    if wanted is None:
        return True
    if not self.is_xml:
        wanted = util.lower(wanted)
    return wanted in (self.get_tag(el), '*')
689
def match_tag(self, el, tag):
    """
    Check both the namespace and the name of `el` against `tag`.

    A `tag` of `None` always matches. Both checks are always evaluated.
    """

    if tag is None:
        return True

    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok and name_ok)
701
def match_past_relations(self, el, relation):
    """
    Match a backward looking relationship (ancestor, parent, earlier sibling).

    The combinator is read from `relation[0].rel_type`. Ancestors and
    earlier siblings are tried one after the other until one matches; the
    "close" variants only try the direct parent or the closest previous
    sibling tag.
    """

    kind = relation[0].rel_type
    found = False

    if kind == REL_PARENT:
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while not found and ancestor:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)

    elif kind == REL_CLOSE_PARENT:
        direct_parent = self.get_parent(el, no_iframe=self.iframe_restrict)
        if direct_parent:
            found = self.match_selectors(direct_parent, relation)

    elif kind == REL_SIBLING:
        earlier = self.get_previous(el)
        while not found and earlier:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous(earlier)

    elif kind == REL_CLOSE_SIBLING:
        earlier = self.get_previous(el)
        if earlier and self.is_tag(earlier):
            found = self.match_selectors(earlier, relation)

    return found
725
def match_future_child(self, parent, relation, recursive=False):
    """
    Look for a node under `parent` that satisfies `relation`.

    Direct children are searched by default, all descendants when
    `recursive` is set. The search stops at the first match.
    """

    walker = self.get_descendants if recursive else self.get_children
    outcome = False
    for node in walker(parent, no_iframe=self.iframe_restrict):
        outcome = self.match_selectors(node, relation)
        if outcome:
            return outcome
    return outcome
736
def match_future_relations(self, el, relation):
    """
    Match a forward looking (`:has()`) relationship.

    The combinator is read from `relation[0].rel_type`: descendants, direct
    children, any later sibling, or only the closest next sibling tag.
    """

    kind = relation[0].rel_type

    if kind == REL_HAS_PARENT:
        return self.match_future_child(el, relation, True)
    if kind == REL_HAS_CLOSE_PARENT:
        return self.match_future_child(el, relation)

    found = False
    if kind == REL_HAS_SIBLING:
        later = self.get_next(el)
        while not found and later:
            found = self.match_selectors(later, relation)
            later = self.get_next(later)
    elif kind == REL_HAS_CLOSE_SIBLING:
        later = self.get_next(el)
        if later and self.is_tag(later):
            found = self.match_selectors(later, relation)
    return found
755
def match_relations(self, el, relation):
    """
    Dispatch a relationship check.

    Relation types starting with `:` are forward looking and handled by
    `match_future_relations`; all others go to `match_past_relations`.
    """

    if relation[0].rel_type.startswith(':'):
        return self.match_future_relations(el, relation)
    return self.match_past_relations(el, relation)
767
def match_id(self, el, ids):
    """Check that every entry of `ids` equals the `id` attribute of `el` (missing attribute counts as `''`)."""

    return all(wanted == self.get_attribute_by_name(el, 'id', '') for wanted in ids)
777
def match_classes(self, el, classes):
    """Check that every entry of `classes` is among the classes of `el`."""

    present = self.get_classes(el)
    return all(wanted in present for wanted in classes)
788
def match_root(self, el):
    """
    Match `el` as root.

    Besides being reported by `is_root`, the element must not have any
    sibling, before or after it, that is a tag, a CDATA node or a content
    string containing something other than whitespace.
    """

    root_flag = self.is_root(el)
    if not root_flag:
        return root_flag

    # Same scan in both directions: first the earlier siblings, then the later ones.
    for step in (self.get_previous, self.get_next):
        sibling = step(el, tags=False)
        while sibling is not None:
            if (
                self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                self.is_cdata(sibling)
            ):
                return False
            sibling = step(sibling, tags=False)

    return root_flag
814
def match_scope(self, el):
    """Tell whether `el` is the very object stored as the scope."""

    return el is self.scope
819
def match_nth_tag_type(self, el, child):
    """Tell whether `child` has the same tag name and the same namespace as `el` (used by `of-type`)."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
827
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    Every entry of `nth` must match for the result to be true; an empty
    `nth` matches. Each entry provides `a`, `b` and `n` (when `n` is truthy
    the wanted position is `a * count + b`, otherwise it is `a`), `last`
    (count from the end of the parent), `of_type` (only count siblings with
    the same tag type) and `selectors` (only count siblings matching them).
    """

    matched = True

    for n in nth:
        matched = False
        # With `of S`, the element itself has to match `S` to be counted at all.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # Parentless element: wrap it so it can be walked as an only child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Start walking from the far end when counting from the last child.
        index = last_index if last else 0
        # Number of counted (qualifying) children seen so far.
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    # The wanted position is reached: it only counts if it is our element.
                    if child is el:
                        matched = True
                    else:
                        break
                if child is el:
                    break
            # Our element has been reached (matched or not): nothing left to look at.
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # A constant index cannot progress any further.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
928
def match_empty(self, el):
    """
    Check if element is empty (if requested).

    An element is empty when it has no child tag and none of its content
    strings contains a character matched by `RE_NOT_EMPTY`.
    """

    for node in self.get_children(el, tags=False):
        if self.is_tag(node):
            return False
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):
            return False
    return True
941
def match_subselectors(self, el, selectors):
    """
    Check that `el` satisfies every entry of `selectors`.

    Every entry is evaluated, even after one has already failed.
    """

    verdicts = [self.match_selectors(el, sel) for sel in selectors]
    return all(verdicts)
950
def match_contains(self, el, contains):
    """
    Match element if it contains text.

    Each entry of `contains` offers alternatives in its `text` attribute;
    at least one alternative of every entry has to occur in the text of
    `el`. The text is fetched once, and only if there is something to check.
    """

    content = None
    outcome = True
    for group in contains:
        if content is None:
            content = self.get_text(el, no_iframe=self.is_html)
        if not any(text in content for text in group.text):
            outcome = False
    return outcome
967
def match_default(self, el):
    """
    Match `el` as the default submit button of its form.

    The closest HTML `form` ancestor is located first (never crossing an
    `iframe`). The default button is the first `input` or `button`
    descendant of that form whose `type` is `submit`; it is cached per form
    once found. Returns `False` when `el` has no enclosing form.
    """

    # Find this input's form
    form = None
    parent = self.get_parent(el, no_iframe=True)
    while parent and form is None:
        if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
            form = parent
        else:
            parent = self.get_parent(parent, no_iframe=True)

    # Without an enclosing form there is no default button. Bail out here,
    # as `get_descendants` cannot be handed `None`.
    if form is None:
        return False

    # Look in form cache to see if we've already located its default button
    for cached_form, cached_button in self.cached_default_forms:
        if cached_form is form:
            return cached_button is el

    # We didn't have the form cached, so look for its default button
    match = False
    for child in self.get_descendants(form, no_iframe=True):
        name = self.get_tag(child)
        # Can't do nested forms (haven't figured out why we never hit this)
        if name == 'form':  # pragma: no cover
            break
        if name in ('input', 'button'):
            v = self.get_attribute_by_name(child, 'type', '')
            if v and util.lower(v) == 'submit':
                self.cached_default_forms.append([form, child])
                if el is child:
                    match = True
                break
    return match
1006
def match_indeterminate(self, el):
    """
    Match `el` as an indeterminate radio button.

    The match succeeds when no other `input` below the same form has
    `type=radio`, the same `name` as `el` and a `checked` attribute. The
    "form" is the closest HTML `form` ancestor, or the top-most ancestor
    when there is none. Results are cached per form and name.
    """

    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """Find this input's form (or the top-most ancestor when there is no form)."""

        ancestor = self.get_parent(el, no_iframe=True)
        while True:
            if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
                return ancestor
            previous = ancestor
            ancestor = self.get_parent(ancestor, no_iframe=True)
            if ancestor is None:
                return previous

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    for cached_form, cached_name, cached_result in self.cached_indeterminate_forms:
        if cached_form is form and cached_name == name:
            return cached_result is True

    # We didn't have the form cached, so validate that the radio button is indeterminate
    checked = False
    for child in self.get_descendants(form, no_iframe=True):
        if child is el:
            continue
        if self.get_tag(child) != 'input':
            continue

        is_radio = False
        has_checked = False
        has_name = False
        for key, val in self.iter_attributes(child):
            lowered = util.lower(key)
            if lowered == 'type' and util.lower(val) == 'radio':
                is_radio = True
            elif lowered == 'name' and val == name:
                has_name = True
            elif lowered == 'checked':
                has_checked = True
            if is_radio and has_checked and has_name and get_parent_form(child) is form:
                checked = True
                break
        if checked:
            break

    match = not checked
    self.cached_indeterminate_forms.append([form, name, match])
    return match
1067
def match_lang(self, el, langs):
    """
    Match languages.

    The language of `el` is taken from the closest `lang` (or `xml:lang`)
    attribute found on the element or one of its ancestors. When none is
    found and the document is treated as HTML, the `content` of a
    `<meta http-equiv="content-language">` in the head is used instead;
    that lookup is cached per root. `langs` is a sequence of pattern
    groups: every group needs at least one pattern accepted by
    `extended_language_filter`. Returns `False` when no language is found.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # Reached the top: the last node visited acts as root for the meta lookup below.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached `False` records that a previous meta search found nothing for this root.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                # NOTE(review): the namespace check is made on `parent` (the head), not on `child` - confirm intent.
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            if not found_lang:
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            if not match:
                break

    return match
1153
def match_dir(self, el, directionality):
    """
    Check directionality.

    Resolve the text direction that applies to `el` and compare it with the
    requested `directionality` flag (`ct.SEL_DIR_LTR` or `ct.SEL_DIR_RTL`).

    The direction is resolved in this order:

    - an explicit `dir` of `ltr` or `rtl` on the element,
    - the root element without a `dir` is left to right,
    - `input[type=tel]` without a `dir` is left to right,
    - `dir="auto"` on a text-like `input` or a `textarea` uses the first strongly
      directional character of its value/text (left to right if there is none),
    - `bdi` without a `dir`, or any other `dir="auto"` element, uses `find_bidi`,
    - otherwise the parent's direction is used.

    Returns `False` when both directions are requested at once, or when `el`
    is `None` or not an HTML tag.
    """

    # Asking for both left to right and right to left can never be satisfied.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # Left to right is the fallback answer in several places below.
    ltr_wanted = ct.SEL_DIR_LTR == directionality

    # `None`: no recognized `dir` value, `0`: `dir="auto"`, anything else: an explicit direction flag.
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # The document element (the root) with no direction assigned is assumed left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ltr_wanted

    tag_name = self.get_tag(el)
    is_input = tag_name == 'input'
    is_textarea = tag_name == 'textarea'
    is_bdi = tag_name == 'bdi'
    input_type = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''

    # `input[type=tel]` with no direction assigned is assumed left to right.
    if is_input and input_type == 'tel' and direction is None:
        return ltr_wanted

    is_text_control = (is_input and input_type in ('text', 'search', 'tel', 'url', 'email')) or is_textarea
    if is_text_control and direction == 0:
        # Auto handling for text inputs: inspect the text the control holds.
        if is_textarea:
            text = ''.join(
                [node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)]
            )
        else:
            text = self.get_attribute_by_name(el, 'value', '')

        if text:
            # The first strongly directional character decides.
            for char in text:
                bidi = unicodedata.bidirectional(char)
                if bidi in ('AL', 'R', 'L'):
                    return (ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL) == directionality
            # No strong character found, assume left to right.
            return ltr_wanted
        if is_root:
            return ltr_wanted
    elif (is_bdi and direction is None) or direction == 0:
        # Auto handling for `bdi` and other non text inputs.
        detected = self.find_bidi(el)
        if detected is not None:
            return detected == directionality
        if is_root:
            return ltr_wanted

    # Nothing decided on this element: match the parent's direction.
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1216
def match_range(self, el, condition):
    """
    Match range.

    Behavior is modeled after what we see in browsers. Browsers seem to evaluate
    if the value is out of range, and if not, it is in range. So a missing value
    will not evaluate out of range; therefore, value is in range. Personally, I
    feel like this should evaluate as neither in or out of range.

    `condition` carries the range flag being tested: when `ct.SEL_IN_RANGE` is set
    the result is "not out of range", otherwise the result is "out of range".
    When neither `min` nor `max` yields a usable value, nothing matches.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))

    # Parse the limits with the rules of the input type; a limit stays `None` when absent or unparsable.
    raw_min = self.get_attribute_by_name(el, 'min', None)
    mn = None if raw_min is None else Inputs.parse_value(itype, raw_min)
    raw_max = self.get_attribute_by_name(el, 'max', None)
    mx = None if raw_max is None else Inputs.parse_value(itype, raw_max)

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    raw_value = self.get_attribute_by_name(el, 'value', None)
    value = None if raw_value is None else Inputs.parse_value(itype, raw_value)

    out_of_range = False
    if value is not None:
        if itype == "time" and mn is not None and mx is not None and mn > mx:
            # Time is periodic, so this is a reversed/discontinuous range:
            # only the gap between max and min is out of range.
            out_of_range = bool(value < mn and value > mx)
        elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            # Ordinary range: below the minimum or above the maximum.
            out_of_range = bool(
                (mn is not None and value < mn) or
                (mx is not None and value > mx)
            )

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
1262
def match_defined(self, el):
    """
    Match defined.

    `:defined` is related to custom elements in a browser. A tag is reported as
    defined when its name shows it cannot be a custom element:

    - Tags that are not custom (don't have a hyphen in the name) are defined.
    - Tags whose name contains a `:` are defined.
    - Tags that have a prefix (with or without a namespace) are defined.
    - A tag for which no name is available is never defined.

    This method performs no XML/HTML document check of its own.

    This of course requires the parser to provide us with the proper prefix and namespace info,
    if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    # Guard against a missing tag name instead of failing on `None.find`.
    if name is None:
        return False

    return (
        '-' not in name or
        ':' in name or
        self.get_prefix(el) is not None
    )
1283
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to HTML spec.

    - The element matches when its text is empty.
    - For text areas, a single newline does not count as content, so it matches as well.
    """

    return self.get_text(el) in ('', '\n')
1298
def match_selectors(self, el, selectors):
    """
    Check if element matches one of the selectors.

    The selectors are tried in order and the first one whose checks all pass
    decides the result: `True` normally, `False` when `selectors.is_not` is set.
    If no selector passes, the result is `selectors.is_not` (or `False` when no
    selector was tried at all).

    Selector lists flagged with `selectors.is_html` are only evaluated for HTML
    documents. While they are evaluated, `self.namespaces` and
    `self.iframe_restrict` are temporarily overridden; the previous values are
    restored even if one of the checks raises.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                # Until this selector passes every check, the answer is the "no match" answer.
                match = is_not
                # We have a un-matchable situation (like `:focus` as you cannot focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if not self.match_contains(el, selector.contains):
                    continue
                # Every check passed: this selector decides the result.
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists,
        # also when a check above raised, so the matcher is never left in the overridden state.
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
1382
def select(self, limit=0):
    """
    Match all tags under the targeted tag.

    Yields every matching descendant of `self.tag` in the order given by
    `get_descendants`. A `limit` below 1 means "no limit"; otherwise iteration
    stops once `limit` matches have been yielded.
    """

    remaining = None if limit < 1 else limit

    for candidate in self.get_descendants(self.tag):
        if not self.match(candidate):
            continue
        yield candidate
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
1396
def closest(self):
    """
    Match closest ancestor.

    Starts at `self.tag` itself and walks up through `get_parent` until a node
    matches. Returns that node, or `None` when the chain ends without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
1408
def filter(self):  # noqa A001
    """
    Filter tag's children.

    Returns the list of nodes from `get_contents(self.tag)` that are not
    navigable strings and that match.
    """

    kept = []
    for node in self.get_contents(self.tag):
        # Strings are never handed to `match`.
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
1413
def match(self, el):
    """
    Match.

    The document object itself never matches; anything else must be a tag
    and must satisfy `self.selectors`.
    """

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)
1418
1419
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Adds nothing of its own: it only combines the `_DocumentNav` and `_Match` bases.
    """
1422
1423
class SoupSieve(ct.Immutable):
    """Compiled Soup Sieve selector matching object."""

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Initialize with the original pattern and its compiled selector data."""

        # Keyword order is kept as-is; everything is handed to the `ct.Immutable` base.
        super(SoupSieve, self).__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, scope):
        """Create a `CSSMatch` for this compiled selector, targeted at `scope`."""

        return CSSMatch(self.selectors, scope, self.namespaces, self.flags)

    def match(self, tag):
        """Match: check whether `tag` itself satisfies the selector."""

        return self._matcher(tag).match(tag)

    def closest(self, tag):
        """Match closest ancestor (the search starts at `tag` itself)."""

        return self._matcher(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            # Strings are skipped without being matched.
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag):
        """Select a single tag: the first match, or `None` when nothing matches."""

        found = self.select(tag, limit=1)
        if not found:
            return None
        return found[0]

    def select(self, tag, limit=0):
        """Select the specified tags and return them as a list."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate the specified tags."""

        for found in self._matcher(tag).select(limit):
            yield found

    def __repr__(self):  # pragma: no cover
        """Representation."""

        template = "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})"
        return template.format(self.pattern, self.namespaces, self.custom, self.flags)

    __str__ = __repr__


# Hand the class to the `pickle_register` helper of `css_types`.
ct.pickle_register(SoupSieve)