Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/soupsieve/css_match.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 """CSS matcher.""" | |
| 2 from datetime import datetime | |
| 3 from . import util | |
| 4 import re | |
| 5 from .import css_types as ct | |
| 6 import unicodedata | |
| 7 from collections.abc import Sequence | |
| 8 | |
| 9 import bs4 | |
| 10 | |
# Empty tag pattern (whitespace okay): matches a single character that is not
# a space, tab, carriage return, newline, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of one or more characters that are not a space, tab, carriage
# return, newline, or form feed.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# Each one is the corresponding relationship above prefixed with ':'.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Bitwise-OR combinations of the related `css_types` selector flags.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lower-cased `dir` keyword to a direction flag; 'auto' maps to 0.
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual forms of input values (number, time, month, week,
# date and local date-time). Years take four or more digits; every other
# date/time field takes exactly two.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Matches `-*-` (optionally followed by further `*-` groups) or a trailing `-*`
# inside a language range.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar constants
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
| 57 | |
| 58 | |
class _FakeParent(object):
    """
    Stand-in parent for an element that has none.

    A fragment without a `BeautifulSoup` document object gives its root
    element no parent, which prevents `nth` selectors from being evaluated.
    This wrapper exposes the element as the only entry of `contents` so the
    root can be walked like any other child.
    """

    def __init__(self, element):
        """Store `element` as the single child."""

        children = [element]
        self.contents = children

    def __len__(self):
        """Return how many children are held."""

        children = self.contents
        return len(children)
| 77 | |
| 78 | |
class _DocumentNav(object):
    """
    Navigate a Beautiful Soup document.

    The helpers wrap the node attributes used for traversal (`contents`,
    `descendants`, `parent`, `attrs`, ...). `is_iframe` and `is_root` also read
    `self.is_html_tag`, `self.root` and `self.is_html`, which this class does
    not define; they must be supplied by the class it is combined with
    (`_Match` in this module defines all three).
    """

    @classmethod
    def assert_valid_input(cls, tag):
        """
        Check if valid input tag or document.

        Raises `TypeError` when `tag` is not a `bs4.Tag`.
        """

        # Fail on unexpected types.
        if not cls.is_tag(tag):
            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Is `BeautifulSoup` object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj):
        """Is tag."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Is declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj):
        """Is CDATA."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Is processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj):
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString)

    @staticmethod
    def is_special_string(obj):
        """Is special string (comment, declaration, CDATA, processing instruction, or doctype)."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj):
        """Check if node is content string (a navigable string that is not a special string)."""

        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Create fake parent for a given element."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Check if element (or document) is from a XML tree."""

        return el._is_xml

    def is_iframe(self, el):
        """
        Check if element is an `iframe`.

        The name is compared case-sensitively for XML trees and lower-cased
        otherwise; the element must also be an HTML tag.
        """

        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        We check that the element is the root of the tree (which we have already pre-calculated),
        and we check if it is the root element under an `iframe`.
        """

        root = self.root and self.root is el
        if not root:
            parent = self.get_parent(el)
            root = parent is not None and self.is_html and self.is_iframe(parent)
        return root

    def get_contents(self, el, no_iframe=False):
        """
        Get contents.

        Yields every direct child node of `el` in document order. Nothing is
        yielded when `no_iframe` is set and `el` is an `iframe`.
        """
        if not no_iframe or not self.is_iframe(el):
            for content in el.contents:
                yield content

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Get children.

        Yields direct children of `el` starting at index `start` (the first
        child, or the last one when `reverse` is set, if `start` is `None`) and
        walking forward or backward. Only tags are yielded unless `tags` is
        false. Nothing is yielded when `start` is out of range, or when
        `no_iframe` is set and `el` is an `iframe`.
        """

        if not no_iframe or not self.is_iframe(el):
            last = len(el.contents) - 1
            if start is None:
                index = last if reverse else 0
            else:
                index = start
            end = -1 if reverse else last + 1
            incr = -1 if reverse else 1

            if 0 <= index <= last:
                while index != end:
                    node = el.contents[index]
                    index += incr
                    if not tags or self.is_tag(node):
                        yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Get descendants.

        Yields the descendants of `el`; only tags unless `tags` is false.
        With `no_iframe` set, an `iframe` descendant is itself yielded but
        everything nested inside it is skipped.
        """

        if not no_iframe or not self.is_iframe(el):
            # When set, every node is skipped until this one is reached.
            next_good = None
            for child in el.descendants:

                if next_good is not None:
                    if child is not next_good:
                        continue
                    next_good = None

                is_tag = self.is_tag(child)

                if no_iframe and is_tag and self.is_iframe(child):
                    if child.next_sibling is not None:
                        next_good = child.next_sibling
                    else:
                        # No sibling: resume after the deepest last descendant of the `iframe`.
                        last_child = child
                        while self.is_tag(last_child) and last_child.contents:
                            last_child = last_child.contents[-1]
                        next_good = last_child.next_element
                    yield child
                    if next_good is None:
                        break
                    # Coverage isn't seeing this even though it's executed
                    continue  # pragma: no cover

                if not tags or is_tag:
                    yield child

    def get_parent(self, el, no_iframe=False):
        """Get parent (`None` when `no_iframe` is set and the parent is an `iframe`)."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            parent = None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Get tag."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Get prefix."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Get namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Get next sibling tag (or next sibling node of any kind when `tags` is false)."""

        sibling = el.next_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Get previous sibling tag (or previous sibling node of any kind when `tags` is false)."""

        sibling = el.previous_sibling
        while tags and not cls.is_tag(sibling) and sibling is not None:
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if element has an HTML namespace.

        This is a bit different than whether a element is treated as having an HTML namespace,
        like we do in the case of `is_html_tag`.

        Always returns a real `bool`, including for a missing element or an
        empty namespace.
        """

        ns = getattr(el, 'namespace') if el else None
        return bool(ns) and ns == NS_XHTML

    @staticmethod
    def split_namespace(el, attr_name):
        """
        Return namespace and attribute name without the prefix.

        Both values are read from attributes of `attr_name` itself and default
        to `None` when absent; `el` is not used.
        """

        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

    @classmethod
    def normalize_value(cls, value):
        """
        Normalize the value to be a string or list of strings.

        `None` becomes `''`, bytes are decoded as UTF-8, and any other
        non-sequence is passed through `str()`. Items of a sequence are
        normalized individually; a nested sequence is kept unchanged.
        """

        # Treat `None` as empty string.
        if value is None:
            return ''

        # Pass through strings
        if (isinstance(value, str)):
            return value

        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
        if isinstance(value, Sequence):
            new_value = []
            for v in value:
                # `str` and `bytes` are sequences too, but they are plain values here and
                # must be normalized (bytes decoded) rather than treated as nested lists.
                if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                    # This is most certainly a user error and will crash and burn later,
                    # but to avoid excessive recursion, kick out now.
                    new_value.append(v)
                else:
                    # Convert the child to a string
                    new_value.append(cls.normalize_value(v))
            return new_value

        # Try and make anything else a string
        return str(value)

    @classmethod
    def get_attribute_by_name(cls, el, name, default=None):
        """
        Get attribute by name.

        XML elements are looked up by exact key. Otherwise each key is
        lower-cased before being compared with `name`. Returns `default` when
        the attribute is not found.
        """

        value = default
        if el._is_xml:
            try:
                value = cls.normalize_value(el.attrs[name])
            except KeyError:
                pass
        else:
            for k, v in el.attrs.items():
                if util.lower(k) == name:
                    value = cls.normalize_value(v)
                    break
        return value

    @classmethod
    def iter_attributes(cls, el):
        """Iterate attributes as `(key, normalized value)` pairs."""

        for k, v in el.attrs.items():
            yield k, cls.normalize_value(v)

    @classmethod
    def get_classes(cls, el):
        """Get classes as a list (a string value is split on whitespace)."""

        classes = cls.get_attribute_by_name(el, 'class', [])
        if isinstance(classes, str):
            classes = RE_NOT_WS.findall(classes)
        return classes

    def get_text(self, el, no_iframe=False):
        """Get text: the concatenated content strings of all descendants."""

        return ''.join(
            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
        )

    def get_own_text(self, el, no_iframe=False):
        """Get Own Text: a list of the content strings that are direct children."""

        return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
| 356 | |
| 357 | |
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day: `day` must exist in `month` of `year` (Gregorian leap-year rules)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week: `week` must be an ISO week number that exists in `year`.

        A year has 52 or 53 ISO weeks. December 28th always falls in the last
        ISO week of its year, so its week number is the number of weeks in the
        year. The Gregorian calendar repeats exactly every 400 years, so the
        year is folded into the range 1..400 first; this keeps years outside
        what `datetime` can represent from raising.
        """

        folded_year = (year - 1) % 400 + 1
        max_week = datetime(folded_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` selects the expected format: "date", "month", "week", "time",
        "datetime-local", "number" or "range". Returns a tuple of integers for
        the date/time types, a float for "number"/"range", and `None` when the
        type is unknown, the text does not match, or a field is out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
| 457 | |
| 458 | |
| 459 class _Match(object): | |
| 460 """Perform CSS matching.""" | |
| 461 | |
def __init__(self, selectors, scope, namespaces, flags):
    """
    Set up matching state for `scope`.

    Validates `scope`, stores the arguments (a `None` namespace map becomes an
    empty dict), creates the empty caches, and then locates the top of the
    tree, the root element and the effective scope. Finally records whether
    the tree is XML and/or HTML.
    """

    self.assert_valid_input(scope)

    self.tag = scope
    self.selectors = selectors
    self.flags = flags
    self.namespaces = namespaces if namespaces is not None else {}
    self.iframe_restrict = False
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []

    # Climb to the topmost node of the tree.
    top = scope
    ancestor = self.get_parent(top)
    while ancestor:
        top = ancestor
        ancestor = self.get_parent(top)

    # Under a document object the root is its first child tag (if any);
    # otherwise the topmost node is itself the root.
    if self.is_doc(top):
        root = next(iter(self.get_children(top)), None)
    else:
        root = top

    self.root = root
    # A scope that is the topmost node is replaced by the root element.
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace
| 496 | |
def supports_namespaces(self):
    """Tell whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    xml = self.is_xml
    return xml if xml else self.has_html_namespace
| 501 | |
def get_tag_ns(self, el):
    """
    Return the namespace of `el`.

    Without namespace support every element is reported in the XHTML
    namespace. Otherwise the element's own namespace URI is returned, or an
    empty string when it has none.
    """

    if not self.supports_namespaces():
        return NS_XHTML

    uri = self.get_uri(el)
    return uri if uri else ''
| 513 | |
def is_html_tag(self, el):
    """Tell whether the namespace reported for `el` is the XHTML namespace."""

    namespace = self.get_tag_ns(el)
    return namespace == NS_XHTML
| 518 | |
def get_tag(self, el):
    """Return the tag name of `el`, lower-cased unless the tree is XML or the name is `None`."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
| 524 | |
def get_prefix(self, el):
    """Return the prefix of `el`, lower-cased unless the tree is XML or the prefix is `None`."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
| 530 | |
def find_bidi(self, el):
    """
    Determine directionality from the text inside `el`.

    Child nodes are scanned in order. Child tags are searched recursively
    unless they are one of the excluded element names, are not HTML tags, or
    carry a recognized `dir` value. Special strings are ignored. The first
    character with bidirectional class `L`, `R` or `AL` decides the result;
    `None` is returned when no such character is found.
    """

    skipped_names = ('bdi', 'script', 'style', 'textarea', 'iframe')
    strong_classes = ('AL', 'R', 'L')

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Elements excluded by the specification are not analyzed.
            direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            excluded = (
                self.get_tag(node) in skipped_names or
                not self.is_html_tag(node) or
                direction is not None
            )
            if not excluded:
                # Look inside this child for a deciding character.
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested
            continue

        # `doctype`, comments, etc. carry no content text.
        if self.is_special_string(node):
            continue

        # Text node: the first strongly typed character wins.
        for char in node:
            bidi = unicodedata.bidirectional(char)
            if bidi in strong_classes:
                return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL

    return None
| 566 | |
def extended_language_filter(self, lang_range, lang_tag):
    """
    Decide whether `lang_tag` satisfies `lang_range`.

    Both are compared lower-cased and split on `-`. The first subtag must be
    equal to the first range item unless that item is `*`. Every remaining
    range item must then be found, in order, among the remaining subtags;
    non-matching subtags may be skipped unless they are a single character.
    An empty range item, or running out of subtags first, fails the match.
    """

    wanted = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    subtags = lang_tag.lower().split('-')

    # Primary tag needs to match
    if wanted[0] != '*' and wanted[0] != subtags[0]:
        return False

    total_wanted = len(wanted)
    total_subtags = len(subtags)
    w_pos = 1
    s_pos = 1

    # Match until we run out of ranges
    while w_pos < total_wanted:
        # Ran out of subtags, but we still have ranges
        if s_pos >= total_subtags:
            return False

        item = wanted[w_pos]
        subtag = subtags[s_pos]

        # Empty range
        if not item:
            return False

        if subtag == item:
            # Matched range
            w_pos += 1
        elif len(subtag) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Either matched or implicitly skipped, so move to the next subtag
        s_pos += 1

    return True
| 617 | |
def match_attribute_name(self, el, attr, prefix):
    """
    Find attribute `attr` on `el` and return its value, or `None` if absent.

    Without namespace support the name is compared case-insensitively. With
    it, a `prefix` is resolved through `self.namespaces`: an unknown prefix
    other than `*` never matches. When no namespace was resolved the whole
    attribute key is compared. Otherwise the attribute must carry a namespace
    that equals the resolved one (any namespace for `*`) and its local name
    must match. Names are compared exactly for XML, case-insensitively
    otherwise.
    """

    if not self.supports_namespaces():
        for key, val in self.iter_attributes(el):
            if util.lower(attr) == util.lower(key):
                return val
        return None

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    wanted_ns = None
    if prefix:
        wanted_ns = self.namespaces.get(prefix)
        if wanted_ns is None and prefix != '*':
            return None

    for key, val in self.iter_attributes(el):

        # Get attribute parts
        namespace, name = self.split_namespace(el, key)

        # Can't match a prefix attribute as we haven't specified one to match
        # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
        if wanted_ns is None:
            if self.is_xml:
                same = attr == key
            else:
                same = util.lower(attr) == util.lower(key)
            if same:
                return val
            continue

        # We can't match our desired prefix attribute as the attribute doesn't have a prefix
        if namespace is None or (wanted_ns != namespace and prefix != '*'):
            continue

        # The attribute doesn't match.
        if self.is_xml:
            differs = attr != name
        else:
            differs = util.lower(attr) != util.lower(name)
        if differs:
            continue

        return val

    return None
| 665 | |
def match_namespace(self, el, tag):
    """
    Check the namespace of `el` against the selector's `tag.prefix`.

    - No prefix (`None`): the element must be in the default namespace
      (the `''` entry of `self.namespaces`) if one is defined.
    - Empty prefix (`|tag`): the element must not have a namespace.
    - `*`: any namespace matches.
    - Any other prefix: it must be defined and equal the element's namespace.
    """

    namespace = self.get_tag_ns(el)
    prefix = tag.prefix

    # We must match the default namespace if one is not provided
    if prefix is None:
        default_namespace = self.namespaces.get('')
        return default_namespace is None or namespace == default_namespace

    # If we specified `|tag`, we must not have a namespace.
    if not prefix:
        return not (prefix == '' and namespace)

    if prefix == '*':
        return True

    # Verify prefix matches
    expected = self.namespaces.get(prefix, None)
    return expected is not None and namespace == expected
| 686 | |
def match_attributes(self, el, attributes):
    """
    Check that `el` satisfies every attribute selector in `attributes`.

    Each attribute must exist on the element. When the selector carries a
    pattern (the XML-specific one is preferred for XML trees when present),
    the value must match it; list values are joined with single spaces first.
    An empty or missing `attributes` always matches.
    """

    for selector in attributes or ():
        value = self.match_attribute_name(el, selector.attribute, selector.prefix)

        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern

        if isinstance(value, list):
            value = ' '.join(value)

        # Attribute is absent.
        if value is None:
            return False

        # Attribute is present but its value is rejected by the pattern.
        if pattern is not None and pattern.match(value) is None:
            return False

    return True
| 706 | |
def match_tagname(self, el, tag):
    """
    Check the tag name of `el` against `tag.name`.

    A missing name (`None`) or `*` matches anything. Outside of XML the
    selector name is lower-cased before the comparison.
    """

    name = tag.name
    if not self.is_xml and name is not None:
        name = util.lower(name)

    if name is None:
        return True
    return name in (self.get_tag(el), '*')
| 715 | |
def match_tag(self, el, tag):
    """
    Check `el` against a tag selector (namespace and name).

    A `tag` of `None` always matches. Both the namespace and the name checks
    are always performed.
    """

    if tag is None:
        return True

    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok and name_ok)
| 727 | |
def match_past_relations(self, el, relation):
    """
    Match a backward-looking relationship (ancestors or previous siblings).

    The kind is taken from `relation[0].rel_type`:
    descendant (any ancestor), child (direct parent), general sibling (any
    previous sibling tag) or adjacent sibling (the closest previous sibling
    tag). Parent lookups respect `self.iframe_restrict`.
    """

    kind = relation[0].rel_type
    found = False

    if kind == REL_PARENT:
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while ancestor and not found:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)

    elif kind == REL_CLOSE_PARENT:
        direct_parent = self.get_parent(el, no_iframe=self.iframe_restrict)
        if direct_parent:
            found = self.match_selectors(direct_parent, relation)

    elif kind == REL_SIBLING:
        earlier = self.get_previous(el)
        while earlier and not found:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous(earlier)

    elif kind == REL_CLOSE_SIBLING:
        earlier = self.get_previous(el)
        if earlier and self.is_tag(earlier):
            found = self.match_selectors(earlier, relation)

    return found
| 751 | |
def match_future_child(self, parent, relation, recursive=False):
    """
    Look for a matching node below `parent`.

    Direct children are examined by default, all descendants when `recursive`
    is set. The search stops at the first node that matches `relation` and
    respects `self.iframe_restrict`.
    """

    walker = self.get_descendants if recursive else self.get_children
    outcome = False
    for node in walker(parent, no_iframe=self.iframe_restrict):
        outcome = self.match_selectors(node, relation)
        if outcome:
            return outcome
    return outcome
| 762 | |
def match_future_relations(self, el, relation):
    """
    Match a forward-looking relationship (descendants or following siblings).

    The kind is taken from `relation[0].rel_type`:
    any descendant, direct child, any following sibling tag, or the closest
    following sibling tag.
    """

    kind = relation[0].rel_type
    found = False

    if kind == REL_HAS_PARENT:
        found = self.match_future_child(el, relation, True)

    elif kind == REL_HAS_CLOSE_PARENT:
        found = self.match_future_child(el, relation)

    elif kind == REL_HAS_SIBLING:
        later = self.get_next(el)
        while later and not found:
            found = self.match_selectors(later, relation)
            later = self.get_next(later)

    elif kind == REL_HAS_CLOSE_SIBLING:
        later = self.get_next(el)
        if later and self.is_tag(later):
            found = self.match_selectors(later, relation)

    return found
| 781 | |
def match_relations(self, el, relation):
    """
    Match the relationship of `el` to other elements.

    Relationship types that start with `:` are forward looking and go to
    `match_future_relations`; all others go to `match_past_relations`.
    """

    forward_looking = relation[0].rel_type.startswith(':')
    handler = self.match_future_relations if forward_looking else self.match_past_relations
    return handler(el, relation)
| 793 | |
def match_id(self, el, ids):
    """Check that every ID in `ids` equals the `id` attribute of `el` (missing `id` counts as `''`)."""

    return not any(wanted != self.get_attribute_by_name(el, 'id', '') for wanted in ids)
| 803 | |
def match_classes(self, el, classes):
    """Check that every class name in `classes` is among the classes of `el`."""

    present = self.get_classes(el)
    return all(name in present for name in classes)
| 814 | |
def match_root(self, el):
    """
    Match element as root.

    The element must be reported as a root by `is_root`, and none of its
    siblings, before or after it, may be a tag, a content string containing
    non-whitespace, or CDATA.
    """

    root_state = self.is_root(el)
    if not root_state:
        return root_state

    def disqualifies(node):
        """Tell whether a sibling node prevents `el` from being the root."""

        return (
            self.is_tag(node) or (self.is_content_string(node) and node.strip()) or
            self.is_cdata(node)
        )

    # Walk every earlier sibling first, then every later one.
    for step in (self.get_previous, self.get_next):
        neighbor = step(el, tags=False)
        while neighbor is not None:
            if disqualifies(neighbor):
                return False
            neighbor = step(neighbor, tags=False)

    return root_state
| 840 | |
def match_scope(self, el):
    """Tell whether `el` is the very element stored as the scope."""

    scope = self.scope
    return el is scope
| 845 | |
def match_nth_tag_type(self, el, child):
    """Tell whether `child` has the same tag name and the same namespace as `el` (for `nth` `of-type`)."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
| 853 | |
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    Every entry of `nth` must accept `el`. For each entry the following
    attributes are read: `selectors` (optional `of S` filter), `last` (count
    from the end), `a`, `b` and `n` (the index is `a * count + b` when `n` is
    set, otherwise just `a`), and `of_type` (only count siblings with the same
    tag type as `el`).

    Returns `True` when `nth` is empty or every entry matched, else `False`.
    """

    matched = True

    for n in nth:
        matched = False
        # `el` itself must satisfy the `of S` selector, if one was given.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # Parentless element: wrap it so it can be walked as a child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Position in `parent`'s contents where scanning starts.
        index = last_index if last else 0
        # Number of qualifying siblings counted so far (first one is 1).
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Scan direction through the parent's contents.
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    # Previously too high and now too low: stop adjusting.
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    # Not getting any closer to the valid range.
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    # Previously too low and now too high: stop adjusting.
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    # Not getting any closer to the valid range.
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            # `index` and `relative_index` carry over between passes, so each
            # pass resumes scanning where the previous one stopped.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Reached the wanted position but it is another element;
                        # move on to the next candidate index.
                        break
                # `el` was reached, so no later index can select it.
                if child is el:
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The index no longer changes, so there is nothing further to try.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
| 954 | |
def match_empty(self, el):
    """
    Report whether the element is empty.

    The element is considered empty when none of the nodes yielded by
    `get_children(el, tags=False)` is a tag, and no content string among
    them contains a character other than space, tab, CR, LF or form feed.
    """

    for node in self.get_children(el, tags=False):
        # Any child element makes the parent non-empty.
        if self.is_tag(node):
            return False
        # Whitespace-only text is allowed; anything else is content.
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):
            return False
    return True
| 967 | |
def match_subselectors(self, el, selectors):
    """
    Check that the element matches every sub-selector list.

    Each entry of `selectors` is handed to `match_selectors`; the element
    matches only if all of them match.  Evaluation stops at the first
    failing entry, since later entries cannot change the outcome.  An empty
    `selectors` sequence matches.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)
| 976 | |
def match_contains(self, el, contains):
    """
    Match the element if it contains the requested text.

    `contains` is a sequence of entries, each with a `text` collection and
    an `own` flag.  Every entry must be satisfied, and an entry is
    satisfied when at least one of its `text` values is found:

    - `own` entries are tested against each piece returned by
      `get_own_text` (iterated as a sequence of strings).
    - other entries are tested as a substring of the value returned by
      `get_text`.

    The two kinds of content are cached independently.  Sharing a single
    cache would test the second kind of entry against the wrong data when a
    selector mixes `own` and non-`own` entries.
    """

    own_content = None
    full_content = None

    for contain_list in contains:
        if contain_list.own:
            # Fetch the element's own text pieces lazily, once.
            if own_content is None:
                own_content = self.get_own_text(el, no_iframe=self.is_html)
            found = any(
                text in piece
                for text in contain_list.text
                for piece in own_content
            )
        else:
            # Fetch the element's full text lazily, once.
            if full_content is None:
                full_content = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_content for text in contain_list.text)

        # All entries must match, so the first miss decides the result.
        if not found:
            return False

    return True
| 1004 | |
def match_default(self, el):
    """
    Check whether the element is the default submit button of its form.

    The nearest HTML `form` ancestor is located first.  The first `input`
    or `button` descendant of that form whose `type` is `submit` is taken
    as the default and remembered in `cached_default_forms`, so later calls
    for the same form are answered from the cache.
    """

    # Walk up the ancestors until an HTML `form` is found (or we run out).
    form = None
    ancestor = self.get_parent(el, no_iframe=True)
    while ancestor and form is None:
        if self.get_tag(ancestor) == 'form' and self.is_html_tag(ancestor):
            form = ancestor
        else:
            ancestor = self.get_parent(ancestor, no_iframe=True)

    # A cached form already knows its default button.
    for cached_form, cached_button in self.cached_default_forms:
        if cached_form is form:
            return cached_button is el

    # Not cached: the first submit control found in the form is the default.
    for node in self.get_descendants(form, no_iframe=True):
        tag_name = self.get_tag(node)
        # Can't do nested forms (haven't figured out why we never hit this)
        if tag_name == 'form':  # pragma: no cover
            break
        if tag_name in ('input', 'button'):
            type_value = self.get_attribute_by_name(node, 'type', '')
            if type_value and util.lower(type_value) == 'submit':
                self.cached_default_forms.append([form, node])
                return el is node
    return False
| 1043 | |
def match_indeterminate(self, el):
    """
    Match an element whose radio group has no checked member.

    The element's `name` attribute and its owning form identify the group.
    The group is indeterminate when no other `input` descendant of that form
    is a radio button with the same name and a `checked` attribute.  The
    outcome for each (form, name) pair is stored in
    `cached_indeterminate_forms` and reused on later calls.
    """

    match = False
    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """Find this input's form, or the topmost ancestor when there is no form."""
        form = None
        parent = self.get_parent(el, no_iframe=True)
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # Ran out of ancestors: fall back to the last one seen.
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            # The element itself is not considered here, only its fellow inputs.
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # A checked radio of the same name only counts if it belongs to the same form.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        # Remember the verdict for this (form, name) pair.
        self.cached_indeterminate_forms.append([form, name, match])

    return match
| 1104 | |
def match_lang(self, el, langs):
    """
    Match the element's language against the requested language patterns.

    The language is taken from the closest `lang` (or namespaced `xml:lang`)
    attribute found on the element or one of its ancestors.  When none is
    found and the document is treated as HTML, a
    `<meta http-equiv="content-language" content="...">` inside `html > head`
    is used instead, and the result for that root is stored in
    `cached_meta_lang`.

    `langs` is an iterable of pattern groups.  Every group must have at least
    one pattern accepted by `extended_language_filter` for the match to succeed.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    # NOTE(review): an empty `lang` value is falsy, so the walk continues to the
    # parent as if no attribute had been found -- confirm this is intended.
    parent = el
    found_lang = None
    last = None
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        if parent is None:
            # No more ancestors: the last node visited becomes the root used below.
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached value of `False` records that an earlier meta search found nothing;
    # since it is not `None`, it also prevents the search below from repeating.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                # NOTE(review): the HTML check is made on `parent` (the head), not on `child`.
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
        if not found_lang:
            # Record the miss so the meta search is not repeated for this root.
            self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            # Every pattern group must match; stop at the first group that does not.
            if not match:
                break

    return match
| 1190 | |
def match_dir(self, el, directionality):
    """
    Check the element's directionality.

    `directionality` is a combination of `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL`.
    The element's `dir` attribute is consulted first; `auto` (and `bdi`
    without `dir`) is resolved from text content, and elements without a
    usable `dir` defer to their parent by recursion.  Returns `False` when
    `el` is `None` or is not an HTML tag.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Also terminates the parent recursion once it walks past the top of the tree.
    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    # (`None` means missing/unrecognized `dir`, `0` means `dir="auto"`).
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A textarea's value is the concatenation of its content strings.
            value = []
            for node in self.get_contents(el, no_iframe=True):
                if self.is_content_string(node):
                    value.append(node)
            value = ''.join(value)
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first strongly directional character decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
| 1253 | |
def match_range(self, el, condition):
    """
    Evaluate an in-range / out-of-range condition for an input element.

    The behavior follows what browsers appear to do: a value is tested for
    being out of range, and anything not out of range is in range.  A
    missing or unparsable value is therefore never out of range, which
    makes it in range.  When neither `min` nor `max` yields a usable bound,
    no range exists and the result is `False` for either condition.

    `condition` carries `ct.SEL_IN_RANGE` when in-range is being asked for;
    otherwise the out-of-range verdict is returned.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))

    def parsed(attr_name):
        """Return the parsed attribute value, or `None` when the attribute is absent."""
        raw = self.get_attribute_by_name(el, attr_name, None)
        return Inputs.parse_value(itype, raw) if raw is not None else None

    mn = parsed('min')
    mx = parsed('max')

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    value = parsed('value')

    out_of_range = False
    if value is not None:
        if itype == "time" and mn is not None and mx is not None and mn > mx:
            # Time is periodic, so this is a reversed/discontinuous range
            out_of_range = value < mn and value > mx
        elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            # Ordinary range: below the minimum or above the maximum.
            out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
| 1299 | |
def match_defined(self, el):
    """
    Match defined.

    `:defined` is related to custom elements in a browser.  Here an element
    is treated as defined when any of the following holds:

    - Its tag name has no hyphen (it is not a custom element name).
    - Its tag name contains a colon.
    - `get_prefix` reports a prefix for it.

    An element for which `get_tag` returns `None` is never defined; without
    this guard the name tests would raise `AttributeError`.

    This of course requires the parser to provide the proper prefix
    information; if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    if name is None:
        return False
    return (
        '-' not in name or
        ':' in name or
        self.get_prefix(el) is not None
    )
| 1320 | |
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to HTML spec.

    The placeholder is considered shown when the element's text, as
    returned by `get_text`, is empty.  A single newline does not count as
    content.
    """

    return self.get_text(el) in ('', '\n')
| 1335 | |
def match_selectors(self, el, selectors):
    """
    Check if element matches one of the selectors.

    `selectors` is an iterable selector list exposing `is_not` and `is_html`.
    Each selector is tested through a chain of checks; the first selector
    that passes every check decides the result.  For a negated list
    (`is_not`), the sense is inverted: the result is `False` as soon as one
    selector fully matches.

    Lists flagged `is_html` are evaluated with a temporary `html` namespace
    map and with `iframe_restrict` enabled, and only when the document is
    HTML.  The previous values are restored in a `finally` clause so an
    exception raised during matching cannot leave the matcher with the
    temporary settings.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                # Until this selector passes every check, the result is the "no match" value.
                match = is_not
                # We have an un-matchable situation (like `:focus`, as you cannot focus an element
                # in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit
                # button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in
                # a form that are also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
| 1419 | |
def select(self, limit=0):
    """
    Yield every matching tag under the targeted tag.

    Descendants of `self.tag` are visited in the order `get_descendants`
    produces them.  A `limit` below 1 means no limit; otherwise iteration
    stops once `limit` matches have been yielded.
    """

    remaining = limit if limit >= 1 else None

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
| 1433 | |
def closest(self):
    """
    Return the closest matching ancestor, starting with the tag itself.

    The search begins at `self.tag` and moves up through `get_parent` until
    a node matches.  Returns `None` when the chain ends without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
| 1445 | |
def filter(self):  # noqa A001
    """
    Return the matching nodes among the targeted tag's contents.

    Navigable strings are skipped; every other node returned by
    `get_contents(self.tag)` is kept when it matches.
    """

    matched = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            matched.append(node)
    return matched
| 1450 | |
def match(self, el):
    """
    Match a single node against this matcher's selectors.

    Document objects and non-tag nodes never match; anything else is
    decided by `match_selectors` using `self.selectors`.
    """

    if self.is_doc(el):
        return False
    tag_check = self.is_tag(el)
    if not tag_check:
        return tag_check
    return self.match_selectors(el, self.selectors)
| 1455 | |
| 1456 | |
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Combines the `_DocumentNav` and `_Match` base classes into the concrete
    matcher; it adds no members of its own.
    """
| 1459 | |
| 1460 | |
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds the original pattern together with its compiled selectors,
    namespaces, custom selectors and flags, and exposes the matching,
    filtering and selecting operations by delegating to `CSSMatch`.
    """

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Store the compiled selector data through the immutable base."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, scope):
        """Build a `CSSMatch` for `scope` from this object's compiled selector data."""

        return CSSMatch(self.selectors, scope, self.namespaces, self.flags)

    def match(self, tag):
        """Report whether `tag` matches the compiled selector."""

        return self._matcher(tag).match(tag)

    def closest(self, tag):
        """Return the closest matching ancestor of `tag` (the tag itself included)."""

        return self._matcher(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()
        return [
            node
            for node in iterable
            if not CSSMatch.is_navigable_string(node) and self.match(node)
        ]

    def select_one(self, tag):
        """Return the first matching tag under `tag`, or `None` when nothing matches."""

        found = self.select(tag, limit=1)
        if found:
            return found[0]
        return None

    def select(self, tag, limit=0):
        """Return a list of the matching tags under `tag`, honoring `limit`."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate over the matching tags under `tag`, honoring `limit`."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self):  # pragma: no cover
        """Representation."""

        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__
| 1532 | |
| 1533 | |
# Hand the `SoupSieve` class to the `pickle_register` helper from `css_types`
# (presumably so compiled selector objects can be pickled -- see that helper).
ct.pickle_register(SoupSieve)
