1"""CSS matcher.""" 2from datetime import datetime 3from . import util 4import re 5from . import css_types as ct 6import unicodedata 7import bs4 # type: ignore[import] 8from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast 9 10# Empty tag pattern (whitespace okay) 11RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') 12 13RE_NOT_WS = re.compile('[^ \t\r\n\f]+') 14 15# Relationships 16REL_PARENT = ' ' 17REL_CLOSE_PARENT = '>' 18REL_SIBLING = '~' 19REL_CLOSE_SIBLING = '+' 20 21# Relationships for :has() (forward looking) 22REL_HAS_PARENT = ': ' 23REL_HAS_CLOSE_PARENT = ':>' 24REL_HAS_SIBLING = ':~' 25REL_HAS_CLOSE_SIBLING = ':+' 26 27NS_XHTML = 'http://www.w3.org/1999/xhtml' 28NS_XML = 'http://www.w3.org/XML/1998/namespace' 29 30DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL 31RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE 32 33DIR_MAP = { 34 'ltr': ct.SEL_DIR_LTR, 35 'rtl': ct.SEL_DIR_RTL, 36 'auto': 0 37} 38 39RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") 40RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') 41RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') 42RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') 43RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') 44RE_DATETIME = re.compile( 45 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' 46) 47RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') 48 49MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November 50FEB = 2 51SHORT_MONTH = 30 52LONG_MONTH = 31 53FEB_MONTH = 28 54FEB_LEAP_MONTH = 29 55DAYS_IN_WEEK = 7 56 57 58class _FakeParent: 59 """ 60 Fake parent class. 61 62 When we have a fragment with no `BeautifulSoup` document object, 63 we can't evaluate `nth` selectors properly. Create a temporary 64 fake parent so we can traverse the root element as a child. 
65 """ 66 67 def __init__(self, element: 'bs4.Tag') -> None: 68 """Initialize.""" 69 70 self.contents = [element] 71 72 def __len__(self) -> 'bs4.PageElement': 73 """Length.""" 74 75 return len(self.contents) 76 77 78class _DocumentNav: 79 """Navigate a Beautiful Soup document.""" 80 81 @classmethod 82 def assert_valid_input(cls, tag: Any) -> None: 83 """Check if valid input tag or document.""" 84 85 # Fail on unexpected types. 86 if not cls.is_tag(tag): 87 raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag))) 88 89 @staticmethod 90 def is_doc(obj: 'bs4.Tag') -> bool: 91 """Is `BeautifulSoup` object.""" 92 return isinstance(obj, bs4.BeautifulSoup) 93 94 @staticmethod 95 def is_tag(obj: 'bs4.PageElement') -> bool: 96 """Is tag.""" 97 return isinstance(obj, bs4.Tag) 98 99 @staticmethod 100 def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover 101 """Is declaration.""" 102 return isinstance(obj, bs4.Declaration) 103 104 @staticmethod 105 def is_cdata(obj: 'bs4.PageElement') -> bool: 106 """Is CDATA.""" 107 return isinstance(obj, bs4.CData) 108 109 @staticmethod 110 def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover 111 """Is processing instruction.""" 112 return isinstance(obj, bs4.ProcessingInstruction) 113 114 @staticmethod 115 def is_navigable_string(obj: 'bs4.PageElement') -> bool: 116 """Is navigable string.""" 117 return isinstance(obj, bs4.NavigableString) 118 119 @staticmethod 120 def is_special_string(obj: 'bs4.PageElement') -> bool: 121 """Is special string.""" 122 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 123 124 @classmethod 125 def is_content_string(cls, obj: 'bs4.PageElement') -> bool: 126 """Check if node is content string.""" 127 128 return cls.is_navigable_string(obj) and not cls.is_special_string(obj) 129 130 @staticmethod 131 def create_fake_parent(el: 'bs4.Tag') -> _FakeParent: 132 """Create 
fake parent for a given element.""" 133 134 return _FakeParent(el) 135 136 @staticmethod 137 def is_xml_tree(el: 'bs4.Tag') -> bool: 138 """Check if element (or document) is from a XML tree.""" 139 140 return bool(el._is_xml) 141 142 def is_iframe(self, el: 'bs4.Tag') -> bool: 143 """Check if element is an `iframe`.""" 144 145 return bool( 146 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and 147 self.is_html_tag(el) # type: ignore[attr-defined] 148 ) 149 150 def is_root(self, el: 'bs4.Tag') -> bool: 151 """ 152 Return whether element is a root element. 153 154 We check that the element is the root of the tree (which we have already pre-calculated), 155 and we check if it is the root element under an `iframe`. 156 """ 157 158 root = self.root and self.root is el # type: ignore[attr-defined] 159 if not root: 160 parent = self.get_parent(el) 161 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] 162 return root 163 164 def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']: 165 """Get contents or contents in reverse.""" 166 if not no_iframe or not self.is_iframe(el): 167 for content in el.contents: 168 yield content 169 170 def get_children( 171 self, 172 el: 'bs4.Tag', 173 start: Optional[int] = None, 174 reverse: bool = False, 175 tags: bool = True, 176 no_iframe: bool = False 177 ) -> Iterator['bs4.PageElement']: 178 """Get children.""" 179 180 if not no_iframe or not self.is_iframe(el): 181 last = len(el.contents) - 1 182 if start is None: 183 index = last if reverse else 0 184 else: 185 index = start 186 end = -1 if reverse else last + 1 187 incr = -1 if reverse else 1 188 189 if 0 <= index <= last: 190 while index != end: 191 node = el.contents[index] 192 index += incr 193 if not tags or self.is_tag(node): 194 yield node 195 196 def get_descendants( 197 self, 198 el: 'bs4.Tag', 199 tags: bool = True, 200 no_iframe: bool = False 201 ) -> 
Iterator['bs4.PageElement']: 202 """Get descendants.""" 203 204 if not no_iframe or not self.is_iframe(el): 205 next_good = None 206 for child in el.descendants: 207 208 if next_good is not None: 209 if child is not next_good: 210 continue 211 next_good = None 212 213 is_tag = self.is_tag(child) 214 215 if no_iframe and is_tag and self.is_iframe(child): 216 if child.next_sibling is not None: 217 next_good = child.next_sibling 218 else: 219 last_child = child 220 while self.is_tag(last_child) and last_child.contents: 221 last_child = last_child.contents[-1] 222 next_good = last_child.next_element 223 yield child 224 if next_good is None: 225 break 226 # Coverage isn't seeing this even though it's executed 227 continue # pragma: no cover 228 229 if not tags or is_tag: 230 yield child 231 232 def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag': 233 """Get parent.""" 234 235 parent = el.parent 236 if no_iframe and parent is not None and self.is_iframe(parent): 237 parent = None 238 return parent 239 240 @staticmethod 241 def get_tag_name(el: 'bs4.Tag') -> Optional[str]: 242 """Get tag.""" 243 244 return cast(Optional[str], el.name) 245 246 @staticmethod 247 def get_prefix_name(el: 'bs4.Tag') -> Optional[str]: 248 """Get prefix.""" 249 250 return cast(Optional[str], el.prefix) 251 252 @staticmethod 253 def get_uri(el: 'bs4.Tag') -> Optional[str]: 254 """Get namespace `URI`.""" 255 256 return cast(Optional[str], el.namespace) 257 258 @classmethod 259 def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': 260 """Get next sibling tag.""" 261 262 sibling = el.next_sibling 263 while tags and not cls.is_tag(sibling) and sibling is not None: 264 sibling = sibling.next_sibling 265 return sibling 266 267 @classmethod 268 def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': 269 """Get previous sibling tag.""" 270 271 sibling = el.previous_sibling 272 while tags and not cls.is_tag(sibling) and sibling is not None: 
273 sibling = sibling.previous_sibling 274 return sibling 275 276 @staticmethod 277 def has_html_ns(el: 'bs4.Tag') -> bool: 278 """ 279 Check if element has an HTML namespace. 280 281 This is a bit different than whether a element is treated as having an HTML namespace, 282 like we do in the case of `is_html_tag`. 283 """ 284 285 ns = getattr(el, 'namespace') if el else None 286 return bool(ns and ns == NS_XHTML) 287 288 @staticmethod 289 def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]: 290 """Return namespace and attribute name without the prefix.""" 291 292 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) 293 294 @classmethod 295 def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]: 296 """Normalize the value to be a string or list of strings.""" 297 298 # Treat `None` as empty string. 299 if value is None: 300 return '' 301 302 # Pass through strings 303 if (isinstance(value, str)): 304 return value 305 306 # If it's a byte string, convert it to Unicode, treating it as UTF-8. 307 if isinstance(value, bytes): 308 return value.decode("utf8") 309 310 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. 311 if isinstance(value, Sequence): 312 new_value = [] 313 for v in value: 314 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): 315 # This is most certainly a user error and will crash and burn later. 316 # To keep things working, we'll do what we do with all objects, 317 # And convert them to strings. 
class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """Validate day: `day` must exist in the given `month` of `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Validate week.

        `week` must lie between 1 and the number of ISO weeks in `year`
        (52 or 53).
        """

        # December 28th always falls in the last ISO week of its year, so its week number is the
        # week count.  (December 31st may already belong to week 1 of the following year.)
        # The Gregorian calendar repeats every 400 years, which is a whole number of weeks, so the
        # year is folded into 1..400 to stay inside the range `datetime` can represent.
        max_week = datetime(((year - 1) % 400) + 1, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
        """
        Parse the input value.

        `itype` selects the expected format (`date`, `month`, `week`, `time`,
        `datetime-local`, `number`, or `range`).  Returns a tuple of the numeric
        components, or `None` when `value` is `None`, the type is not handled,
        the text does not match the format, or a component fails validation.
        """

        parsed = None  # type: Optional[Tuple[float, ...]]
        if value is None:
            return None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed
def get_tag_ns(self, el: 'bs4.Tag') -> str:
    """
    Get tag namespace.

    When namespaces are not supported, every tag is reported as being in the
    XHTML namespace.  Otherwise the element's own namespace URI is returned,
    with a missing or empty URI reported as the empty string.
    """

    if not self.supports_namespaces():
        return NS_XHTML

    uri = self.get_uri(el)
    return uri if uri else ''
def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
    """
    Filter the language tags.

    Compares the `-` separated subtags of `lang_tag` against those of
    `lang_range`, case-insensitively, after wildcard sequences in the range
    have been collapsed.  Returns whether the tag satisfies the range.
    """

    wanted = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    have = lang_tag.lower().split('-')

    # Primary tag needs to match (unless the range opens with a wildcard).
    if wanted[0] != '*' and wanted[0] != have[0]:
        return False

    wanted_count = len(wanted)
    have_count = len(have)
    wanted_pos = 1
    have_pos = 1

    # Match until we run out of ranges
    while wanted_pos < wanted_count:
        # Ran out of subtags, but we still have ranges
        if have_pos >= have_count:
            return False

        expected = wanted[wanted_pos]
        subtag = have[have_pos]

        # Empty range
        if not expected:
            return False

        if subtag == expected:
            # Matched range
            wanted_pos += 1
        elif len(subtag) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched, or implicitly skipped, so grab next subtag
        have_pos += 1

    return True
def match_attribute_name(
    self,
    el: 'bs4.Tag',
    attr: str,
    prefix: Optional[str]
) -> Optional[Union[str, Sequence[str]]]:
    """
    Match attribute name and return value if it exists.

    Returns the value of the first attribute of `el` that matches `attr`
    (and `prefix`, when namespaces are supported), or `None` when nothing
    matches or when `prefix` is not a registered namespace prefix.
    Names are compared case-insensitively unless the document is XML.
    """

    value = None
    if self.supports_namespaces():
        value = None
        # If we have not defined namespaces, we can't very well find them, so don't bother trying.
        if prefix:
            ns = self.namespaces.get(prefix)
            # An unregistered prefix can never match; `*` is allowed through without a mapping.
            if ns is None and prefix != '*':
                return None
        else:
            ns = None

        for k, v in self.iter_attributes(el):

            # Get attribute parts
            namespace, name = self.split_namespace(el, k)

            # Can't match a prefix attribute as we haven't specified one to match
            # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
            if ns is None:
                if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
                    value = v
                    break
                # Coverage is not finding this even though it is executed.
                # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
                # Ignore the false positive message.
                continue  # pragma: no cover

            # We can't match our desired prefix attribute as the attribute doesn't have a prefix
            # (`and` binds tighter than `or`: skip when there is no namespace, or when the
            # namespace differs and the prefix is not the wildcard).
            if namespace is None or ns != namespace and prefix != '*':
                continue

            # The attribute doesn't match.
            if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
                continue

            value = v
            break
    else:
        # No namespace support: compare whole attribute names case-insensitively.
        for k, v in self.iter_attributes(el):
            if util.lower(attr) != util.lower(k):
                continue
            value = v
            break
    return value
def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
    """
    Match attributes.

    Every attribute selector must find its attribute on `el`, and, when the
    selector carries a pattern, the attribute value must match it.  A list
    value is joined with single spaces before being tested.
    """

    for selector_attr in attributes:
        found = self.match_attribute_name(el, selector_attr.attribute, selector_attr.prefix)

        # XML documents use the XML specific pattern when the selector provides one.
        if self.is_xml and selector_attr.xml_type_pattern:
            pattern = selector_attr.xml_type_pattern
        else:
            pattern = selector_attr.pattern

        # The attribute is not present at all.
        if found is None:
            return False

        text = found if isinstance(found, str) else ' '.join(found)

        # No pattern means that presence alone is enough.
        if pattern is not None and pattern.match(text) is None:
            return False

    return True
def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
    """
    Match future child.

    Tests `relation` against the children of `parent` (or against all of its
    descendants when `recursive` is set) and stops at the first match.
    """

    walker = (
        self.get_descendants if recursive else self.get_children
    )  # type: Callable[..., Iterator['bs4.Tag']]

    result = False
    for candidate in walker(parent, no_iframe=self.iframe_restrict):
        result = self.match_selectors(candidate, relation)
        if result:
            return result
    return result
def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
    """
    Match element's ID.

    Returns `True` only when every entry of `ids` equals the element's `id`
    attribute, where a missing attribute is treated as the empty string.
    An empty `ids` tuple matches trivially.
    """

    if not ids:
        return True

    # The attribute value does not depend on the ID being compared, so look it up once.
    el_id = self.get_attribute_by_name(el, 'id', '')
    return all(i == el_id for i in ids)
def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
    """
    Match `nth` elements.

    Each entry of `nth` supplies the coefficients `a` and `b`, a flag `n` saying
    whether the index is variable (`a * count + b`) or fixed (`a`), a `last`
    flag to count from the end, an `of_type` flag, and optional `selectors`.
    `el` must satisfy every entry; the first entry that fails ends the check.
    """

    matched = True

    for n in nth:
        matched = False
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # No real parent: wrap the element so it can still be walked as a child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Start walking from the end when counting from the last child.
        index = last_index if last else 0
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                # Track the position in `contents` so the next pass resumes after this node.
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        break
                if child is el:
                    break
            # Once the element itself has been reached there is nothing further to evaluate.
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # A fixed index never changes, so stop instead of looping forever.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
def match_empty(self, el: 'bs4.Tag') -> bool:
    """
    Check if element is empty (if requested).

    An element is empty when it has no child tags and none of its content
    strings contain a non-whitespace character.
    """

    for node in self.get_children(el, tags=False):
        # Any child element means the element has content.
        if self.is_tag(node):
            return False
        # So does any text that is more than whitespace.
        if self.is_content_string(node) and RE_NOT_EMPTY.search(node):
            return False
    return True
def match_default(self, el: 'bs4.Tag') -> bool:
    """
    Match default.

    `el` matches when it is the first `input` or `button` with `type="submit"`
    found among the descendants of its nearest enclosing HTML `form`.  The
    result of that search is remembered per form in `cached_default_forms`.
    Returns `False` when `el` has no enclosing form.
    """

    match = False

    # Find this input's form
    form = None
    parent = self.get_parent(el, no_iframe=True)
    while parent and form is None:
        if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
            form = parent
        else:
            parent = self.get_parent(parent, no_iframe=True)

    # Without an enclosing form there is no default button to be, and there is
    # nothing to search: walking the descendants of `None` would raise.
    if form is None:
        return match

    # Look in form cache to see if we've already located its default button
    found_form = False
    for f, t in self.cached_default_forms:
        if f is form:
            found_form = True
            if t is el:
                match = True
            break

    # We didn't have the form cached, so look for its default button
    if not found_form:
        for child in self.get_descendants(form, no_iframe=True):
            name = self.get_tag(child)
            # Can't do nested forms (haven't figured out why we never hit this)
            if name == 'form':  # pragma: no cover
                break
            if name in ('input', 'button'):
                v = self.get_attribute_by_name(child, 'type', '')
                if v and util.lower(v) == 'submit':
                    # The first submit control is the form's default; remember it for later lookups.
                    self.cached_default_forms.append((form, child))
                    if el is child:
                        match = True
                    break
    return match
is_radio = False 1131 check = False 1132 has_name = False 1133 for k, v in self.iter_attributes(child): 1134 if util.lower(k) == 'type' and util.lower(v) == 'radio': 1135 is_radio = True 1136 elif util.lower(k) == 'name' and v == name: 1137 has_name = True 1138 elif util.lower(k) == 'checked': 1139 check = True 1140 if is_radio and check and has_name and get_parent_form(child) is form: 1141 checked = True 1142 break 1143 if checked: 1144 break 1145 if not checked: 1146 match = True 1147 self.cached_indeterminate_forms.append((form, name, match)) 1148 1149 return match 1150 1151 def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool: 1152 """Match languages.""" 1153 1154 match = False 1155 has_ns = self.supports_namespaces() 1156 root = self.root 1157 has_html_namespace = self.has_html_namespace 1158 1159 # Walk parents looking for `lang` (HTML) or `xml:lang` XML property. 1160 parent = el 1161 found_lang = None 1162 last = None 1163 while not found_lang: 1164 has_html_ns = self.has_html_ns(parent) 1165 for k, v in self.iter_attributes(parent): 1166 attr_ns, attr = self.split_namespace(parent, k) 1167 if ( 1168 ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or 1169 ( 1170 has_ns and not has_html_ns and attr_ns == NS_XML and 1171 (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang' 1172 ) 1173 ): 1174 found_lang = v 1175 break 1176 last = parent 1177 parent = self.get_parent(parent, no_iframe=self.is_html) 1178 1179 if parent is None: 1180 root = last 1181 has_html_namespace = self.has_html_ns(root) 1182 parent = last 1183 break 1184 1185 # Use cached meta language. 1186 if not found_lang and self.cached_meta_lang: 1187 for cache in self.cached_meta_lang: 1188 if root is cache[0]: 1189 found_lang = cache[1] 1190 1191 # If we couldn't find a language, and the document is HTML, look to meta to determine language. 
1192 if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): 1193 # Find head 1194 found = False 1195 for tag in ('html', 'head'): 1196 found = False 1197 for child in self.get_children(parent, no_iframe=self.is_html): 1198 if self.get_tag(child) == tag and self.is_html_tag(child): 1199 found = True 1200 parent = child 1201 break 1202 if not found: # pragma: no cover 1203 break 1204 1205 # Search meta tags 1206 if found: 1207 for child in parent: 1208 if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): 1209 c_lang = False 1210 content = None 1211 for k, v in self.iter_attributes(child): 1212 if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': 1213 c_lang = True 1214 if util.lower(k) == 'content': 1215 content = v 1216 if c_lang and content: 1217 found_lang = content 1218 self.cached_meta_lang.append((cast(str, root), cast(str, found_lang))) 1219 break 1220 if found_lang: 1221 break 1222 if not found_lang: 1223 self.cached_meta_lang.append((cast(str, root), '')) 1224 1225 # If we determined a language, compare. 1226 if found_lang: 1227 for patterns in langs: 1228 match = False 1229 for pattern in patterns: 1230 if self.extended_language_filter(pattern, cast(str, found_lang)): 1231 match = True 1232 if not match: 1233 break 1234 1235 return match 1236 1237 def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool: 1238 """Check directionality.""" 1239 1240 # If we have to match both left and right, we can't match either. 
1241 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: 1242 return False 1243 1244 if el is None or not self.is_html_tag(el): 1245 return False 1246 1247 # Element has defined direction of left to right or right to left 1248 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) 1249 if direction not in (None, 0): 1250 return direction == directionality 1251 1252 # Element is the document element (the root) and no direction assigned, assume left to right. 1253 is_root = self.is_root(el) 1254 if is_root and direction is None: 1255 return ct.SEL_DIR_LTR == directionality 1256 1257 # If `input[type=telephone]` and no direction is assigned, assume left to right. 1258 name = self.get_tag(el) 1259 is_input = name == 'input' 1260 is_textarea = name == 'textarea' 1261 is_bdi = name == 'bdi' 1262 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else '' 1263 if is_input and itype == 'tel' and direction is None: 1264 return ct.SEL_DIR_LTR == directionality 1265 1266 # Auto handling for text inputs 1267 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0: 1268 if is_textarea: 1269 temp = [] 1270 for node in self.get_contents(el, no_iframe=True): 1271 if self.is_content_string(node): 1272 temp.append(node) 1273 value = ''.join(temp) 1274 else: 1275 value = cast(str, self.get_attribute_by_name(el, 'value', '')) 1276 if value: 1277 for c in value: 1278 bidi = unicodedata.bidirectional(c) 1279 if bidi in ('AL', 'R', 'L'): 1280 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL 1281 return direction == directionality 1282 # Assume left to right 1283 return ct.SEL_DIR_LTR == directionality 1284 elif is_root: 1285 return ct.SEL_DIR_LTR == directionality 1286 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 1287 1288 # Auto handling for `bdi` and other non text inputs. 
1289 if (is_bdi and direction is None) or direction == 0: 1290 direction = self.find_bidi(el) 1291 if direction is not None: 1292 return direction == directionality 1293 elif is_root: 1294 return ct.SEL_DIR_LTR == directionality 1295 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 1296 1297 # Match parents direction 1298 return self.match_dir(self.get_parent(el, no_iframe=True), directionality) 1299 1300 def match_range(self, el: 'bs4.Tag', condition: int) -> bool: 1301 """ 1302 Match range. 1303 1304 Behavior is modeled after what we see in browsers. Browsers seem to evaluate 1305 if the value is out of range, and if not, it is in range. So a missing value 1306 will not evaluate out of range; therefore, value is in range. Personally, I 1307 feel like this should evaluate as neither in or out of range. 1308 """ 1309 1310 out_of_range = False 1311 1312 itype = util.lower(self.get_attribute_by_name(el, 'type')) 1313 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None))) 1314 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None))) 1315 1316 # There is no valid min or max, so we cannot evaluate a range 1317 if mn is None and mx is None: 1318 return False 1319 1320 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None))) 1321 if value is not None: 1322 if itype in ("date", "datetime-local", "month", "week", "number", "range"): 1323 if mn is not None and value < mn: 1324 out_of_range = True 1325 if not out_of_range and mx is not None and value > mx: 1326 out_of_range = True 1327 elif itype == "time": 1328 if mn is not None and mx is not None and mn > mx: 1329 # Time is periodic, so this is a reversed/discontinuous range 1330 if value < mn and value > mx: 1331 out_of_range = True 1332 else: 1333 if mn is not None and value < mn: 1334 out_of_range = True 1335 if not out_of_range and mx is not None and value > mx: 1336 out_of_range = True 1337 1338 
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range 1339 1340 def match_defined(self, el: 'bs4.Tag') -> bool: 1341 """ 1342 Match defined. 1343 1344 `:defined` is related to custom elements in a browser. 1345 1346 - If the document is XML (not XHTML), all tags will match. 1347 - Tags that are not custom (don't have a hyphen) are marked defined. 1348 - If the tag has a prefix (without or without a namespace), it will not match. 1349 1350 This is of course requires the parser to provide us with the proper prefix and namespace info, 1351 if it doesn't, there is nothing we can do. 1352 """ 1353 1354 name = self.get_tag(el) 1355 return ( 1356 name is not None and ( 1357 name.find('-') == -1 or 1358 name.find(':') != -1 or 1359 self.get_prefix(el) is not None 1360 ) 1361 ) 1362 1363 def match_placeholder_shown(self, el: 'bs4.Tag') -> bool: 1364 """ 1365 Match placeholder shown according to HTML spec. 1366 1367 - text area should be checked if they have content. A single newline does not count as content. 1368 1369 """ 1370 1371 match = False 1372 content = self.get_text(el) 1373 if content in ('', '\n'): 1374 match = True 1375 1376 return match 1377 1378 def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool: 1379 """Check if element matches one of the selectors.""" 1380 1381 match = False 1382 is_not = selectors.is_not 1383 is_html = selectors.is_html 1384 1385 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. 
1386 if is_html: 1387 namespaces = self.namespaces 1388 iframe_restrict = self.iframe_restrict 1389 self.namespaces = {'html': NS_XHTML} 1390 self.iframe_restrict = True 1391 1392 if not is_html or self.is_html: 1393 for selector in selectors: 1394 match = is_not 1395 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) 1396 if isinstance(selector, ct.SelectorNull): 1397 continue 1398 # Verify tag matches 1399 if not self.match_tag(el, selector.tag): 1400 continue 1401 # Verify tag is defined 1402 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): 1403 continue 1404 # Verify element is root 1405 if selector.flags & ct.SEL_ROOT and not self.match_root(el): 1406 continue 1407 # Verify element is scope 1408 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): 1409 continue 1410 # Verify element has placeholder shown 1411 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el): 1412 continue 1413 # Verify `nth` matches 1414 if not self.match_nth(el, selector.nth): 1415 continue 1416 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): 1417 continue 1418 # Verify id matches 1419 if selector.ids and not self.match_id(el, selector.ids): 1420 continue 1421 # Verify classes match 1422 if selector.classes and not self.match_classes(el, selector.classes): 1423 continue 1424 # Verify attribute(s) match 1425 if not self.match_attributes(el, selector.attributes): 1426 continue 1427 # Verify ranges 1428 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): 1429 continue 1430 # Verify language patterns 1431 if selector.lang and not self.match_lang(el, selector.lang): 1432 continue 1433 # Verify pseudo selector patterns 1434 if selector.selectors and not self.match_subselectors(el, selector.selectors): 1435 continue 1436 # Verify relationship selectors 1437 if selector.relation and not self.match_relations(el, selector.relation): 1438 continue 
1439 # Validate that the current default selector match corresponds to the first submit button in the form 1440 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): 1441 continue 1442 # Validate that the unset radio button is among radio buttons with the same name in a form that are 1443 # also not set. 1444 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): 1445 continue 1446 # Validate element directionality 1447 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): 1448 continue 1449 # Validate that the tag contains the specified text. 1450 if selector.contains and not self.match_contains(el, selector.contains): 1451 continue 1452 match = not is_not 1453 break 1454 1455 # Restore actual namespaces being used for external selector lists 1456 if is_html: 1457 self.namespaces = namespaces 1458 self.iframe_restrict = iframe_restrict 1459 1460 return match 1461 1462 def select(self, limit: int = 0) -> Iterator['bs4.Tag']: 1463 """Match all tags under the targeted tag.""" 1464 1465 lim = None if limit < 1 else limit 1466 1467 for child in self.get_descendants(self.tag): 1468 if self.match(child): 1469 yield child 1470 if lim is not None: 1471 lim -= 1 1472 if lim < 1: 1473 break 1474 1475 def closest(self) -> Optional['bs4.Tag']: 1476 """Match closest ancestor.""" 1477 1478 current = self.tag 1479 closest = None 1480 while closest is None and current is not None: 1481 if self.match(current): 1482 closest = current 1483 else: 1484 current = self.get_parent(current) 1485 return closest 1486 1487 def filter(self) -> List['bs4.Tag']: # noqa A001 1488 """Filter tag's children.""" 1489 1490 return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] 1491 1492 def match(self, el: 'bs4.Tag') -> bool: 1493 """Match.""" 1494 1495 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) 1496 1497 1498class 
SoupSieve(ct.Immutable): 1499 """Compiled Soup Sieve selector matching object.""" 1500 1501 pattern: str 1502 selectors: ct.SelectorList 1503 namespaces: Optional[ct.Namespaces] 1504 custom: Dict[str, str] 1505 flags: int 1506 1507 __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") 1508 1509 def __init__( 1510 self, 1511 pattern: str, 1512 selectors: ct.SelectorList, 1513 namespaces: Optional[ct.Namespaces], 1514 custom: Optional[ct.CustomSelectors], 1515 flags: int 1516 ): 1517 """Initialize.""" 1518 1519 super().__init__( 1520 pattern=pattern, 1521 selectors=selectors, 1522 namespaces=namespaces, 1523 custom=custom, 1524 flags=flags 1525 ) 1526 1527 def match(self, tag: 'bs4.Tag') -> bool: 1528 """Match.""" 1529 1530 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) 1531 1532 def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag': 1533 """Match closest ancestor.""" 1534 1535 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() 1536 1537 def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001 1538 """ 1539 Filter. 1540 1541 `CSSMatch` can cache certain searches for tags of the same document, 1542 so if we are given a tag, all tags are from the same document, 1543 and we can take advantage of the optimization. 1544 1545 Any other kind of iterable could have tags from different documents or detached tags, 1546 so for those, we use a new `CSSMatch` for each item in the iterable. 
1547 """ 1548 1549 if CSSMatch.is_tag(iterable): 1550 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() 1551 else: 1552 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] 1553 1554 def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag': 1555 """Select a single tag.""" 1556 1557 tags = self.select(tag, limit=1) 1558 return tags[0] if tags else None 1559 1560 def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']: 1561 """Select the specified tags.""" 1562 1563 return list(self.iselect(tag, limit)) 1564 1565 def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']: 1566 """Iterate the specified tags.""" 1567 1568 for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): 1569 yield el 1570 1571 def __repr__(self) -> str: # pragma: no cover 1572 """Representation.""" 1573 1574 return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format( 1575 self.pattern, 1576 self.namespaces, 1577 self.custom, 1578 self.flags 1579 ) 1580 1581 __str__ = __repr__ 1582 1583 1584ct.pickle_register(SoupSieve) 1585