1"""CSS matcher.""" 2from __future__ import unicode_literals 3from datetime import datetime 4from . import util 5import re 6from .import css_types as ct 7import unicodedata 8 9# Empty tag pattern (whitespace okay) 10RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') 11 12RE_NOT_WS = re.compile('[^ \t\r\n\f]+') 13 14# Relationships 15REL_PARENT = ' ' 16REL_CLOSE_PARENT = '>' 17REL_SIBLING = '~' 18REL_CLOSE_SIBLING = '+' 19 20# Relationships for :has() (forward looking) 21REL_HAS_PARENT = ': ' 22REL_HAS_CLOSE_PARENT = ':>' 23REL_HAS_SIBLING = ':~' 24REL_HAS_CLOSE_SIBLING = ':+' 25 26NS_XHTML = 'http://www.w3.org/1999/xhtml' 27NS_XML = 'http://www.w3.org/XML/1998/namespace' 28 29DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL 30RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE 31 32DIR_MAP = { 33 'ltr': ct.SEL_DIR_LTR, 34 'rtl': ct.SEL_DIR_RTL, 35 'auto': 0 36} 37 38RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") 39RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') 40RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') 41RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') 42RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') 43RE_DATETIME = re.compile( 44 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' 45) 46 47MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November 48FEB = 2 49SHORT_MONTH = 30 50LONG_MONTH = 31 51FEB_MONTH = 28 52FEB_LEAP_MONTH = 29 53DAYS_IN_WEEK = 7 54 55 56class _FakeParent(object): 57 """ 58 Fake parent class. 59 60 When we have a fragment with no `BeautifulSoup` document object, 61 we can't evaluate `nth` selectors properly. Create a temporary 62 fake parent so we can traverse the root element as a child. 
63 """ 64 65 def __init__(self, element): 66 """Initialize.""" 67 68 self.contents = [element] 69 70 def __len__(self): 71 """Length.""" 72 73 return len(self.contents) 74 75 76class _DocumentNav(object): 77 """Navigate a Beautiful Soup document.""" 78 79 @classmethod 80 def assert_valid_input(cls, tag): 81 """Check if valid input tag or document.""" 82 83 # Fail on unexpected types. 84 if not cls.is_tag(tag): 85 raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag))) 86 87 @staticmethod 88 def is_doc(obj): 89 """Is `BeautifulSoup` object.""" 90 91 import bs4 92 return isinstance(obj, bs4.BeautifulSoup) 93 94 @staticmethod 95 def is_tag(obj): 96 """Is tag.""" 97 98 import bs4 99 return isinstance(obj, bs4.Tag) 100 101 @staticmethod 102 def is_comment(obj): 103 """Is comment.""" 104 105 import bs4 106 return isinstance(obj, bs4.Comment) 107 108 @staticmethod 109 def is_declaration(obj): # pragma: no cover 110 """Is declaration.""" 111 112 import bs4 113 return isinstance(obj, bs4.Declaration) 114 115 @staticmethod 116 def is_cdata(obj): 117 """Is CDATA.""" 118 119 import bs4 120 return isinstance(obj, bs4.CData) 121 122 @staticmethod 123 def is_processing_instruction(obj): # pragma: no cover 124 """Is processing instruction.""" 125 126 import bs4 127 return isinstance(obj, bs4.ProcessingInstruction) 128 129 @staticmethod 130 def is_navigable_string(obj): 131 """Is navigable string.""" 132 133 import bs4 134 return isinstance(obj, bs4.NavigableString) 135 136 @staticmethod 137 def is_special_string(obj): 138 """Is special string.""" 139 140 import bs4 141 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) 142 143 @classmethod 144 def is_content_string(cls, obj): 145 """Check if node is content string.""" 146 147 return cls.is_navigable_string(obj) and not cls.is_special_string(obj) 148 149 @staticmethod 150 def create_fake_parent(el): 151 """Create fake parent for a given 
def get_descendants(self, el, tags=True, no_iframe=False):
    """
    Yield the descendants of `el` in document order.

    With `tags` set, only tag nodes are yielded. With `no_iframe` set, an
    `iframe` element is still yielded itself, but everything nested inside
    it is skipped; nothing at all is yielded when `el` is an `iframe`.
    """

    if no_iframe and self.is_iframe(el):
        return

    # While set, nodes are skipped until this exact node shows up again.
    resume_at = None

    for node in el.descendants:

        if resume_at is not None:
            if node is not resume_at:
                continue
            resume_at = None

        node_is_tag = self.is_tag(node)

        if not (no_iframe and node_is_tag and self.is_iframe(node)):
            if node_is_tag or not tags:
                yield node
            continue

        # An `iframe` we must not enter: work out the first node after its content.
        resume_at = node.next_sibling
        if resume_at is None:
            # No sibling, so step past the deepest, last nested node instead.
            deepest = node
            while self.is_tag(deepest) and deepest.contents:
                deepest = deepest.contents[-1]
            resume_at = deepest.next_element

        yield node

        if resume_at is None:
            # Nothing follows the `iframe`; the walk is finished.
            return
class Inputs(object):
    """
    Parse and validate the textual values of form inputs.

    `parse_value` turns the string found in a `value`, `min` or `max`
    attribute into something comparable (a tuple of integers or a float);
    the `validate_*` helpers check the individual components.
    """

    @staticmethod
    def validate_day(year, month, day):
        """Return whether `day` exists in the given `month` of `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Return whether `week` is a valid ISO 8601 week number for `year`.

        A year has 53 weeks when December 31 falls on a Thursday, or when
        December 31 of the previous year falls on a Wednesday; every other
        year has 52. The weekday is computed arithmetically so that any
        positive year works, including years with more than four digits
        (which the week pattern accepts but `datetime` cannot represent).
        """

        def dec_31_weekday(y):
            """Weekday of December 31 of year `y` (0 is Sunday, 4 is Thursday)."""

            return (y + y // 4 - y // 100 + y // 400) % 7

        # Note: December 31 landing in week 1 of the *next* year means this
        # year is a short one (52 weeks), not a long one.
        long_year = dec_31_weekday(year) == 4 or dec_31_weekday(year - 1) == 3
        max_week = 53 if long_year else 52
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Return whether `month` is in the range 1-12."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Return whether `year` is at least 1."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Return whether `hour` is in the range 0-23."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Return whether `minutes` is in the range 0-59."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` is the (lowercase) input type and `value` the attribute text.
        Returns a tuple of integers for the date and time types, ordered from
        most to least significant component so tuples compare chronologically,
        a `float` for `number` and `range`, or `None` when the type is not
        handled or the text is malformed or out of bounds.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                # The year is validated first, so `validate_week` only sees positive years.
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
def find_bidi(self, el):
    """
    Work out text direction from the content of `el`.

    Children are scanned in order. The first strongly directional character
    found decides the result: `ct.SEL_DIR_LTR` for `L`, `ct.SEL_DIR_RTL` for
    `AL` or `R`. Returns `None` when nothing conclusive is found.
    """

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Some elements are excluded from the analysis: a fixed list of
            # tag names, anything that is not an HTML tag, and anything that
            # carries its own recognized `dir` value.
            explicit_dir = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            excluded = (
                self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
                not self.is_html_tag(node) or
                explicit_dir is not None
            )
            if not excluded:
                # Descend; a nested answer is final, otherwise keep looking.
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested
            continue

        # Comments, `doctype`, and the like say nothing about direction.
        if self.is_special_string(node):
            continue

        # Plain text: look for the first strong character.
        for char in node:
            bidi_class = unicodedata.bidirectional(char)
            if bidi_class == 'L':
                return ct.SEL_DIR_LTR
            if bidi_class in ('AL', 'R'):
                return ct.SEL_DIR_RTL

    return None
def match_namespace(self, el, tag):
    """
    Check the element's namespace against the selector's namespace prefix.

    `tag.prefix` decides the rule: `None` (no prefix given) requires the
    default namespace when one is registered; an empty prefix (`|tag`)
    requires the element to have no namespace; `*` accepts anything; any
    other prefix must be registered and equal to the element's namespace.
    """

    prefix = tag.prefix
    el_ns = self.get_tag_ns(el)

    # No prefix: only the default namespace (when defined) can restrict us.
    if prefix is None:
        default_ns = self.namespaces.get('')
        return default_ns is None or el_ns == default_ns

    # `|tag`: the element must not be in any namespace.
    if prefix == '':
        return not el_ns

    # `*|tag`: any namespace will do.
    if prefix == '*':
        return True

    # `ns|tag`: the prefix has to be known and has to match.
    wanted_ns = self.namespaces.get(prefix, None)
    return wanted_ns is not None and el_ns == wanted_ns
def match_id(self, el, ids):
    """
    Match element's ID.

    Every entry of `ids` has to equal the element's `id` attribute (a
    missing attribute counts as an empty string). An empty `ids` matches.
    """

    return all(wanted == self.get_attribute_by_name(el, 'id', '') for wanted in ids)
def match_root(self, el):
    """
    Match element as root.

    Besides being the root according to `is_root`, the element must have no
    meaningful siblings: no other tag, no CDATA, and no text that is more
    than whitespace, on either side.
    """

    root_flag = self.is_root(el)
    if not root_flag:
        return root_flag

    # Walk backwards first, then forwards, over every kind of sibling node.
    for step in (self.get_previous, self.get_next):
        sibling = step(el, tags=False)
        while sibling is not None:
            if (
                self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                self.is_cdata(sibling)
            ):
                return False
            sibling = step(sibling, tags=False)

    return root_flag
812 adjust = None 813 while idx < 1 or idx > last_index: 814 if idx < 0: 815 diff_low = 0 - idx 816 if adjust is not None and adjust == 1: 817 break 818 adjust = -1 819 count += count_incr 820 idx = last_idx = a * count + b if var else a 821 diff = 0 - idx 822 if diff >= diff_low: 823 break 824 else: 825 diff_high = idx - last_index 826 if adjust is not None and adjust == -1: 827 break 828 adjust = 1 829 count += count_incr 830 idx = last_idx = a * count + b if var else a 831 diff = idx - last_index 832 if diff >= diff_high: 833 break 834 diff_high = diff 835 836 # If a < 0, our count is working backwards, so floor the index by increasing the count. 837 # Find the count that yields the lowest, in bound value and use that. 838 # Lastly reverse count increment so that we'll increase our index. 839 lowest = count 840 if a < 0: 841 while idx >= 1: 842 lowest = count 843 count += count_incr 844 idx = last_idx = a * count + b if var else a 845 count_incr = -1 846 count = lowest 847 idx = last_idx = a * count + b if var else a 848 849 # Evaluate elements while our calculated nth index is still in range 850 while 1 <= idx <= last_index + 1: 851 child = None 852 # Evaluate while our child index is still in range. 853 for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False): 854 index += factor 855 if not self.is_tag(child): 856 continue 857 # Handle `of S` in `nth-child` 858 if n.selectors and not self.match_selectors(child, n.selectors): 859 continue 860 # Handle `of-type` 861 if n.of_type and not self.match_nth_tag_type(el, child): 862 continue 863 relative_index += 1 864 if relative_index == idx: 865 if child is el: 866 matched = True 867 else: 868 break 869 if child is el: 870 break 871 if child is el: 872 break 873 last_idx = idx 874 count += count_incr 875 if count < 0: 876 # Count is counting down and has now ventured into invalid territory. 
def match_contains(self, el, contains):
    """
    Match element if it contains text.

    `contains` is a sequence of objects whose `text` attribute lists
    alternative strings. Every entry must be satisfied, and an entry is
    satisfied when at least one of its alternatives occurs in the element's
    text. An empty `contains` matches.

    The element's text is extracted lazily (only when there is something to
    check) and at most once. Evaluation stops at the first entry that fails,
    since no later entry can turn the result back into a match.
    """

    content = None
    for contain_list in contains:
        if content is None:
            # `iframe` content is left out of the text when matching HTML.
            content = self.get_text(el, no_iframe=self.is_html)
        if not any(text in content for text in contain_list.text):
            return False
    return True
def match_indeterminate(self, el):
    """
    Match an indeterminate radio button.

    The element matches when no *other* `input` in the same group is a
    checked radio button with the same `name`. The group is searched within
    the closest enclosing HTML `form`, or, when there is none, within the
    topmost reachable ancestor. Results are cached per (form, name) pair in
    `self.cached_indeterminate_forms`.
    """

    match = False
    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(node):
        """
        Find the element that scopes this input's radio group.

        Returns the nearest HTML `form` ancestor; failing that, the last
        ancestor reached before running out of parents (`iframe` boundaries
        are not crossed). An element with no parent at all scopes itself.
        """

        parent = self.get_parent(node, no_iframe=True)
        if parent is None:
            # Detached element: there is no ancestor to inspect, and asking
            # for the tag name of `None` would raise.
            return node
        while True:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                return parent
            ancestor = self.get_parent(parent, no_iframe=True)
            if ancestor is None:
                return parent
            parent = ancestor

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # Only a checked, same-named radio that belongs to the very same
                    # form (not a nested one) takes the group out of indeterminate.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        self.cached_indeterminate_forms.append([form, name, match])

    return match
def match_lang(self, el, langs):
    """
    Match languages.

    The element's language is taken from the closest `lang` attribute (or
    namespaced `xml:lang` attribute, depending on the document type) found
    on the element or one of its ancestors. When none is found and the
    document is treated as HTML, a `meta` element with
    `http-equiv="content-language"` under `html > head` is consulted, and
    the outcome of that search is cached per root in `self.cached_meta_lang`.

    `langs` is a sequence of pattern groups; the language has to match at
    least one pattern of every group. Returns `False` when no language
    could be determined.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    # An empty attribute value is falsy, so the walk carries on upwards past it.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        # Ran out of ancestors: the last node visited is the effective root for
        # the cache and `meta` lookups below (`iframe` boundaries are not crossed
        # when the document is HTML).
        if parent is None:
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached `False` records that an earlier `meta` search for this root came up
    # empty; since it is not `None`, the search below is not repeated.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        # Descend one level per name: first to `html`, then to its `head`.
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            # `parent` is now the `head` element.
            for child in parent:
                # NOTE(review): the HTML namespace test is applied to `parent`, not `child` - confirm intended.
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        # Both attributes seen (in either order): this `meta` decides the language.
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            if not found_lang:
                # Remember the miss so this root is not searched again.
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        # Every group of patterns needs at least one hit.
        for patterns in langs:
            match = False
            for pattern in patterns:
                if pattern.match(found_lang):
                    match = True
            if not match:
                break

    return match
def match_range(self, el, condition):
    """
    Match range.

    Behavior is modeled after what we see in browsers. Browsers seem to evaluate
    if the value is out of range, and if not, it is in range. So a missing value
    will not evaluate out of range; therefore, value is in range. Personally, I
    feel like this should evaluate as neither in or out of range.

    `condition` carries `ct.SEL_IN_RANGE` when an in-range match is wanted;
    otherwise the out-of-range result is returned. When neither `min` nor
    `max` parses to a valid value for the input's type (which includes a
    missing or unsupported `type`), there is no range and nothing matches.
    """

    # A missing `type` becomes an empty string instead of raising on `None`;
    # lowering goes through `util.lower`, as `match_dir` does for the same attribute.
    itype = util.lower(self.get_attribute_by_name(el, 'type', ''))

    mn = self.get_attribute_by_name(el, 'min', None)
    if mn is not None:
        mn = Inputs.parse_value(itype, mn)
    mx = self.get_attribute_by_name(el, 'max', None)
    if mx is not None:
        mx = Inputs.parse_value(itype, mx)

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    out_of_range = False

    value = self.get_attribute_by_name(el, 'value', None)
    if value is not None:
        value = Inputs.parse_value(itype, value)

    if value is not None:
        if itype == "time" and mn is not None and mx is not None and mn > mx:
            # Time is periodic, so this is a reversed/discontinuous range:
            # only the gap strictly between `max` and `min` is out of range.
            out_of_range = mx < value < mn
        elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
            # Ordinary range: below a valid `min` or above a valid `max`.
            out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)

    return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
1228 1229 This is of course requires the parser to provide us with the proper prefix and namespace info, 1230 if it doesn't, there is nothing we can do. 1231 """ 1232 1233 name = self.get_tag(el) 1234 return ( 1235 name.find('-') == -1 or 1236 name.find(':') != -1 or 1237 self.get_prefix(el) is not None 1238 ) 1239 1240 def match_selectors(self, el, selectors): 1241 """Check if element matches one of the selectors.""" 1242 1243 match = False 1244 is_not = selectors.is_not 1245 is_html = selectors.is_html 1246 1247 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. 1248 if is_html: 1249 namespaces = self.namespaces 1250 iframe_restrict = self.iframe_restrict 1251 self.namespaces = {'html': NS_XHTML} 1252 self.iframe_restrict = True 1253 1254 if not is_html or self.is_html: 1255 for selector in selectors: 1256 match = is_not 1257 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) 1258 if isinstance(selector, ct.SelectorNull): 1259 continue 1260 # Verify tag matches 1261 if not self.match_tag(el, selector.tag): 1262 continue 1263 # Verify tag is defined 1264 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): 1265 continue 1266 # Verify element is root 1267 if selector.flags & ct.SEL_ROOT and not self.match_root(el): 1268 continue 1269 # Verify element is scope 1270 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): 1271 continue 1272 # Verify `nth` matches 1273 if not self.match_nth(el, selector.nth): 1274 continue 1275 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): 1276 continue 1277 # Verify id matches 1278 if selector.ids and not self.match_id(el, selector.ids): 1279 continue 1280 # Verify classes match 1281 if selector.classes and not self.match_classes(el, selector.classes): 1282 continue 1283 # Verify attribute(s) match 1284 if not self.match_attributes(el, selector.attributes): 1285 continue 1286 # Verify ranges 1287 if 
selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): 1288 continue 1289 # Verify language patterns 1290 if selector.lang and not self.match_lang(el, selector.lang): 1291 continue 1292 # Verify pseudo selector patterns 1293 if selector.selectors and not self.match_subselectors(el, selector.selectors): 1294 continue 1295 # Verify relationship selectors 1296 if selector.relation and not self.match_relations(el, selector.relation): 1297 continue 1298 # Validate that the current default selector match corresponds to the first submit button in the form 1299 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): 1300 continue 1301 # Validate that the unset radio button is among radio buttons with the same name in a form that are 1302 # also not set. 1303 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): 1304 continue 1305 # Validate element directionality 1306 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): 1307 continue 1308 # Validate that the tag contains the specified text. 
1309 if not self.match_contains(el, selector.contains): 1310 continue 1311 match = not is_not 1312 break 1313 1314 # Restore actual namespaces being used for external selector lists 1315 if is_html: 1316 self.namespaces = namespaces 1317 self.iframe_restrict = iframe_restrict 1318 1319 return match 1320 1321 def select(self, limit=0): 1322 """Match all tags under the targeted tag.""" 1323 1324 if limit < 1: 1325 limit = None 1326 1327 for child in self.get_descendants(self.tag): 1328 if self.match(child): 1329 yield child 1330 if limit is not None: 1331 limit -= 1 1332 if limit < 1: 1333 break 1334 1335 def closest(self): 1336 """Match closest ancestor.""" 1337 1338 current = self.tag 1339 closest = None 1340 while closest is None and current is not None: 1341 if self.match(current): 1342 closest = current 1343 else: 1344 current = self.get_parent(current) 1345 return closest 1346 1347 def filter(self): # noqa A001 1348 """Filter tag's children.""" 1349 1350 return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] 1351 1352 def match(self, el): 1353 """Match.""" 1354 1355 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) 1356 1357 1358class CSSMatch(_DocumentNav, _Match): 1359 """The Beautiful Soup CSS match class.""" 1360 1361 1362class CommentsMatch(_DocumentNav): 1363 """Comments matcher.""" 1364 1365 def __init__(self, el): 1366 """Initialize.""" 1367 1368 self.assert_valid_input(el) 1369 self.tag = el 1370 1371 def get_comments(self, limit=0): 1372 """Get comments.""" 1373 1374 if limit < 1: 1375 limit = None 1376 1377 for child in self.get_descendants(self.tag, tags=False): 1378 if self.is_comment(child): 1379 yield child 1380 if limit is not None: 1381 limit -= 1 1382 if limit < 1: 1383 break 1384 1385 1386class SoupSieve(ct.Immutable): 1387 """Compiled Soup Sieve selector matching object.""" 1388 1389 __slots__ = ("pattern", "selectors", "namespaces", "custom", 
"flags", "_hash") 1390 1391 def __init__(self, pattern, selectors, namespaces, custom, flags): 1392 """Initialize.""" 1393 1394 super(SoupSieve, self).__init__( 1395 pattern=pattern, 1396 selectors=selectors, 1397 namespaces=namespaces, 1398 custom=custom, 1399 flags=flags 1400 ) 1401 1402 def match(self, tag): 1403 """Match.""" 1404 1405 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) 1406 1407 def closest(self, tag): 1408 """Match closest ancestor.""" 1409 1410 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() 1411 1412 def filter(self, iterable): # noqa A001 1413 """ 1414 Filter. 1415 1416 `CSSMatch` can cache certain searches for tags of the same document, 1417 so if we are given a tag, all tags are from the same document, 1418 and we can take advantage of the optimization. 1419 1420 Any other kind of iterable could have tags from different documents or detached tags, 1421 so for those, we use a new `CSSMatch` for each item in the iterable. 
1422 """ 1423 1424 if CSSMatch.is_tag(iterable): 1425 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() 1426 else: 1427 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] 1428 1429 @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.") 1430 def comments(self, tag, limit=0): 1431 """Get comments only.""" 1432 1433 return [comment for comment in CommentsMatch(tag).get_comments(limit)] 1434 1435 @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") 1436 def icomments(self, tag, limit=0): 1437 """Iterate comments only.""" 1438 1439 for comment in CommentsMatch(tag).get_comments(limit): 1440 yield comment 1441 1442 def select_one(self, tag): 1443 """Select a single tag.""" 1444 1445 tags = self.select(tag, limit=1) 1446 return tags[0] if tags else None 1447 1448 def select(self, tag, limit=0): 1449 """Select the specified tags.""" 1450 1451 return list(self.iselect(tag, limit)) 1452 1453 def iselect(self, tag, limit=0): 1454 """Iterate the specified tags.""" 1455 1456 for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): 1457 yield el 1458 1459 def __repr__(self): # pragma: no cover 1460 """Representation.""" 1461 1462 return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format( 1463 self.pattern, 1464 self.namespaces, 1465 self.custom, 1466 self.flags 1467 ) 1468 1469 __str__ = __repr__ 1470 1471 1472ct.pickle_register(SoupSieve) 1473