#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8
"""
    Tokenizer, parser and parsed objects for CSS selectors.

    :copyright: (c) 2007-2012 Ian Bicking and contributors.
                See AUTHORS for more details.
    :license: BSD, see LICENSE for more details.

"""

import sys
import re
import operator
import string

from css_selectors.errors import SelectorSyntaxError, ExpressionError
from polyglot.builtins import unicode_type, codepoint_to_chr


# Translation table for unicode strings: maps the code points of 'A'..'Z'
# to those of 'a'..'z'; every other code point is left untouched.
utab = {c:c+32 for c in range(ord(u'A'), ord(u'Z')+1)}

if sys.version_info.major < 3:
    # Byte-string counterpart of ``utab`` (Python 2 only).
    tab = string.maketrans(string.ascii_uppercase, string.ascii_lowercase)

    def ascii_lower(string):
        """Lower-case, but only in the ASCII range."""
        return string.translate(utab if isinstance(string, unicode_type) else tab)

    def urepr(x):
        """Like repr(), but without the ``u`` prefix on unicode strings."""
        if isinstance(x, list):
            return '[%s]' % ', '.join((map(urepr, x)))
        ans = repr(x)
        if ans.startswith("u'") or ans.startswith('u"'):
            ans = ans[1:]
        return ans


else:

    def ascii_lower(x):
        """Lower-case, but only in the ASCII range."""
        return x.translate(utab)

    urepr = repr


# Parsed objects

class Selector:

    """
    Represents a parsed selector.
    """

    def __init__(self, tree, pseudo_element=None):
        self.parsed_tree = tree
        if pseudo_element is not None and not isinstance(
                pseudo_element, FunctionalPseudoElement):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self):
        # A functional pseudo-element already renders itself with its own
        # leading '::', so it must not also go through the '::%s' branch.
        if isinstance(self.pseudo_element, FunctionalPseudoElement):
            pseudo_element = repr(self.pseudo_element)
        elif self.pseudo_element:
            pseudo_element = '::%s' % self.pseudo_element
        else:
            pseudo_element = ''
        return '%s[%r%s]' % (
            self.__class__.__name__, self.parsed_tree, pseudo_element)

    def specificity(self):
        """Return the specificity_ of this selector as a tuple of 3 integers.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        if self.pseudo_element:
            # A pseudo-element adds one to the last component.
            c += 1
        return a, b, c


class Class:

    """
    Represents selector.class_name
    """
    def __init__(self, selector, class_name):
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__, self.selector, self.class_name)

    def specificity(self):
        """Specificity of the wrapped selector with the middle component + 1."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class FunctionalPseudoElement:

    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between versions.
        Use at your own risk.

    """
    def __init__(self, name, arguments):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self):
        return '%s[::%s(%s)]' % (
            self.__class__.__name__, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the list of token types of the arguments."""
        return [token.type for token in self.arguments]

    def specificity(self):
        # NOTE(review): __init__ never sets ``self.selector``, so unless a
        # caller attaches that attribute this raises AttributeError. Nothing
        # in this module calls it (Selector.specificity accounts for the
        # pseudo-element itself) -- confirm the intended value before fixing.
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Function:

    """
    Represents selector:name(expr)
    """
    def __init__(self, selector, name, arguments):
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments
        # Cache for the ``parsed_arguments`` property.
        self._parsed_arguments = None
        # Flag read and set by parse_arguments(); it must exist up front or
        # that method fails with AttributeError on its first call.
        self.arguments_parsed = False

    def __repr__(self):
        return '%s[%r:%s(%s)]' % (
            self.__class__.__name__, self.selector, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the list of token types of the arguments."""
        return [token.type for token in self.arguments]

    @property
    def parsed_arguments(self):
        """The arguments parsed as an ``(a, b)`` series, computed once.

        :raises:
            :class:`ExpressionError` if the arguments are not a valid series.

        """
        if self._parsed_arguments is None:
            try:
                self._parsed_arguments = parse_series(self.arguments)
            except ValueError:
                # Wrap in a 1-tuple so the formatting also works should the
                # arguments ever be supplied as a tuple.
                raise ExpressionError(
                    "Invalid series: '%r'" % (self.arguments,))
        return self._parsed_arguments

    def parse_arguments(self):
        """Mark the arguments as parsed (idempotent)."""
        if not self.arguments_parsed:
            self.arguments_parsed = True

    def specificity(self):
        """Specificity of the wrapped selector with the middle component + 1."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Pseudo:

    """
    Represents selector:ident
    """
    def __init__(self, selector, ident):
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self):
        return '%s[%r:%s]' % (
            self.__class__.__name__, self.selector, self.ident)

    def specificity(self):
        """Specificity of the wrapped selector with the middle component + 1."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Negation:

    """
    Represents selector:not(subselector)
    """
    def __init__(self, selector, subselector):
        self.selector = selector
        self.subselector = subselector

    def __repr__(self):
        return '%s[%r:not(%r)]' % (
            self.__class__.__name__, self.selector, self.subselector)

    def specificity(self):
        """Component-wise sum of the selector and subselector specificities."""
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


class Attrib:

    """
    Represents selector[namespace|attrib operator value]
    """
    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        if self.namespace:
            attrib = '%s|%s' % (self.namespace, self.attrib)
        else:
            attrib = self.attrib
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                self.__class__.__name__, self.selector, attrib)
        else:
            return '%s[%r[%s %s %s]]' % (
                self.__class__.__name__, self.selector, attrib,
                self.operator, urepr(self.value))

    def specificity(self):
        """Specificity of the wrapped selector with the middle component + 1."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Element:

    """
    Represents namespace|element

    `None` is for the universal selector '*'

    """
    def __init__(self, namespace=None, element=None):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        element = self.element or '*'
        if self.namespace:
            element = '%s|%s' % (self.namespace, element)
        return '%s[%s]' % (self.__class__.__name__, element)

    def specificity(self):
        """``(0, 0, 1)`` for a named element, ``(0, 0, 0)`` for ``*``."""
        if self.element:
            return 0, 0, 1
        else:
            return 0, 0, 0


class Hash:

    """
    Represents selector#id
    """
    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__, self.selector, self.id)

    def specificity(self):
        """Specificity of the wrapped selector with the first component + 1."""
        a, b, c = self.selector.specificity()
        a += 1
        return a, b, c


class CombinedSelector:

    """
    Represents two selectors joined by a combinator
    (``' '``, ``'+'``, ``'>'`` or ``'~'`` as produced by parse_selector).
    """

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        if self.combinator == ' ':
            # Make the descendant combinator visible in the repr.
            comb = '<followed>'
        else:
            comb = self.combinator
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def specificity(self):
        """Component-wise sum of the selector and subselector specificities."""
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


# Parser

# foo
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')

# foo#bar or #bar
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')

# foo.bar or .bar
_class_re = re.compile(
    r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')


def parse(css):
    """Parse a CSS *group of selectors*.

    :param css:
        A *group of selectors* as an Unicode string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path for simple cases: a bare element, (element)#id or
    # (element).class, which the regexes above can recognise without
    # running the tokenizer.
    match = _el_re.match(css)
    if match is not None:
        return [Selector(Element(element=match.group(1)))]
    match = _id_re.match(css)
    if match is not None:
        return [Selector(Hash(Element(element=match.group(1) or None),
                              match.group(2)))]
    match = _class_re.match(css)
    if match is not None:
        return [Selector(Class(Element(element=match.group(1) or None),
                               match.group(2)))]

    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))


def parse_selector_group(stream):
    """Yield one :class:`Selector` per comma-separated selector in *stream*."""
    stream.skip_whitespace()
    while True:
        yield Selector(*parse_selector(stream))
        if stream.peek() == ('DELIM', ','):
            stream.next()
            stream.skip_whitespace()
        else:
            break


def parse_selector(stream):
    """Parse one selector: simple selectors joined by combinators.

    :returns:
        A ``(tree, pseudo_element)`` tuple.
    :raises:
        :class:`SelectorSyntaxError` if a pseudo-element is followed by
        anything other than the end of the selector.

    """
    result, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        peek = stream.peek()
        if peek in (('EOF', None), ('DELIM', ',')):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.is_delim('+', '>', '~'):
            # A combinator
            combinator = stream.next().value
            stream.skip_whitespace()
        else:
            # By exclusion, the last parse_simple_selector() ended
            # at peek == ' '
            combinator = ' '
        next_selector, pseudo_element = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, next_selector)
    return result, pseudo_element


# CSS 2.1 pseudo-elements that may be written with a single ':'.
special_pseudo_elements = (
    'first-line', 'first-letter', 'before', 'after')


def parse_simple_selector(stream, inside_negation=False):
    """Parse a sequence of simple selectors (no combinators).

    :param stream:
        The :class:`TokenStream` to consume tokens from.
    :param inside_negation:
        True while parsing the argument of ``:not()``; a closing ``)`` then
        ends the selector and a nested ``:not()`` is rejected.
    :returns:
        A ``(tree, pseudo_element)`` tuple; ``pseudo_element`` is ``None``,
        a string or a :class:`FunctionalPseudoElement`.
    :raises:
        :class:`SelectorSyntaxError` on invalid input.

    """
    stream.skip_whitespace()
    # Remember how many tokens were consumed so far, to detect an empty
    # selector at the end.
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == 'IDENT' or peek == ('DELIM', '*'):
        if peek.type == 'IDENT':
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ('DELIM', '|'):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No '|': what was read is the element name, not a namespace.
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result = Element(namespace, element)
    pseudo_element = None
    while True:
        peek = stream.peek()
        if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
                inside_negation and peek == ('DELIM', ')')):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.type == 'HASH':
            result = Hash(result, stream.next().value)
        elif peek == ('DELIM', '.'):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ('DELIM', '['):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ('DELIM', ':'):
            stream.next()
            if stream.peek() == ('DELIM', ':'):
                # '::' introduces a pseudo-element, possibly functional.
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ('DELIM', '('):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream))
                continue
            ident = stream.next_ident()
            if ident.lower() in special_pseudo_elements:
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = unicode_type(ident)
                continue
            if stream.peek() != ('DELIM', '('):
                result = Pseudo(result, ident)
                continue
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == 'not':
                if inside_negation:
                    raise SelectorSyntaxError('Got nested :not()')
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True)
                # The Selectors grammar allows whitespace between the
                # argument and the closing parenthesis: ``:not( a )``.
                stream.skip_whitespace()
                closing = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        'Got pseudo-element ::%s inside :not() at %s'
                        % (argument_pseudo_element, closing.pos))
                if closing != ('DELIM', ')'):
                    raise SelectorSyntaxError(
                        "Expected ')', got %s" % (closing,))
                result = Negation(result, argument)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(
                "Expected selector, got %s" % (peek,))
    if len(stream.used) == selector_start:
        # Nothing was consumed: there was no selector at all.
        raise SelectorSyntaxError(
            "Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element


def parse_arguments(stream):
    """Collect the argument tokens of a function up to the closing ``)``.

    :returns:
        The list of IDENT, STRING, NUMBER, ``+`` and ``-`` tokens found.
    :raises:
        :class:`SelectorSyntaxError` on any other token.

    """
    arguments = []
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token.type in ('IDENT', 'STRING', 'NUMBER') or token in [
                ('DELIM', '+'), ('DELIM', '-')]:
            arguments.append(token)
        elif token == ('DELIM', ')'):
            return arguments
        else:
            raise SelectorSyntaxError(
                "Expected an argument, got %s" % (token,))


def parse_attrib(selector, stream):
    """Parse an attribute selector; the opening ``[`` is already consumed.

    :returns:
        An :class:`Attrib` wrapping *selector*.
    :raises:
        :class:`SelectorSyntaxError` on invalid input.

    """
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    if attrib is None and stream.peek() != ('DELIM', '|'):
        # A '*' is only valid as a namespace, i.e. when followed by '|'.
        raise SelectorSyntaxError(
            "Expected '|', got %s" % (stream.peek(),))
    if stream.peek() == ('DELIM', '|'):
        stream.next()
        if stream.peek() == ('DELIM', '='):
            # 'attrib|=value': the '|' belongs to the operator.
            namespace = None
            stream.next()
            op = '|='
        else:
            # 'namespace|attrib'
            namespace = attrib
            attrib = stream.next_ident()
            op = None
    else:
        namespace = op = None
    if op is None:
        stream.skip_whitespace()
        token = stream.next()
        if token == ('DELIM', ']'):
            return Attrib(selector, namespace, attrib, 'exists', None)
        elif token == ('DELIM', '='):
            op = '='
        elif token.is_delim('^', '$', '*', '~', '|', '!') and (
                stream.peek() == ('DELIM', '=')):
            op = token.value + '='
            stream.next()
        else:
            raise SelectorSyntaxError(
                "Operator expected, got %s" % (token,))
    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ('IDENT', 'STRING'):
        raise SelectorSyntaxError(
            "Expected string or ident, got %s" % (value,))
    stream.skip_whitespace()
    token = stream.next()
    if token != ('DELIM', ']'):
        raise SelectorSyntaxError(
            "Expected ']', got %s" % (token,))
    return Attrib(selector, namespace, attrib, op, value.value)


def parse_series(tokens):
    """
    Parses the arguments for :nth-child() and friends.

    :param tokens: A list of tokens
    :raises: :class:`ValueError` if the tokens do not form a valid series.
    :returns: ``(a, b)``

    """
    for token in tokens:
        if token.type == 'STRING':
            raise ValueError('String tokens not allowed in series.')
    s = ''.join(token.value for token in tokens).strip()
    if s == 'odd':
        return (2, 1)
    elif s == 'even':
        return (2, 0)
    elif s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a == '-' or a == '+':
        # A bare sign means a coefficient of -1 or +1.
        a = int(a+'1')
    else:
        a = int(a)
    if not b:
        b = 0
    else:
        b = int(b)
    return (a, b)


# Token objects

class Token(tuple):

    """
    A ``(type, value)`` tuple that additionally records the position of
    the token in the source string as ``pos``. Being a tuple, it compares
    equal to plain ``(type, value)`` tuples.
    """

    def __new__(cls, type_, value, pos):
        obj = tuple.__new__(cls, (type_, value))
        obj.pos = pos
        return obj

    def __repr__(self):
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    def is_delim(self, *values):
        """True if this is a DELIM token whose value is one of *values*."""
        return self.type == 'DELIM' and self.value in values

    type = property(operator.itemgetter(0))
    value = property(operator.itemgetter(1))


class EOFToken(Token):

    """
    The token that marks the end of the input; its value is ``None``.
    """

    def __new__(cls, pos):
        return Token.__new__(cls, 'EOF', None, pos)

    def __repr__(self):
        return '<%s at %i>' % (self.type, self.pos)


# Tokenizer


class TokenMacros:
    # Regex fragments, substituted by name into the patterns given
    # to _compile().
    unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
    escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
    string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
    nonascii = r'[^\0-\177]'
    nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
    nmstart = '[_a-z]|%s|%s' % (escape, nonascii)


def _compile(pattern):
    """Expand the TokenMacros names in *pattern* and return the ``match``
    method of the resulting case-insensitive compiled regex."""
    return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match


_match_whitespace = _compile(r'[ \t\r\n\f]+')
_match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
_match_hash = _compile('#(?:%(nmchar)s)+')
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

_sub_simple_escape = re.compile(r'\\(.)').sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape = re.compile(r'\\(?:\n|\r\n|\r|\f)').sub

# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller('group', 1)


def _replace_unicode(match):
    """Replacement callback: turn a matched hex escape into its character."""
    codepoint = int(match.group(1), 16)
    if codepoint > sys.maxunicode:
        # Out of range: substitute U+FFFD REPLACEMENT CHARACTER.
        codepoint = 0xFFFD
    return codepoint_to_chr(codepoint)


def unescape_ident(value):
    """Resolve unicode escapes, then simple backslash escapes, in *value*."""
    value = _sub_unicode_escape(_replace_unicode, value)
    value = _sub_simple_escape(_replace_simple, value)
    return value


def tokenize(s):
    """Generate :class:`Token` objects for the string *s*.

    Yields S (whitespace), IDENT, HASH, STRING, NUMBER and DELIM tokens,
    then a final :class:`EOFToken`. ``/* ... */`` comments produce no
    token; an unclosed comment runs to the end of the input.

    :raises:
        :class:`SelectorSyntaxError` on an unclosed or invalid string.

    """
    pos = 0
    len_s = len(s)
    while pos < len_s:
        match = _match_whitespace(s, pos=pos)
        if match:
            # A whole run of whitespace becomes a single S token.
            yield Token('S', ' ', pos)
            pos = match.end()
            continue

        match = _match_ident(s, pos=pos)
        if match:
            yield Token('IDENT', unescape_ident(match.group()), pos)
            pos = match.end()
            continue

        match = _match_hash(s, pos=pos)
        if match:
            # Drop the leading '#' from the value.
            yield Token('HASH', unescape_ident(match.group()[1:]), pos)
            pos = match.end()
            continue

        quote = s[pos]
        if quote in _match_string_by_quote:
            match = _match_string_by_quote[quote](s, pos=pos + 1)
            assert match, 'Should have found at least an empty match'
            end_pos = match.end()
            if end_pos == len_s:
                raise SelectorSyntaxError('Unclosed string at %s' % pos)
            if s[end_pos] != quote:
                raise SelectorSyntaxError('Invalid string at %s' % pos)
            # Escaped newlines are line continuations: remove them before
            # resolving the other escapes.
            value = unescape_ident(_sub_newline_escape('', match.group()))
            yield Token('STRING', value, pos)
            pos = end_pos + 1
            continue

        match = _match_number(s, pos=pos)
        if match:
            yield Token('NUMBER', match.group(), pos)
            pos = match.end()
            continue

        pos2 = pos + 2
        if s[pos:pos2] == '/*':
            pos = s.find('*/', pos2)
            if pos == -1:
                pos = len_s
            else:
                pos += 2
            continue

        # Anything else is a single-character delimiter.
        yield Token('DELIM', s[pos], pos)
        pos += 1

    assert pos == len_s
    yield EOFToken(pos)


class TokenStream:

    """
    Wraps an iterable of tokens, adding one token of look-ahead
    (:meth:`peek`) and a record of every consumed token (``used``).
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        try:
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        """Consume and return the next token, recording it in ``used``."""
        if self._peeking:
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        else:
            token = self.next_token()
            self.used.append(token)
            return token

    def peek(self):
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        return self.peeked

    def next_ident(self):
        """Consume the next token and return its value; it must be an IDENT.

        :raises: :class:`SelectorSyntaxError` otherwise.

        """
        token = self.next()
        if token.type != 'IDENT':
            raise SelectorSyntaxError('Expected ident, got %s' % (token,))
        return token.value

    def next_ident_or_star(self):
        """Consume the next token: return an IDENT's value, or ``None``
        for ``*``.

        :raises: :class:`SelectorSyntaxError` for any other token.

        """
        token = self.next()
        if token.type == 'IDENT':
            return token.value
        elif token == ('DELIM', '*'):
            return None
        else:
            raise SelectorSyntaxError(
                "Expected ident or '*', got %s" % (token,))

    def skip_whitespace(self):
        """Consume any whitespace tokens at the head of the stream."""
        # The tokenizer emits several S tokens in a row when comments
        # separate runs of whitespace, so a single check is not enough.
        while self.peek().type == 'S':
            self.next()