1"""CSS selector parser.""" 2import re 3from functools import lru_cache 4from . import util 5from . import css_match as cm 6from . import css_types as ct 7from .util import SelectorSyntaxError 8 9UNICODE_REPLACEMENT_CHAR = 0xFFFD 10 11# Simple pseudo classes that take no parameters 12PSEUDO_SIMPLE = { 13 ":any-link", 14 ":empty", 15 ":first-child", 16 ":first-of-type", 17 ":in-range", 18 ":out-of-range", 19 ":last-child", 20 ":last-of-type", 21 ":link", 22 ":only-child", 23 ":only-of-type", 24 ":root", 25 ':checked', 26 ':default', 27 ':disabled', 28 ':enabled', 29 ':indeterminate', 30 ':optional', 31 ':placeholder-shown', 32 ':read-only', 33 ':read-write', 34 ':required', 35 ':scope', 36 ':defined' 37} 38 39# Supported, simple pseudo classes that match nothing in the Soup Sieve environment 40PSEUDO_SIMPLE_NO_MATCH = { 41 ':active', 42 ':current', 43 ':focus', 44 ':focus-visible', 45 ':focus-within', 46 ':future', 47 ':host', 48 ':hover', 49 ':local-link', 50 ':past', 51 ':paused', 52 ':playing', 53 ':target', 54 ':target-within', 55 ':user-invalid', 56 ':visited' 57} 58 59# Complex pseudo classes that take selector lists 60PSEUDO_COMPLEX = { 61 ':contains', 62 ':has', 63 ':is', 64 ':matches', 65 ':not', 66 ':where' 67} 68 69PSEUDO_COMPLEX_NO_MATCH = { 70 ':current', 71 ':host', 72 ':host-context' 73} 74 75# Complex pseudo classes that take very specific parameters and are handled special 76PSEUDO_SPECIAL = { 77 ':dir', 78 ':lang', 79 ':nth-child', 80 ':nth-last-child', 81 ':nth-last-of-type', 82 ':nth-of-type' 83} 84 85PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL 86 87# Sub-patterns parts 88# Whitespace 89NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])' 90WS = r'(?:[ \t]|{})'.format(NEWLINE) 91# Comments 92COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)' 93# Whitespace with comments included 94WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS) 95# CSS escapes 96CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS) 97CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE) 98# CSS Identifier 99IDENTIFIER = r''' 100(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--) 101(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*) 102'''.format(esc=CSS_ESCAPES) 103# `nth` content 104NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC) 105# Value: quoted string or identifier 106VALUE = r''' 107(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+) 108'''.format(nl=NEWLINE, ident=IDENTIFIER) 109# Attribute value comparison. `!=` is handled special as it is non-standard. 110ATTR = r''' 111(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] 112'''.format(ws=WSC, value=VALUE) 113 114# Selector patterns 115# IDs (`#id`) 116PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER) 117# Classes (`.class`) 118PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER) 119# Prefix:Tag (`prefix|tag`) 120PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER) 121# Attributes (`[attr]`, `[attr=value]`, etc.) 122PAT_ATTR = r''' 123\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr} 124'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR) 125# Pseudo class (`:pseudo-class`, `:pseudo-class(`) 126PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER) 127# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. 128PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER) 129# Custom pseudo class (`:--custom-pseudo`) 130PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER) 131# Closing pseudo group (`)`) 132PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC) 133# Pseudo element (`::pseudo-element`) 134PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS) 135# At rule (`@page`, etc.) (not supported) 136PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER) 137# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.) 138PAT_PSEUDO_NTH_CHILD = r''' 139(?P<pseudo_nth_child>{name} 140(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*)) 141'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH) 142# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.) 143PAT_PSEUDO_NTH_TYPE = r''' 144(?P<pseudo_nth_type>{name} 145(?P<nth_type>{nth}|even|odd)){ws}*\) 146'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH) 147# Pseudo class language (`:lang("*-de", en)`) 148PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format( 149 name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE 150) 151# Pseudo class direction (`:dir(ltr)`) 152PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC) 153# Combining characters (`>`, `~`, ` `, `+`, `,`) 154PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC) 155# Extra: Contains (`:contains(text)`) 156PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format( 157 name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE 158) 159 160# Regular expressions 161# CSS escape pattern 162RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WSC), re.I) 163RE_CSS_STR_ESC = re.compile( 164 r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I 165) 166# Pattern to break up `nth` specifiers 167RE_NTH = re.compile( 168 r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC), 169 re.I 170) 171# Pattern to iterate multiple values. 172RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X) 173# Whitespace checks 174RE_WS = re.compile(WS) 175RE_WS_BEGIN = re.compile('^{}*'.format(WSC)) 176RE_WS_END = re.compile('{}*$'.format(WSC)) 177RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X) 178 179# Constants 180# List split token 181COMMA_COMBINATOR = ',' 182# Relation token for descendant 183WS_COMBINATOR = " " 184 185# Parse flags 186FLG_PSEUDO = 0x01 187FLG_NOT = 0x02 188FLG_RELATIVE = 0x04 189FLG_DEFAULT = 0x08 190FLG_HTML = 0x10 191FLG_INDETERMINATE = 0x20 192FLG_OPEN = 0x40 193FLG_IN_RANGE = 0x80 194FLG_OUT_OF_RANGE = 0x100 195FLG_PLACEHOLDER_SHOWN = 0x200 196 197# Maximum cached patterns to store 198_MAXCACHE = 500 199 200 201@lru_cache(maxsize=_MAXCACHE) 202def _cached_css_compile(pattern, namespaces, custom, flags): 203 """Cached CSS compile.""" 204 205 custom_selectors = process_custom(custom) 206 return cm.SoupSieve( 207 pattern, 208 CSSParser(pattern, custom=custom_selectors, flags=flags).process_selectors(), 209 namespaces, 210 custom, 211 flags 212 ) 213 214 215def _purge_cache(): 216 """Purge the cache.""" 217 218 _cached_css_compile.cache_clear() 219 220 221def process_custom(custom): 222 """Process custom.""" 223 224 custom_selectors = {} 225 if custom is not None: 226 for key, value in custom.items(): 227 name = util.lower(key) 228 if RE_CUSTOM.match(name) is None: 229 raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name)) 230 if name in custom_selectors: 231 raise KeyError("The custom selector '{}' has already been registered".format(name)) 232 custom_selectors[css_unescape(name)] = value 233 return custom_selectors 234 235 236def css_unescape(content, string=False): 237 """ 238 Unescape CSS value. 239 240 Strings allow for spanning the value on multiple strings by escaping a new line. 241 """ 242 243 def replace(m): 244 """Replace with the appropriate substitute.""" 245 246 if m.group(1): 247 codepoint = int(m.group(1)[1:], 16) 248 if codepoint == 0: 249 codepoint = UNICODE_REPLACEMENT_CHAR 250 value = chr(codepoint) 251 elif m.group(2): 252 value = m.group(2)[1:] 253 elif m.group(3): 254 value = '\ufffd' 255 else: 256 value = '' 257 258 return value 259 260 return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) 261 262 263def escape(ident): 264 """Escape identifier.""" 265 266 string = [] 267 length = len(ident) 268 start_dash = length > 0 and ident[0] == '-' 269 if length == 1 and start_dash: 270 # Need to escape identifier that is a single `-` with no other characters 271 string.append('\\{}'.format(ident)) 272 else: 273 for index, c in enumerate(ident): 274 codepoint = ord(c) 275 if codepoint == 0x00: 276 string.append('\ufffd') 277 elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: 278 string.append('\\{:x} '.format(codepoint)) 279 elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39): 280 string.append('\\{:x} '.format(codepoint)) 281 elif ( 282 codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or 283 (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A) 284 ): 285 string.append(c) 286 else: 287 string.append('\\{}'.format(c)) 288 return ''.join(string) 289 290 291class SelectorPattern(object): 292 """Selector pattern.""" 293 294 def __init__(self, name, pattern): 295 """Initialize.""" 296 297 self.name = name 298 self.re_pattern = re.compile(pattern, re.I | re.X | re.U) 299 300 def get_name(self): 301 """Get name.""" 302 303 return self.name 304 305 def match(self, selector, index, flags): 306 """Match the selector.""" 307 308 return self.re_pattern.match(selector, index) 309 310 311class SpecialPseudoPattern(SelectorPattern): 312 """Selector pattern.""" 313 314 def __init__(self, patterns): 315 """Initialize.""" 316 317 self.patterns = {} 318 for p in patterns: 319 name = p[0] 320 pattern = p[3](name, p[2]) 321 for pseudo in p[1]: 322 self.patterns[pseudo] = pattern 323 324 self.matched_name = None 325 self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) 326 327 def get_name(self): 328 """Get name.""" 329 330 return self.matched_name.get_name() 331 332 def match(self, selector, index, flags): 333 """Match the selector.""" 334 335 pseudo = None 336 m = self.re_pseudo_name.match(selector, index) 337 if m: 338 name = util.lower(css_unescape(m.group('name'))) 339 pattern = self.patterns.get(name) 340 if pattern: 341 pseudo = pattern.match(selector, index, flags) 342 if pseudo: 343 self.matched_name = pattern 344 345 return pseudo 346 347 348class _Selector(object): 349 """ 350 Intermediate selector class. 351 352 This stores selector data for a compound selector as we are acquiring them. 353 Once we are done collecting the data for a compound selector, we freeze 354 the data in an object that can be pickled and hashed. 355 """ 356 357 def __init__(self, **kwargs): 358 """Initialize.""" 359 360 self.tag = kwargs.get('tag', None) 361 self.ids = kwargs.get('ids', []) 362 self.classes = kwargs.get('classes', []) 363 self.attributes = kwargs.get('attributes', []) 364 self.nth = kwargs.get('nth', []) 365 self.selectors = kwargs.get('selectors', []) 366 self.relations = kwargs.get('relations', []) 367 self.rel_type = kwargs.get('rel_type', None) 368 self.contains = kwargs.get('contains', []) 369 self.lang = kwargs.get('lang', []) 370 self.flags = kwargs.get('flags', 0) 371 self.no_match = kwargs.get('no_match', False) 372 373 def _freeze_relations(self, relations): 374 """Freeze relation.""" 375 376 if relations: 377 sel = relations[0] 378 sel.relations.extend(relations[1:]) 379 return ct.SelectorList([sel.freeze()]) 380 else: 381 return ct.SelectorList() 382 383 def freeze(self): 384 """Freeze self.""" 385 386 if self.no_match: 387 return ct.SelectorNull() 388 else: 389 return ct.Selector( 390 self.tag, 391 tuple(self.ids), 392 tuple(self.classes), 393 tuple(self.attributes), 394 tuple(self.nth), 395 tuple(self.selectors), 396 self._freeze_relations(self.relations), 397 self.rel_type, 398 tuple(self.contains), 399 tuple(self.lang), 400 self.flags 401 ) 402 403 def __str__(self): # pragma: no cover 404 """String representation.""" 405 406 return ( 407 '_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, ' 408 'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})' 409 ).format( 410 self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors, 411 self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match 412 ) 413 414 __repr__ = __str__ 415 416 417class CSSParser(object): 418 """Parse CSS selectors.""" 419 420 css_tokens = ( 421 SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), 422 SpecialPseudoPattern( 423 ( 424 ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS, SelectorPattern), 425 ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern), 426 ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern), 427 ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern), 428 ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern) 429 ) 430 ), 431 SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), 432 SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS), 433 SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT), 434 SelectorPattern("at_rule", PAT_AT_RULE), 435 SelectorPattern("id", PAT_ID), 436 SelectorPattern("class", PAT_CLASS), 437 SelectorPattern("tag", PAT_TAG), 438 SelectorPattern("attribute", PAT_ATTR), 439 SelectorPattern("combine", PAT_COMBINE) 440 ) 441 442 def __init__(self, selector, custom=None, flags=0): 443 """Initialize.""" 444 445 self.pattern = selector.replace('\x00', '\ufffd') 446 self.flags = flags 447 self.debug = self.flags & util.DEBUG 448 self.custom = {} if custom is None else custom 449 450 def parse_attribute_selector(self, sel, m, has_selector): 451 """Create attribute selector from the returned regex match.""" 452 453 inverse = False 454 op = m.group('cmp') 455 case = util.lower(m.group('case')) if m.group('case') else None 456 ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else '' 457 attr = css_unescape(m.group('attr_name')) 458 is_type = False 459 pattern2 = None 460 461 if case: 462 flags = re.I if case == 'i' else 0 463 elif util.lower(attr) == 'type': 464 flags = re.I 465 is_type = True 466 else: 467 flags = 0 468 469 if op: 470 if m.group('value').startswith(('"', "'")): 471 value = css_unescape(m.group('value')[1:-1], True) 472 else: 473 value = css_unescape(m.group('value')) 474 else: 475 value = None 476 if not op: 477 # Attribute name 478 pattern = None 479 elif op.startswith('^'): 480 # Value start with 481 pattern = re.compile(r'^%s.*' % re.escape(value), flags) 482 elif op.startswith('$'): 483 # Value ends with 484 pattern = re.compile(r'.*?%s$' % re.escape(value), flags) 485 elif op.startswith('*'): 486 # Value contains 487 pattern = re.compile(r'.*?%s.*' % re.escape(value), flags) 488 elif op.startswith('~'): 489 # Value contains word within space separated list 490 # `~=` should match nothing if it is empty or contains whitespace, 491 # so if either of these cases is present, use `[^\s\S]` which cannot be matched. 492 value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value) 493 pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags) 494 elif op.startswith('|'): 495 # Value starts with word in dash separated list 496 pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags) 497 else: 498 # Value matches 499 pattern = re.compile(r'^%s$' % re.escape(value), flags) 500 if op.startswith('!'): 501 # Equivalent to `:not([attr=value])` 502 inverse = True 503 if is_type and pattern: 504 pattern2 = re.compile(pattern.pattern) 505 506 # Append the attribute selector 507 sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2) 508 if inverse: 509 # If we are using `!=`, we need to nest the pattern under a `:not()`. 510 sub_sel = _Selector() 511 sub_sel.attributes.append(sel_attr) 512 not_list = ct.SelectorList([sub_sel.freeze()], True, False) 513 sel.selectors.append(not_list) 514 else: 515 sel.attributes.append(sel_attr) 516 517 has_selector = True 518 return has_selector 519 520 def parse_tag_pattern(self, sel, m, has_selector): 521 """Parse tag pattern from regex match.""" 522 523 prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None 524 tag = css_unescape(m.group('tag_name')) 525 sel.tag = ct.SelectorTag(tag, prefix) 526 has_selector = True 527 return has_selector 528 529 def parse_pseudo_class_custom(self, sel, m, has_selector): 530 """ 531 Parse custom pseudo class alias. 532 533 Compile custom selectors as we need them. When compiling a custom selector, 534 set it to `None` in the dictionary so we can avoid an infinite loop. 535 """ 536 537 pseudo = util.lower(css_unescape(m.group('name'))) 538 selector = self.custom.get(pseudo) 539 if selector is None: 540 raise SelectorSyntaxError( 541 "Undefined custom selector '{}' found at postion {}".format(pseudo, m.end(0)), 542 self.pattern, 543 m.end(0) 544 ) 545 546 if not isinstance(selector, ct.SelectorList): 547 self.custom[pseudo] = None 548 selector = CSSParser( 549 selector, custom=self.custom, flags=self.flags 550 ).process_selectors(flags=FLG_PSEUDO) 551 self.custom[pseudo] = selector 552 553 sel.selectors.append(selector) 554 has_selector = True 555 return has_selector 556 557 def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html): 558 """Parse pseudo class.""" 559 560 complex_pseudo = False 561 pseudo = util.lower(css_unescape(m.group('name'))) 562 if m.group('open'): 563 complex_pseudo = True 564 if complex_pseudo and pseudo in PSEUDO_COMPLEX: 565 has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0)) 566 elif not complex_pseudo and pseudo in PSEUDO_SIMPLE: 567 if pseudo == ':root': 568 sel.flags |= ct.SEL_ROOT 569 elif pseudo == ':defined': 570 sel.flags |= ct.SEL_DEFINED 571 is_html = True 572 elif pseudo == ':scope': 573 sel.flags |= ct.SEL_SCOPE 574 elif pseudo == ':empty': 575 sel.flags |= ct.SEL_EMPTY 576 elif pseudo in (':link', ':any-link'): 577 sel.selectors.append(CSS_LINK) 578 elif pseudo == ':checked': 579 sel.selectors.append(CSS_CHECKED) 580 elif pseudo == ':default': 581 sel.selectors.append(CSS_DEFAULT) 582 elif pseudo == ':indeterminate': 583 sel.selectors.append(CSS_INDETERMINATE) 584 elif pseudo == ":disabled": 585 sel.selectors.append(CSS_DISABLED) 586 elif pseudo == ":enabled": 587 sel.selectors.append(CSS_ENABLED) 588 elif pseudo == ":required": 589 sel.selectors.append(CSS_REQUIRED) 590 elif pseudo == ":optional": 591 sel.selectors.append(CSS_OPTIONAL) 592 elif pseudo == ":read-only": 593 sel.selectors.append(CSS_READ_ONLY) 594 elif pseudo == ":read-write": 595 sel.selectors.append(CSS_READ_WRITE) 596 elif pseudo == ":in-range": 597 sel.selectors.append(CSS_IN_RANGE) 598 elif pseudo == ":out-of-range": 599 sel.selectors.append(CSS_OUT_OF_RANGE) 600 elif pseudo == ":placeholder-shown": 601 sel.selectors.append(CSS_PLACEHOLDER_SHOWN) 602 elif pseudo == ':first-child': 603 sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList())) 604 elif pseudo == ':last-child': 605 sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())) 606 elif pseudo == ':first-of-type': 607 sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList())) 608 elif pseudo == ':last-of-type': 609 sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())) 610 elif pseudo == ':only-child': 611 sel.nth.extend( 612 [ 613 ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()), 614 ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()) 615 ] 616 ) 617 elif pseudo == ':only-of-type': 618 sel.nth.extend( 619 [ 620 ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()), 621 ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()) 622 ] 623 ) 624 has_selector = True 625 elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH: 626 self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) 627 sel.no_match = True 628 has_selector = True 629 elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH: 630 sel.no_match = True 631 has_selector = True 632 elif pseudo in PSEUDO_SUPPORTED: 633 raise SelectorSyntaxError( 634 "Invalid syntax for pseudo class '{}'".format(pseudo), 635 self.pattern, 636 m.start(0) 637 ) 638 else: 639 raise NotImplementedError( 640 "'{}' pseudo-class is not implemented at this time".format(pseudo) 641 ) 642 643 return has_selector, is_html 644 645 def parse_pseudo_nth(self, sel, m, has_selector, iselector): 646 """Parse `nth` pseudo.""" 647 648 mdict = m.groupdict() 649 if mdict.get('pseudo_nth_child'): 650 postfix = '_child' 651 else: 652 postfix = '_type' 653 mdict['name'] = util.lower(css_unescape(mdict['name'])) 654 content = util.lower(mdict.get('nth' + postfix)) 655 if content == 'even': 656 # 2n 657 s1 = 2 658 s2 = 0 659 var = True 660 elif content == 'odd': 661 # 2n+1 662 s1 = 2 663 s2 = 1 664 var = True 665 else: 666 nth_parts = RE_NTH.match(content) 667 s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else '' 668 a = nth_parts.group('a') 669 var = a.endswith('n') 670 if a.startswith('n'): 671 s1 += '1' 672 elif var: 673 s1 += a[:-1] 674 else: 675 s1 += a 676 s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else '' 677 if nth_parts.group('b'): 678 s2 += nth_parts.group('b') 679 else: 680 s2 = '0' 681 s1 = int(s1, 10) 682 s2 = int(s2, 10) 683 684 pseudo_sel = mdict['name'] 685 if postfix == '_child': 686 if m.group('of'): 687 # Parse the rest of `of S`. 688 nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) 689 else: 690 # Use default `*|*` for `of S`. 691 nth_sel = CSS_NTH_OF_S_DEFAULT 692 if pseudo_sel == ':nth-child': 693 sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel)) 694 elif pseudo_sel == ':nth-last-child': 695 sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel)) 696 else: 697 if pseudo_sel == ':nth-of-type': 698 sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList())) 699 elif pseudo_sel == ':nth-last-of-type': 700 sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList())) 701 has_selector = True 702 return has_selector 703 704 def parse_pseudo_open(self, sel, name, has_selector, iselector, index): 705 """Parse pseudo with opening bracket.""" 706 707 flags = FLG_PSEUDO | FLG_OPEN 708 if name == ':not': 709 flags |= FLG_NOT 710 if name == ':has': 711 flags |= FLG_RELATIVE 712 713 sel.selectors.append(self.parse_selectors(iselector, index, flags)) 714 has_selector = True 715 return has_selector 716 717 def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index): 718 """Parse combinator tokens.""" 719 720 combinator = m.group('relation').strip() 721 if not combinator: 722 combinator = WS_COMBINATOR 723 if combinator == COMMA_COMBINATOR: 724 if not has_selector: 725 # If we've not captured any selector parts, the comma is either at the beginning of the pattern 726 # or following another comma, both of which are unexpected. Commas must split selectors. 727 raise SelectorSyntaxError( 728 "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), 729 self.pattern, 730 index 731 ) 732 sel.rel_type = rel_type 733 selectors[-1].relations.append(sel) 734 rel_type = ":" + WS_COMBINATOR 735 selectors.append(_Selector()) 736 else: 737 if has_selector: 738 # End the current selector and associate the leading combinator with this selector. 739 sel.rel_type = rel_type 740 selectors[-1].relations.append(sel) 741 elif rel_type[1:] != WS_COMBINATOR: 742 # It's impossible to have two whitespace combinators after each other as the patterns 743 # will gobble up trailing whitespace. It is also impossible to have a whitespace 744 # combinator after any other kind for the same reason. But we could have 745 # multiple non-whitespace combinators. So if the current combinator is not a whitespace, 746 # then we've hit the multiple combinator case, so we should fail. 747 raise SelectorSyntaxError( 748 'The multiple combinators at position {}'.format(index), 749 self.pattern, 750 index 751 ) 752 # Set the leading combinator for the next selector. 753 rel_type = ':' + combinator 754 sel = _Selector() 755 756 has_selector = False 757 return has_selector, sel, rel_type 758 759 def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index): 760 """Parse combinator tokens.""" 761 762 combinator = m.group('relation').strip() 763 if not combinator: 764 combinator = WS_COMBINATOR 765 if not has_selector: 766 raise SelectorSyntaxError( 767 "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), 768 self.pattern, 769 index 770 ) 771 772 if combinator == COMMA_COMBINATOR: 773 if not sel.tag and not is_pseudo: 774 # Implied `*` 775 sel.tag = ct.SelectorTag('*', None) 776 sel.relations.extend(relations) 777 selectors.append(sel) 778 del relations[:] 779 else: 780 sel.relations.extend(relations) 781 sel.rel_type = combinator 782 del relations[:] 783 relations.append(sel) 784 sel = _Selector() 785 786 has_selector = False 787 return has_selector, sel 788 789 def parse_class_id(self, sel, m, has_selector): 790 """Parse HTML classes and ids.""" 791 792 selector = m.group(0) 793 if selector.startswith('.'): 794 sel.classes.append(css_unescape(selector[1:])) 795 else: 796 sel.ids.append(css_unescape(selector[1:])) 797 has_selector = True 798 return has_selector 799 800 def parse_pseudo_contains(self, sel, m, has_selector): 801 """Parse contains.""" 802 803 values = m.group('values') 804 patterns = [] 805 for token in RE_VALUES.finditer(values): 806 if token.group('split'): 807 continue 808 value = token.group('value') 809 if value.startswith(("'", '"')): 810 value = css_unescape(value[1:-1], True) 811 else: 812 value = css_unescape(value) 813 patterns.append(value) 814 sel.contains.append(ct.SelectorContains(tuple(patterns))) 815 has_selector = True 816 return has_selector 817 818 def parse_pseudo_lang(self, sel, m, has_selector): 819 """Parse pseudo language.""" 820 821 values = m.group('values') 822 patterns = [] 823 for token in RE_VALUES.finditer(values): 824 if token.group('split'): 825 continue 826 value = token.group('value') 827 if value.startswith(('"', "'")): 828 value = css_unescape(value[1:-1], True) 829 else: 830 value = css_unescape(value) 831 832 patterns.append(value) 833 834 sel.lang.append(ct.SelectorLang(patterns)) 835 has_selector = True 836 837 return has_selector 838 839 def parse_pseudo_dir(self, sel, m, has_selector): 840 """Parse pseudo direction.""" 841 842 value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL 843 sel.flags |= value 844 has_selector = True 845 return has_selector 846 847 def parse_selectors(self, iselector, index=0, flags=0): 848 """Parse selectors.""" 849 850 sel = _Selector() 851 selectors = [] 852 has_selector = False 853 closed = False 854 relations = [] 855 rel_type = ":" + WS_COMBINATOR 856 is_open = bool(flags & FLG_OPEN) 857 is_pseudo = bool(flags & FLG_PSEUDO) 858 is_relative = bool(flags & FLG_RELATIVE) 859 is_not = bool(flags & FLG_NOT) 860 is_html = bool(flags & FLG_HTML) 861 is_default = bool(flags & FLG_DEFAULT) 862 is_indeterminate = bool(flags & FLG_INDETERMINATE) 863 is_in_range = bool(flags & FLG_IN_RANGE) 864 is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) 865 is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) 866 867 if self.debug: # pragma: no cover 868 if is_pseudo: 869 print(' is_pseudo: True') 870 if is_open: 871 print(' is_open: True') 872 if is_relative: 873 print(' is_relative: True') 874 if is_not: 875 print(' is_not: True') 876 if is_html: 877 print(' is_html: True') 878 if is_default: 879 print(' is_default: True') 880 if is_indeterminate: 881 print(' is_indeterminate: True') 882 if is_in_range: 883 print(' is_in_range: True') 884 if is_out_of_range: 885 print(' is_out_of_range: True') 886 if is_placeholder_shown: 887 print(' is_placeholder_shown: True') 888 889 if is_relative: 890 selectors.append(_Selector()) 891 892 try: 893 while True: 894 key, m = next(iselector) 895 896 # Handle parts 897 if key == "at_rule": 898 raise NotImplementedError("At-rules found at position {}".format(m.start(0))) 899 elif key == 'pseudo_class_custom': 900 has_selector = self.parse_pseudo_class_custom(sel, m, has_selector) 901 elif key == 'pseudo_class': 902 has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html) 903 elif key == 'pseudo_element': 904 raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0))) 905 elif key == 'pseudo_contains': 906 has_selector = self.parse_pseudo_contains(sel, m, has_selector) 907 elif key in ('pseudo_nth_type', 'pseudo_nth_child'): 908 has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector) 909 elif key == 'pseudo_lang': 910 has_selector = self.parse_pseudo_lang(sel, m, has_selector) 911 elif key == 'pseudo_dir': 912 has_selector = self.parse_pseudo_dir(sel, m, has_selector) 913 # Currently only supports HTML 914 is_html = True 915 elif key == 'pseudo_close': 916 if not has_selector: 917 raise SelectorSyntaxError( 918 "Expected a selector at postion {}".format(m.start(0)), 919 self.pattern, 920 m.start(0) 921 ) 922 if is_open: 923 closed = True 924 break 925 else: 926 raise SelectorSyntaxError( 927 "Unmatched pseudo-class close at postion {}".format(m.start(0)), 928 self.pattern, 929 m.start(0) 930 ) 931 elif key == 'combine': 932 if is_relative: 933 has_selector, sel, rel_type = self.parse_has_combinator( 934 sel, m, has_selector, selectors, rel_type, index 935 ) 936 else: 937 has_selector, sel = self.parse_combinator( 938 sel, m, has_selector, selectors, relations, is_pseudo, index 939 ) 940 elif key == 'attribute': 941 has_selector = self.parse_attribute_selector(sel, m, has_selector) 942 elif key == 'tag': 943 if has_selector: 944 raise SelectorSyntaxError( 945 "Tag name found at position {} instead of at the start".format(m.start(0)), 946 self.pattern, 947 m.start(0) 948 ) 949 has_selector = self.parse_tag_pattern(sel, m, has_selector) 950 elif key in ('class', 'id'): 951 has_selector = self.parse_class_id(sel, m, has_selector) 952 953 index = m.end(0) 954 except StopIteration: 955 pass 956 957 if is_open and not closed: 958 raise SelectorSyntaxError( 959 "Unclosed pseudo-class at position {}".format(index), 960 self.pattern, 961 index 962 ) 963 964 if has_selector: 965 if not sel.tag and not is_pseudo: 966 # Implied `*` 967 sel.tag = ct.SelectorTag('*', None) 968 if is_relative: 969 sel.rel_type = rel_type 970 selectors[-1].relations.append(sel) 971 else: 972 sel.relations.extend(relations) 973 del relations[:] 974 selectors.append(sel) 975 else: 976 # We will always need to finish a selector when `:has()` is used as it leads with combining. 977 raise SelectorSyntaxError( 978 'Expected a selector at position {}'.format(index), 979 self.pattern, 980 index 981 ) 982 983 # Some patterns require additional logic, such as default. We try to make these the 984 # last pattern, and append the appropriate flag to that selector which communicates 985 # to the matcher what additional logic is required. 986 if is_default: 987 selectors[-1].flags = ct.SEL_DEFAULT 988 if is_indeterminate: 989 selectors[-1].flags = ct.SEL_INDETERMINATE 990 if is_in_range: 991 selectors[-1].flags = ct.SEL_IN_RANGE 992 if is_out_of_range: 993 selectors[-1].flags = ct.SEL_OUT_OF_RANGE 994 if is_placeholder_shown: 995 selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN 996 997 return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) 998 999 def selector_iter(self, pattern): 1000 """Iterate selector tokens.""" 1001 1002 # Ignore whitespace and comments at start and end of pattern 1003 m = RE_WS_BEGIN.search(pattern) 1004 index = m.end(0) if m else 0 1005 m = RE_WS_END.search(pattern) 1006 end = (m.start(0) - 1) if m else (len(pattern) - 1) 1007 1008 if self.debug: # pragma: no cover 1009 print('## PARSING: {!r}'.format(pattern)) 1010 while index <= end: 1011 m = None 1012 for v in self.css_tokens: 1013 m = v.match(pattern, index, self.flags) 1014 if m: 1015 name = v.get_name() 1016 if self.debug: # pragma: no cover 1017 print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0))) 1018 index = m.end(0) 1019 yield name, m 1020 break 1021 if m is None: 1022 c = pattern[index] 1023 # If the character represents the start of one of the known selector types, 1024 # throw an exception mentioning that the known selector type is in error; 1025 # otherwise, report the invalid character. 1026 if c == '[': 1027 msg = "Malformed attribute selector at position {}".format(index) 1028 elif c == '.': 1029 msg = "Malformed class selector at position {}".format(index) 1030 elif c == '#': 1031 msg = "Malformed id selector at position {}".format(index) 1032 elif c == ':': 1033 msg = "Malformed pseudo-class selector at position {}".format(index) 1034 else: 1035 msg = "Invalid character {!r} position {}".format(c, index) 1036 raise SelectorSyntaxError(msg, self.pattern, index) 1037 if self.debug: # pragma: no cover 1038 print('## END PARSING') 1039 1040 def process_selectors(self, index=0, flags=0): 1041 """Process selectors.""" 1042 1043 return self.parse_selectors(self.selector_iter(self.pattern), index, flags) 1044 1045 1046# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern) 1047# A few patterns are order dependent as they use patterns previous compiled. 1048 1049# CSS pattern for `:link` and `:any-link` 1050CSS_LINK = CSSParser( 1051 'html|*:is(a, area, link)[href]' 1052).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1053# CSS pattern for `:checked` 1054CSS_CHECKED = CSSParser( 1055 ''' 1056 html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected] 1057 ''' 1058).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1059# CSS pattern for `:default` (must compile CSS_CHECKED first) 1060CSS_DEFAULT = CSSParser( 1061 ''' 1062 :checked, 1063 1064 /* 1065 This pattern must be at the end. 1066 Special logic is applied to the last selector. 1067 */ 1068 html|form html|*:is(button, input)[type="submit"] 1069 ''' 1070).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT) 1071# CSS pattern for `:indeterminate` 1072CSS_INDETERMINATE = CSSParser( 1073 ''' 1074 html|input[type="checkbox"][indeterminate], 1075 html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]), 1076 html|progress:not([value]), 1077 1078 /* 1079 This pattern must be at the end. 1080 Special logic is applied to the last selector. 1081 */ 1082 html|input[type="radio"][name][name!='']:not([checked]) 1083 ''' 1084).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) 1085# CSS pattern for `:disabled` 1086CSS_DISABLED = CSSParser( 1087 ''' 1088 html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], 1089 html|optgroup[disabled] > html|option, 1090 html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset), 1091 html|fieldset[disabled] > 1092 html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset) 1093 ''' 1094).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1095# CSS pattern for `:enabled` 1096CSS_ENABLED = CSSParser( 1097 ''' 1098 html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) 1099 ''' 1100).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1101# CSS pattern for `:required` 1102CSS_REQUIRED = CSSParser( 1103 'html|*:is(input, textarea, select)[required]' 1104).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1105# CSS pattern for `:optional` 1106CSS_OPTIONAL = CSSParser( 1107 'html|*:is(input, textarea, select):not([required])' 1108).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1109# CSS pattern for `:placeholder-shown` 1110CSS_PLACEHOLDER_SHOWN = CSSParser( 1111 ''' 1112 html|input:is( 1113 :not([type]), 1114 [type=""], 1115 [type=text], 1116 [type=search], 1117 [type=url], 1118 [type=tel], 1119 [type=email], 1120 [type=password], 1121 [type=number] 1122 )[placeholder][placeholder!='']:is(:not([value]), [value=""]), 1123 html|textarea[placeholder][placeholder!=''] 1124 ''' 1125).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) 1126# CSS pattern default for `:nth-child` "of S" feature 1127CSS_NTH_OF_S_DEFAULT = CSSParser( 1128 '*|*' 1129).process_selectors(flags=FLG_PSEUDO) 1130# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first) 1131CSS_READ_WRITE = CSSParser( 1132 ''' 1133 html|*:is( 1134 textarea, 1135 input:is( 1136 :not([type]), 1137 [type=""], 1138 [type=text], 1139 [type=search], 1140 [type=url], 1141 [type=tel], 1142 [type=email], 1143 [type=number], 1144 [type=password], 1145 [type=date], 1146 [type=datetime-local], 1147 [type=month], 1148 [type=time], 1149 [type=week] 1150 ) 1151 ):not([readonly], :disabled), 1152 html|*:is([contenteditable=""], [contenteditable="true" i]) 1153 ''' 1154).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1155# CSS pattern for `:read-only` 1156CSS_READ_ONLY = CSSParser( 1157 ''' 1158 html|*:not(:read-write) 1159 ''' 1160).process_selectors(flags=FLG_PSEUDO | FLG_HTML) 1161# CSS pattern for `:in-range` 1162CSS_IN_RANGE = CSSParser( 1163 ''' 1164 html|input:is( 1165 [type="date"], 1166 [type="month"], 1167 [type="week"], 1168 [type="time"], 1169 [type="datetime-local"], 1170 [type="number"], 1171 [type="range"] 1172 ):is( 1173 [min], 1174 [max] 1175 ) 1176 ''' 1177).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML) 1178# CSS pattern for `:out-of-range` 1179CSS_OUT_OF_RANGE = CSSParser( 1180 ''' 1181 html|input:is( 1182 [type="date"], 1183 [type="month"], 1184 [type="week"], 1185 [type="time"], 1186 [type="datetime-local"], 1187 [type="number"], 1188 [type="range"] 1189 ):is( 1190 [min], 1191 [max] 1192 ) 1193 ''' 1194).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML) 1195