1"""CSS selector parser."""
2import re
3from functools import lru_cache
4from . import util
5from . import css_match as cm
6from . import css_types as ct
7from .util import SelectorSyntaxError
8
# Code point substituted by `css_unescape` when an escape resolves to NULL.
UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Pseudo-classes that are written without parentheses and can match elements.
PSEUDO_SIMPLE = {
    ':any-link',
    ':checked',
    ':default',
    ':defined',
    ':disabled',
    ':empty',
    ':enabled',
    ':first-child',
    ':first-of-type',
    ':in-range',
    ':indeterminate',
    ':last-child',
    ':last-of-type',
    ':link',
    ':only-child',
    ':only-of-type',
    ':optional',
    ':out-of-range',
    ':placeholder-shown',
    ':read-only',
    ':read-write',
    ':required',
    ':root',
    ':scope'
}

# Pseudo-classes without parentheses that are accepted by the parser
# but match nothing in the Soup Sieve environment.
PSEUDO_SIMPLE_NO_MATCH = {
    ':active',
    ':current',
    ':focus',
    ':focus-visible',
    ':focus-within',
    ':future',
    ':host',
    ':hover',
    ':local-link',
    ':past',
    ':paused',
    ':playing',
    ':target',
    ':target-within',
    ':user-invalid',
    ':visited'
}

# Functional pseudo-classes whose argument is a selector list.
PSEUDO_COMPLEX = {
    ':contains',
    ':has',
    ':is',
    ':matches',
    ':not',
    ':where'
}

# Functional pseudo-classes whose argument is parsed, but which never match.
PSEUDO_COMPLEX_NO_MATCH = {
    ':current',
    ':host',
    ':host-context'
}

# Functional pseudo-classes whose argument has a dedicated syntax and is handled specially.
PSEUDO_SPECIAL = {
    ':dir',
    ':lang',
    ':nth-child',
    ':nth-last-child',
    ':nth-last-of-type',
    ':nth-of-type'
}

# Every pseudo-class name known to the parser, regardless of form.
PSEUDO_SUPPORTED = set().union(
    PSEUDO_SIMPLE,
    PSEUDO_SIMPLE_NO_MATCH,
    PSEUDO_COMPLEX,
    PSEUDO_COMPLEX_NO_MATCH,
    PSEUDO_SPECIAL
)
86
# Sub-pattern parts. These are plain strings that get embedded into the larger patterns below;
# the multi-line ones contain literal newlines and so only work when compiled with `re.X`.
# Whitespace: a newline is `\r\n` (taken as one unit), `\n`, `\f` or `\r`.
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
WS = r'(?:[ \t]|{})'.format(NEWLINE)
# Comments (`/* ... */`)
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS)
# CSS escapes: a backslash followed by 1-6 hex digits and one optional whitespace, by any
# character that is not a newline, or by the end of the input. The hex class is lowercase only,
# so uppercase hex digits are treated as hex only where the pattern is compiled with `re.I`.
CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS)
# Same as above, but an escaped newline is also allowed (used inside quoted strings).
CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE)
# CSS Identifier
IDENTIFIER = r'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*)
'''.format(esc=CSS_ESCAPES)
# `nth` content (`an+b` without the `even`/`odd` keywords, which the `nth` patterns add)
NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC)
# Value: double quoted string, single quoted string, or identifier
VALUE = r'''
(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+)
'''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison: the optional `<cmp> <value> [i|s]` part plus the closing `]`.
# `!=` is handled specially as it is non-standard.
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)
113
# Selector patterns. These are compiled through `SelectorPattern`, i.e. with `re.I | re.X | re.U`.
# IDs (`#id`). The `#` is escaped because it would otherwise start a comment under `re.X`.
PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`). The namespace prefix may be an identifier, `*`, or empty (`|tag`).
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`). The `open` group is set only when a `(` follows.
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes;
# unlike `PAT_PSEUDO_CLASS`, the opening parenthesis is required.
PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER)
# Custom pseudo class (`:--custom-pseudo`). The lookahead requires the name to start with `--`.
PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER)
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC)
# Pseudo element (`::pseudo-element`): an extra `:` in front of the pseudo class pattern
PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS)
# At rule (`@page`, `@media`, etc.) (not supported). The pattern is `@` followed by an
# identifier; it is registered as the `at_rule` token so any at-rule can be recognized.
PAT_AT_RULE = r'@{ident}'.format(ident=IDENTIFIER)
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`). Either the closing `)` is consumed,
# or the `of` keyword is captured in the `of` group and the selector list that follows is
# left for the caller to parse. The opening `(` is required, so `:first-child` and the like
# are not matched by this pattern.
PAT_PSEUDO_NTH_CHILD = r'''
(?P<pseudo_nth_child>{name}
(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*))
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH)
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`). The opening `(` is required, so
# `:first-of-type` and the like are not matched by this pattern.
PAT_PSEUDO_NTH_TYPE = r'''
(?P<pseudo_nth_type>{name}
(?P<nth_type>{nth}|even|odd)){ws}*\)
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`): one or more comma separated values
PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`). Whitespace only counts as the relation
# when it is not followed by one of the other combinators.
PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
# Extra: Contains (`:contains(text)`): one or more comma separated values
PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
159
# Regular expressions
# CSS escape patterns used by `css_unescape`. Group 1 is a hex escape (with its optional
# terminating whitespace), group 2 an escaped character, group 3 a backslash at the end of
# the input and, for strings only, group 4 an escaped newline.
# A hex escape is terminated by a single optional whitespace, never by a comment, so both
# patterns use `WS` here; this also matches `CSS_ESCAPES`, the pattern that recognizes the
# escapes while tokenizing.
RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WS), re.I)
RE_CSS_STR_ESC = re.compile(
    r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
)
# Pattern to break up `nth` specifiers: `s1` and `a` are the sign and the coefficient part
# (including a trailing `n` if present), `s2` and `b` the sign and digits of the offset.
RE_NTH = re.compile(
    r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
    re.I
)
# Pattern to iterate multiple values: each match is either a `value` or a `split` (the comma
# with its surrounding whitespace). `re.X` is required because `VALUE` spans multiple lines.
RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
RE_WS_END = re.compile('{}*$'.format(WSC))
# Validates a complete custom pseudo-class name (`:--name`). Compiled without `re.I`.
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
178
# Constants
# Token that separates the selectors of a selector list
COMMA_COMBINATOR = ','
# Token used for the descendant (whitespace) relation
WS_COMBINATOR = ' '

# Parse flags. Each flag owns a single bit so that flags can be combined with `|`.
FLG_PSEUDO = 1 << 0
FLG_NOT = 1 << 1
FLG_RELATIVE = 1 << 2
FLG_DEFAULT = 1 << 3
FLG_HTML = 1 << 4
FLG_INDETERMINATE = 1 << 5
FLG_OPEN = 1 << 6
FLG_IN_RANGE = 1 << 7
FLG_OUT_OF_RANGE = 1 << 8
FLG_PLACEHOLDER_SHOWN = 1 << 9

# Upper bound on the number of compiled patterns kept in the cache
_MAXCACHE = 500
199
200
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags):
    """
    Compile `pattern` into a `SoupSieve` object and memoize the result.

    The function is wrapped in `lru_cache`, so every argument must be hashable;
    at most `_MAXCACHE` compiled patterns are retained.
    """

    parsed = CSSParser(pattern, custom=process_custom(custom), flags=flags).process_selectors()
    return cm.SoupSieve(pattern, parsed, namespaces, custom, flags)
213
214
def _purge_cache():
    """Drop every compiled pattern memoized by `_cached_css_compile`."""

    _cached_css_compile.cache_clear()
219
220
def process_custom(custom):
    """
    Normalize a mapping of custom pseudo-class names to their selectors.

    `custom` is either `None` or a mapping whose keys are custom pseudo-class names
    (`:--name`). Each name is validated, and the result is a new dictionary keyed by the
    normalized name, i.e. the name unescaped and lowercased, which is the same form the
    parser uses when it looks up a custom pseudo-class.

    Raises `SelectorSyntaxError` if a name is not a valid custom pseudo-class name and
    `KeyError` if two names normalize to the same key.
    """

    custom_selectors = {}
    if custom is None:
        return custom_selectors

    for key, value in custom.items():
        # `RE_CUSTOM` is case sensitive, so validation is done on the lowered name.
        raw_name = util.lower(key)
        if RE_CUSTOM.match(raw_name) is None:
            raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(raw_name))

        # An escape can produce an uppercase character (`\41` is `A`), so lower again after
        # unescaping. Duplicates must be detected on this final form, as it is the actual key.
        name = util.lower(css_unescape(raw_name))
        if name in custom_selectors:
            raise KeyError("The custom selector '{}' has already been registered".format(name))
        custom_selectors[name] = value
    return custom_selectors
234
235
def css_unescape(content, string=False):
    """
    Unescape CSS value.

    `content` is the raw text of an identifier or, when `string` is true, the inside of a
    quoted string. Strings allow for spanning the value on multiple lines by escaping a
    new line; such an escaped new line is removed.

    Returns the text with every escape sequence replaced by the character it stands for.
    """

    def replace(m):
        """Replace with the appropriate substitute."""

        hex_escape = m.group(1)
        if hex_escape:
            # `int` tolerates the optional whitespace that terminates the escape.
            codepoint = int(hex_escape[1:], 16)
            # NULL, surrogates and values past the last Unicode code point have no valid
            # character, so they become the replacement character. The upper bound also
            # keeps `chr` from raising `ValueError` on input such as `\ffffff`.
            if codepoint == 0 or 0xD800 <= codepoint <= 0xDFFF or codepoint > 0x10FFFF:
                codepoint = UNICODE_REPLACEMENT_CHAR
            return chr(codepoint)

        if m.group(2):
            # Escaped character: drop the backslash.
            return m.group(2)[1:]

        if m.group(3):
            # A lone backslash at the end of the input.
            return '\ufffd'

        # Escaped new line inside a string.
        return ''

    return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
261
262
def escape(ident):
    """
    Escape identifier.

    Returns `ident` rewritten so it can be used as a CSS identifier: NULL becomes U+FFFD,
    control characters and a digit in leading position are written as hex escapes, ASCII
    letters, digits, `-`, `_` and all non-ASCII characters are kept, and anything else is
    prefixed with a backslash.
    """

    # An identifier that is a single `-` with no other characters must be escaped.
    if ident == '-':
        return '\\-'

    verbatim = '-_0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    leading_dash = ident[:1] == '-'

    def serialize(position, char):
        """Return the escaped form of the character at `position`."""

        code = ord(char)
        if code == 0x00:
            return '\ufffd'

        is_control = code <= 0x1F or code == 0x7F
        # A digit may not start the identifier, nor directly follow a leading dash.
        starts_number = '0' <= char <= '9' and (position == 0 or (leading_dash and position == 1))
        if is_control or starts_number:
            return '\\{:x} '.format(code)

        if code >= 0x80 or char in verbatim:
            return char
        return '\\{}'.format(char)

    return ''.join(serialize(position, char) for position, char in enumerate(ident))
289
290
class SelectorPattern(object):
    """A named regular expression that recognizes one kind of selector token."""

    def __init__(self, name, pattern):
        """Store the token `name` and compile `pattern` (case insensitive, verbose, Unicode)."""

        compile_flags = re.I | re.X | re.U
        self.name = name
        self.re_pattern = re.compile(pattern, compile_flags)

    def get_name(self):
        """Return the name of the token this pattern recognizes."""

        return self.name

    def match(self, selector, index, flags):
        """
        Try to match the token in `selector` starting at `index`.

        Returns the match object, or `None` when the token is not found at that position.
        `flags` is accepted for interface compatibility and is not used here.
        """

        return self.re_pattern.match(selector, index)
309
310
class SpecialPseudoPattern(SelectorPattern):
    """Selector pattern that picks a dedicated pattern based on the pseudo-class name."""

    def __init__(self, patterns):
        """
        Build the name to pattern lookup.

        Each entry of `patterns` is `(token name, pseudo-class names, regex, pattern class)`;
        one pattern object is created per entry and shared by all of its pseudo-class names.
        """

        self.patterns = {}
        for entry in patterns:
            token_name = entry[0]
            compiled = entry[3](token_name, entry[2])
            self.patterns.update(dict.fromkeys(entry[1], compiled))

        # Pattern object of the most recent successful match; read by `get_name`.
        self.matched_name = None
        self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)

    def get_name(self):
        """Return the token name of the pattern that matched last."""

        return self.matched_name.get_name()

    def match(self, selector, index, flags):
        """
        Match a special pseudo-class in `selector` starting at `index`.

        The pseudo-class name is read first and used to select the dedicated pattern, which
        is then matched from the same position. Returns its match object, or `None` if the
        name is not followed by `(`, is not registered, or the dedicated pattern fails.
        """

        header = self.re_pseudo_name.match(selector, index)
        if not header:
            return None

        handler = self.patterns.get(util.lower(css_unescape(header.group('name'))))
        if not handler:
            return None

        found = handler.match(selector, index, flags)
        if found:
            self.matched_name = handler
        return found
346
347
348class _Selector(object):
349    """
350    Intermediate selector class.
351
352    This stores selector data for a compound selector as we are acquiring them.
353    Once we are done collecting the data for a compound selector, we freeze
354    the data in an object that can be pickled and hashed.
355    """
356
357    def __init__(self, **kwargs):
358        """Initialize."""
359
360        self.tag = kwargs.get('tag', None)
361        self.ids = kwargs.get('ids', [])
362        self.classes = kwargs.get('classes', [])
363        self.attributes = kwargs.get('attributes', [])
364        self.nth = kwargs.get('nth', [])
365        self.selectors = kwargs.get('selectors', [])
366        self.relations = kwargs.get('relations', [])
367        self.rel_type = kwargs.get('rel_type', None)
368        self.contains = kwargs.get('contains', [])
369        self.lang = kwargs.get('lang', [])
370        self.flags = kwargs.get('flags', 0)
371        self.no_match = kwargs.get('no_match', False)
372
373    def _freeze_relations(self, relations):
374        """Freeze relation."""
375
376        if relations:
377            sel = relations[0]
378            sel.relations.extend(relations[1:])
379            return ct.SelectorList([sel.freeze()])
380        else:
381            return ct.SelectorList()
382
383    def freeze(self):
384        """Freeze self."""
385
386        if self.no_match:
387            return ct.SelectorNull()
388        else:
389            return ct.Selector(
390                self.tag,
391                tuple(self.ids),
392                tuple(self.classes),
393                tuple(self.attributes),
394                tuple(self.nth),
395                tuple(self.selectors),
396                self._freeze_relations(self.relations),
397                self.rel_type,
398                tuple(self.contains),
399                tuple(self.lang),
400                self.flags
401            )
402
403    def __str__(self):  # pragma: no cover
404        """String representation."""
405
406        return (
407            '_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, '
408            'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})'
409        ).format(
410            self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors,
411            self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match
412        )
413
414    __repr__ = __str__
415
416
class CSSParser(object):
    """Parse CSS selectors."""

    # Token patterns of the selector grammar, each tagged with the token name it produces.
    # `SpecialPseudoPattern` bundles the pseudo-classes whose arguments have a dedicated
    # syntax and selects the right pattern from the pseudo-class name.
    # NOTE(review): the order presumably sets matching priority (e.g. the custom pseudo-class
    # pattern precedes the generic one) -- the loop that consumes this tuple is not in the
    # visible part of the file; confirm before reordering.
    css_tokens = (
        SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
        SpecialPseudoPattern(
            (
                ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS, SelectorPattern),
                ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
                ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
                ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
                ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
            )
        ),
        SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
        SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
        SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
        SelectorPattern("at_rule", PAT_AT_RULE),
        SelectorPattern("id", PAT_ID),
        SelectorPattern("class", PAT_CLASS),
        SelectorPattern("tag", PAT_TAG),
        SelectorPattern("attribute", PAT_ATTR),
        SelectorPattern("combine", PAT_COMBINE)
    )
441
442    def __init__(self, selector, custom=None, flags=0):
443        """Initialize."""
444
445        self.pattern = selector.replace('\x00', '\ufffd')
446        self.flags = flags
447        self.debug = self.flags & util.DEBUG
448        self.custom = {} if custom is None else custom
449
450    def parse_attribute_selector(self, sel, m, has_selector):
451        """Create attribute selector from the returned regex match."""
452
453        inverse = False
454        op = m.group('cmp')
455        case = util.lower(m.group('case')) if m.group('case') else None
456        ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
457        attr = css_unescape(m.group('attr_name'))
458        is_type = False
459        pattern2 = None
460
461        if case:
462            flags = re.I if case == 'i' else 0
463        elif util.lower(attr) == 'type':
464            flags = re.I
465            is_type = True
466        else:
467            flags = 0
468
469        if op:
470            if m.group('value').startswith(('"', "'")):
471                value = css_unescape(m.group('value')[1:-1], True)
472            else:
473                value = css_unescape(m.group('value'))
474        else:
475            value = None
476        if not op:
477            # Attribute name
478            pattern = None
479        elif op.startswith('^'):
480            # Value start with
481            pattern = re.compile(r'^%s.*' % re.escape(value), flags)
482        elif op.startswith('$'):
483            # Value ends with
484            pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
485        elif op.startswith('*'):
486            # Value contains
487            pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
488        elif op.startswith('~'):
489            # Value contains word within space separated list
490            # `~=` should match nothing if it is empty or contains whitespace,
491            # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
492            value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
493            pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
494        elif op.startswith('|'):
495            # Value starts with word in dash separated list
496            pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
497        else:
498            # Value matches
499            pattern = re.compile(r'^%s$' % re.escape(value), flags)
500            if op.startswith('!'):
501                # Equivalent to `:not([attr=value])`
502                inverse = True
503        if is_type and pattern:
504            pattern2 = re.compile(pattern.pattern)
505
506        # Append the attribute selector
507        sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
508        if inverse:
509            # If we are using `!=`, we need to nest the pattern under a `:not()`.
510            sub_sel = _Selector()
511            sub_sel.attributes.append(sel_attr)
512            not_list = ct.SelectorList([sub_sel.freeze()], True, False)
513            sel.selectors.append(not_list)
514        else:
515            sel.attributes.append(sel_attr)
516
517        has_selector = True
518        return has_selector
519
520    def parse_tag_pattern(self, sel, m, has_selector):
521        """Parse tag pattern from regex match."""
522
523        prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
524        tag = css_unescape(m.group('tag_name'))
525        sel.tag = ct.SelectorTag(tag, prefix)
526        has_selector = True
527        return has_selector
528
529    def parse_pseudo_class_custom(self, sel, m, has_selector):
530        """
531        Parse custom pseudo class alias.
532
533        Compile custom selectors as we need them. When compiling a custom selector,
534        set it to `None` in the dictionary so we can avoid an infinite loop.
535        """
536
537        pseudo = util.lower(css_unescape(m.group('name')))
538        selector = self.custom.get(pseudo)
539        if selector is None:
540            raise SelectorSyntaxError(
541                "Undefined custom selector '{}' found at postion {}".format(pseudo, m.end(0)),
542                self.pattern,
543                m.end(0)
544            )
545
546        if not isinstance(selector, ct.SelectorList):
547            self.custom[pseudo] = None
548            selector = CSSParser(
549                selector, custom=self.custom, flags=self.flags
550            ).process_selectors(flags=FLG_PSEUDO)
551            self.custom[pseudo] = selector
552
553        sel.selectors.append(selector)
554        has_selector = True
555        return has_selector
556
557    def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html):
558        """Parse pseudo class."""
559
560        complex_pseudo = False
561        pseudo = util.lower(css_unescape(m.group('name')))
562        if m.group('open'):
563            complex_pseudo = True
564        if complex_pseudo and pseudo in PSEUDO_COMPLEX:
565            has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
566        elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
567            if pseudo == ':root':
568                sel.flags |= ct.SEL_ROOT
569            elif pseudo == ':defined':
570                sel.flags |= ct.SEL_DEFINED
571                is_html = True
572            elif pseudo == ':scope':
573                sel.flags |= ct.SEL_SCOPE
574            elif pseudo == ':empty':
575                sel.flags |= ct.SEL_EMPTY
576            elif pseudo in (':link', ':any-link'):
577                sel.selectors.append(CSS_LINK)
578            elif pseudo == ':checked':
579                sel.selectors.append(CSS_CHECKED)
580            elif pseudo == ':default':
581                sel.selectors.append(CSS_DEFAULT)
582            elif pseudo == ':indeterminate':
583                sel.selectors.append(CSS_INDETERMINATE)
584            elif pseudo == ":disabled":
585                sel.selectors.append(CSS_DISABLED)
586            elif pseudo == ":enabled":
587                sel.selectors.append(CSS_ENABLED)
588            elif pseudo == ":required":
589                sel.selectors.append(CSS_REQUIRED)
590            elif pseudo == ":optional":
591                sel.selectors.append(CSS_OPTIONAL)
592            elif pseudo == ":read-only":
593                sel.selectors.append(CSS_READ_ONLY)
594            elif pseudo == ":read-write":
595                sel.selectors.append(CSS_READ_WRITE)
596            elif pseudo == ":in-range":
597                sel.selectors.append(CSS_IN_RANGE)
598            elif pseudo == ":out-of-range":
599                sel.selectors.append(CSS_OUT_OF_RANGE)
600            elif pseudo == ":placeholder-shown":
601                sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
602            elif pseudo == ':first-child':
603                sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
604            elif pseudo == ':last-child':
605                sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
606            elif pseudo == ':first-of-type':
607                sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
608            elif pseudo == ':last-of-type':
609                sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
610            elif pseudo == ':only-child':
611                sel.nth.extend(
612                    [
613                        ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
614                        ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
615                    ]
616                )
617            elif pseudo == ':only-of-type':
618                sel.nth.extend(
619                    [
620                        ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
621                        ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
622                    ]
623                )
624            has_selector = True
625        elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
626            self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
627            sel.no_match = True
628            has_selector = True
629        elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
630            sel.no_match = True
631            has_selector = True
632        elif pseudo in PSEUDO_SUPPORTED:
633            raise SelectorSyntaxError(
634                "Invalid syntax for pseudo class '{}'".format(pseudo),
635                self.pattern,
636                m.start(0)
637            )
638        else:
639            raise NotImplementedError(
640                "'{}' pseudo-class is not implemented at this time".format(pseudo)
641            )
642
643        return has_selector, is_html
644
645    def parse_pseudo_nth(self, sel, m, has_selector, iselector):
646        """Parse `nth` pseudo."""
647
648        mdict = m.groupdict()
649        if mdict.get('pseudo_nth_child'):
650            postfix = '_child'
651        else:
652            postfix = '_type'
653        mdict['name'] = util.lower(css_unescape(mdict['name']))
654        content = util.lower(mdict.get('nth' + postfix))
655        if content == 'even':
656            # 2n
657            s1 = 2
658            s2 = 0
659            var = True
660        elif content == 'odd':
661            # 2n+1
662            s1 = 2
663            s2 = 1
664            var = True
665        else:
666            nth_parts = RE_NTH.match(content)
667            s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
668            a = nth_parts.group('a')
669            var = a.endswith('n')
670            if a.startswith('n'):
671                s1 += '1'
672            elif var:
673                s1 += a[:-1]
674            else:
675                s1 += a
676            s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
677            if nth_parts.group('b'):
678                s2 += nth_parts.group('b')
679            else:
680                s2 = '0'
681            s1 = int(s1, 10)
682            s2 = int(s2, 10)
683
684        pseudo_sel = mdict['name']
685        if postfix == '_child':
686            if m.group('of'):
687                # Parse the rest of `of S`.
688                nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
689            else:
690                # Use default `*|*` for `of S`.
691                nth_sel = CSS_NTH_OF_S_DEFAULT
692            if pseudo_sel == ':nth-child':
693                sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
694            elif pseudo_sel == ':nth-last-child':
695                sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
696        else:
697            if pseudo_sel == ':nth-of-type':
698                sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
699            elif pseudo_sel == ':nth-last-of-type':
700                sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
701        has_selector = True
702        return has_selector
703
704    def parse_pseudo_open(self, sel, name, has_selector, iselector, index):
705        """Parse pseudo with opening bracket."""
706
707        flags = FLG_PSEUDO | FLG_OPEN
708        if name == ':not':
709            flags |= FLG_NOT
710        if name == ':has':
711            flags |= FLG_RELATIVE
712
713        sel.selectors.append(self.parse_selectors(iselector, index, flags))
714        has_selector = True
715        return has_selector
716
717    def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index):
718        """Parse combinator tokens."""
719
720        combinator = m.group('relation').strip()
721        if not combinator:
722            combinator = WS_COMBINATOR
723        if combinator == COMMA_COMBINATOR:
724            if not has_selector:
725                # If we've not captured any selector parts, the comma is either at the beginning of the pattern
726                # or following another comma, both of which are unexpected. Commas must split selectors.
727                raise SelectorSyntaxError(
728                    "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
729                    self.pattern,
730                    index
731                )
732            sel.rel_type = rel_type
733            selectors[-1].relations.append(sel)
734            rel_type = ":" + WS_COMBINATOR
735            selectors.append(_Selector())
736        else:
737            if has_selector:
738                # End the current selector and associate the leading combinator with this selector.
739                sel.rel_type = rel_type
740                selectors[-1].relations.append(sel)
741            elif rel_type[1:] != WS_COMBINATOR:
742                # It's impossible to have two whitespace combinators after each other as the patterns
743                # will gobble up trailing whitespace. It is also impossible to have a whitespace
744                # combinator after any other kind for the same reason. But we could have
745                # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
746                # then we've hit the multiple combinator case, so we should fail.
747                raise SelectorSyntaxError(
748                    'The multiple combinators at position {}'.format(index),
749                    self.pattern,
750                    index
751                )
752            # Set the leading combinator for the next selector.
753            rel_type = ':' + combinator
754        sel = _Selector()
755
756        has_selector = False
757        return has_selector, sel, rel_type
758
759    def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index):
760        """Parse combinator tokens."""
761
762        combinator = m.group('relation').strip()
763        if not combinator:
764            combinator = WS_COMBINATOR
765        if not has_selector:
766            raise SelectorSyntaxError(
767                "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
768                self.pattern,
769                index
770            )
771
772        if combinator == COMMA_COMBINATOR:
773            if not sel.tag and not is_pseudo:
774                # Implied `*`
775                sel.tag = ct.SelectorTag('*', None)
776            sel.relations.extend(relations)
777            selectors.append(sel)
778            del relations[:]
779        else:
780            sel.relations.extend(relations)
781            sel.rel_type = combinator
782            del relations[:]
783            relations.append(sel)
784        sel = _Selector()
785
786        has_selector = False
787        return has_selector, sel
788
789    def parse_class_id(self, sel, m, has_selector):
790        """Parse HTML classes and ids."""
791
792        selector = m.group(0)
793        if selector.startswith('.'):
794            sel.classes.append(css_unescape(selector[1:]))
795        else:
796            sel.ids.append(css_unescape(selector[1:]))
797        has_selector = True
798        return has_selector
799
800    def parse_pseudo_contains(self, sel, m, has_selector):
801        """Parse contains."""
802
803        values = m.group('values')
804        patterns = []
805        for token in RE_VALUES.finditer(values):
806            if token.group('split'):
807                continue
808            value = token.group('value')
809            if value.startswith(("'", '"')):
810                value = css_unescape(value[1:-1], True)
811            else:
812                value = css_unescape(value)
813            patterns.append(value)
814        sel.contains.append(ct.SelectorContains(tuple(patterns)))
815        has_selector = True
816        return has_selector
817
818    def parse_pseudo_lang(self, sel, m, has_selector):
819        """Parse pseudo language."""
820
821        values = m.group('values')
822        patterns = []
823        for token in RE_VALUES.finditer(values):
824            if token.group('split'):
825                continue
826            value = token.group('value')
827            if value.startswith(('"', "'")):
828                value = css_unescape(value[1:-1], True)
829            else:
830                value = css_unescape(value)
831
832            patterns.append(value)
833
834        sel.lang.append(ct.SelectorLang(patterns))
835        has_selector = True
836
837        return has_selector
838
839    def parse_pseudo_dir(self, sel, m, has_selector):
840        """Parse pseudo direction."""
841
842        value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
843        sel.flags |= value
844        has_selector = True
845        return has_selector
846
847    def parse_selectors(self, iselector, index=0, flags=0):
848        """Parse selectors."""
849
850        sel = _Selector()
851        selectors = []
852        has_selector = False
853        closed = False
854        relations = []
855        rel_type = ":" + WS_COMBINATOR
856        is_open = bool(flags & FLG_OPEN)
857        is_pseudo = bool(flags & FLG_PSEUDO)
858        is_relative = bool(flags & FLG_RELATIVE)
859        is_not = bool(flags & FLG_NOT)
860        is_html = bool(flags & FLG_HTML)
861        is_default = bool(flags & FLG_DEFAULT)
862        is_indeterminate = bool(flags & FLG_INDETERMINATE)
863        is_in_range = bool(flags & FLG_IN_RANGE)
864        is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
865        is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
866
867        if self.debug:  # pragma: no cover
868            if is_pseudo:
869                print('    is_pseudo: True')
870            if is_open:
871                print('    is_open: True')
872            if is_relative:
873                print('    is_relative: True')
874            if is_not:
875                print('    is_not: True')
876            if is_html:
877                print('    is_html: True')
878            if is_default:
879                print('    is_default: True')
880            if is_indeterminate:
881                print('    is_indeterminate: True')
882            if is_in_range:
883                print('    is_in_range: True')
884            if is_out_of_range:
885                print('    is_out_of_range: True')
886            if is_placeholder_shown:
887                print('    is_placeholder_shown: True')
888
889        if is_relative:
890            selectors.append(_Selector())
891
892        try:
893            while True:
894                key, m = next(iselector)
895
896                # Handle parts
897                if key == "at_rule":
898                    raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
899                elif key == 'pseudo_class_custom':
900                    has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
901                elif key == 'pseudo_class':
902                    has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
903                elif key == 'pseudo_element':
904                    raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
905                elif key == 'pseudo_contains':
906                    has_selector = self.parse_pseudo_contains(sel, m, has_selector)
907                elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
908                    has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
909                elif key == 'pseudo_lang':
910                    has_selector = self.parse_pseudo_lang(sel, m, has_selector)
911                elif key == 'pseudo_dir':
912                    has_selector = self.parse_pseudo_dir(sel, m, has_selector)
913                    # Currently only supports HTML
914                    is_html = True
915                elif key == 'pseudo_close':
916                    if not has_selector:
917                        raise SelectorSyntaxError(
918                            "Expected a selector at postion {}".format(m.start(0)),
919                            self.pattern,
920                            m.start(0)
921                        )
922                    if is_open:
923                        closed = True
924                        break
925                    else:
926                        raise SelectorSyntaxError(
927                            "Unmatched pseudo-class close at postion {}".format(m.start(0)),
928                            self.pattern,
929                            m.start(0)
930                        )
931                elif key == 'combine':
932                    if is_relative:
933                        has_selector, sel, rel_type = self.parse_has_combinator(
934                            sel, m, has_selector, selectors, rel_type, index
935                        )
936                    else:
937                        has_selector, sel = self.parse_combinator(
938                            sel, m, has_selector, selectors, relations, is_pseudo, index
939                        )
940                elif key == 'attribute':
941                    has_selector = self.parse_attribute_selector(sel, m, has_selector)
942                elif key == 'tag':
943                    if has_selector:
944                        raise SelectorSyntaxError(
945                            "Tag name found at position {} instead of at the start".format(m.start(0)),
946                            self.pattern,
947                            m.start(0)
948                        )
949                    has_selector = self.parse_tag_pattern(sel, m, has_selector)
950                elif key in ('class', 'id'):
951                    has_selector = self.parse_class_id(sel, m, has_selector)
952
953                index = m.end(0)
954        except StopIteration:
955            pass
956
957        if is_open and not closed:
958            raise SelectorSyntaxError(
959                "Unclosed pseudo-class at position {}".format(index),
960                self.pattern,
961                index
962            )
963
964        if has_selector:
965            if not sel.tag and not is_pseudo:
966                # Implied `*`
967                sel.tag = ct.SelectorTag('*', None)
968            if is_relative:
969                sel.rel_type = rel_type
970                selectors[-1].relations.append(sel)
971            else:
972                sel.relations.extend(relations)
973                del relations[:]
974                selectors.append(sel)
975        else:
976            # We will always need to finish a selector when `:has()` is used as it leads with combining.
977            raise SelectorSyntaxError(
978                'Expected a selector at position {}'.format(index),
979                self.pattern,
980                index
981            )
982
983        # Some patterns require additional logic, such as default. We try to make these the
984        # last pattern, and append the appropriate flag to that selector which communicates
985        # to the matcher what additional logic is required.
986        if is_default:
987            selectors[-1].flags = ct.SEL_DEFAULT
988        if is_indeterminate:
989            selectors[-1].flags = ct.SEL_INDETERMINATE
990        if is_in_range:
991            selectors[-1].flags = ct.SEL_IN_RANGE
992        if is_out_of_range:
993            selectors[-1].flags = ct.SEL_OUT_OF_RANGE
994        if is_placeholder_shown:
995            selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
996
997        return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
998
999    def selector_iter(self, pattern):
1000        """Iterate selector tokens."""
1001
1002        # Ignore whitespace and comments at start and end of pattern
1003        m = RE_WS_BEGIN.search(pattern)
1004        index = m.end(0) if m else 0
1005        m = RE_WS_END.search(pattern)
1006        end = (m.start(0) - 1) if m else (len(pattern) - 1)
1007
1008        if self.debug:  # pragma: no cover
1009            print('## PARSING: {!r}'.format(pattern))
1010        while index <= end:
1011            m = None
1012            for v in self.css_tokens:
1013                m = v.match(pattern, index, self.flags)
1014                if m:
1015                    name = v.get_name()
1016                    if self.debug:  # pragma: no cover
1017                        print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0)))
1018                    index = m.end(0)
1019                    yield name, m
1020                    break
1021            if m is None:
1022                c = pattern[index]
1023                # If the character represents the start of one of the known selector types,
1024                # throw an exception mentioning that the known selector type is in error;
1025                # otherwise, report the invalid character.
1026                if c == '[':
1027                    msg = "Malformed attribute selector at position {}".format(index)
1028                elif c == '.':
1029                    msg = "Malformed class selector at position {}".format(index)
1030                elif c == '#':
1031                    msg = "Malformed id selector at position {}".format(index)
1032                elif c == ':':
1033                    msg = "Malformed pseudo-class selector at position {}".format(index)
1034                else:
1035                    msg = "Invalid character {!r} position {}".format(c, index)
1036                raise SelectorSyntaxError(msg, self.pattern, index)
1037        if self.debug:  # pragma: no cover
1038            print('## END PARSING')
1039
1040    def process_selectors(self, index=0, flags=0):
1041        """Process selectors."""
1042
1043        return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
1044
1045
# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern).
# A few patterns are order dependent as they use patterns that were previously compiled.
1048
# CSS pattern for `:link` and `:any-link`
# Matches `a`, `area` and `link` elements in the `html` namespace that have an `href` attribute.
CSS_LINK = CSSParser(
    'html|*:is(a, area, link)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`
# Matches checkbox/radio inputs with a `checked` attribute and options with a `selected` attribute.
CSS_CHECKED = CSSParser(
    '''
    html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
# `FLG_DEFAULT` makes `parse_selectors` put `ct.SEL_DEFAULT` on the final selector of the list,
# which is why the submit button selector has to stay last.
CSS_DEFAULT = CSSParser(
    '''
    :checked,

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|form html|*:is(button, input)[type="submit"]
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
# CSS pattern for `:indeterminate`
# `FLG_INDETERMINATE` makes `parse_selectors` put `ct.SEL_INDETERMINATE` on the final selector
# of the list (the named radio input selector).
CSS_INDETERMINATE = CSSParser(
    '''
    html|input[type="checkbox"][indeterminate],
    html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
    html|progress:not([value]),

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|input[type="radio"][name][name!='']:not([checked])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
# Covers, in order: form elements that carry `disabled` themselves, options that are children
# of a disabled `optgroup`, form controls that are children of a disabled `fieldset`, and form
# controls nested deeper inside a disabled `fieldset` whose containing child is not the
# fieldset's first `legend`.
# NOTE: `fieldset` was previously listed twice in the first `:is()` list; the duplicate entry
# was redundant and has been dropped without changing what the pattern matches.
CSS_DISABLED = CSSParser(
    '''
    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option)[disabled],
    html|optgroup[disabled] > html|option,
    html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
    html|fieldset[disabled] >
        html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled` (CSS_DISABLED must be compiled first)
# The same set of form elements as the first `:disabled` selector, minus anything `:disabled`.
CSS_ENABLED = CSSParser(
    '''
    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option):not(:disabled)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`
# Matches `input`, `textarea` and `select` elements that have a `required` attribute.
CSS_REQUIRED = CSSParser(
    'html|*:is(input, textarea, select)[required]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`
# Matches the same elements as `:required`, but only when `required` is absent.
CSS_OPTIONAL = CSSParser(
    'html|*:is(input, textarea, select):not([required])'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`
# `FLG_PLACEHOLDER_SHOWN` makes `parse_selectors` put `ct.SEL_PLACEHOLDER_SHOWN` on the final
# selector of the list only.
CSS_PLACEHOLDER_SHOWN = CSSParser(
    '''
    html|input:is(
        :not([type]),
        [type=""],
        [type=text],
        [type=search],
        [type=url],
        [type=tel],
        [type=email],
        [type=password],
        [type=number]
    )[placeholder][placeholder!='']:is(:not([value]), [value=""]),
    html|textarea[placeholder][placeholder!='']
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature
# Compiled without `FLG_HTML`, unlike the HTML specific patterns around it.
CSS_NTH_OF_S_DEFAULT = CSSParser(
    '*|*'
).process_selectors(flags=FLG_PSEUDO)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
# Matches text areas and text-like inputs that are neither `readonly` nor `:disabled`,
# plus elements whose `contenteditable` attribute is empty or `true`.
CSS_READ_WRITE = CSSParser(
    '''
    html|*:is(
        textarea,
        input:is(
            :not([type]),
            [type=""],
            [type=text],
            [type=search],
            [type=url],
            [type=tel],
            [type=email],
            [type=number],
            [type=password],
            [type=date],
            [type=datetime-local],
            [type=month],
            [type=time],
            [type=week]
        )
    ):not([readonly], :disabled),
    html|*:is([contenteditable=""], [contenteditable="true" i])
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:read-only` (CSS_READ_WRITE must be compiled first)
# Matches every element that `:read-write` does not.
CSS_READ_ONLY = CSSParser(
    '''
    html|*:not(:read-write)
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`
# The selector only narrows candidates to date/time/number/range inputs that have `min` or `max`;
# `FLG_IN_RANGE` makes `parse_selectors` put `ct.SEL_IN_RANGE` on it so the matcher knows
# additional logic is required.
CSS_IN_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
# CSS pattern for `:out-of-range`
# Same selector text as `:in-range`; only the flag differs (`FLG_OUT_OF_RANGE`, which makes
# `parse_selectors` put `ct.SEL_OUT_OF_RANGE` on the selector).
CSS_OUT_OF_RANGE = CSSParser(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    '''
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)
1195