1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3"""
4    Tokenizer, parser and parsed objects for CSS selectors.
5
6    :copyright: (c) 2007-2012 Ian Bicking and contributors.
7                See AUTHORS for more details.
8    :license: BSD, see LICENSE for more details.
9
10"""
11
12import sys
13import re
14import operator
15import string
16
17from css_selectors.errors import SelectorSyntaxError, ExpressionError
18from polyglot.builtins import unicode_type, codepoint_to_chr
19
20
# Translation table for text strings: maps the code point of each ASCII
# upper-case letter to the code point of its lower-case counterpart.
utab = {ord(upper): ord(upper) + 32 for upper in string.ascii_uppercase}

if sys.version_info.major >= 3:

    def ascii_lower(x):
        """Lower-case *x*, but only in the ASCII range."""
        return x.translate(utab)

    # No ``u`` prefix to strip on Python 3, the builtin can be used directly.
    urepr = repr

else:
    # Byte-string counterpart of ``utab``.
    tab = string.maketrans(string.ascii_uppercase, string.ascii_lowercase)

    def ascii_lower(string):
        """Lower-case, but only in the ASCII range."""
        table = utab if isinstance(string, unicode_type) else tab
        return string.translate(table)

    def urepr(x):
        """``repr()`` without the leading ``u`` of unicode literals."""
        if isinstance(x, list):
            return '[%s]' % ', '.join(map(urepr, x))
        text = repr(x)
        if text.startswith(("u'", 'u"')):
            return text[1:]
        return text
45
46
47# Parsed objects
48
class Selector:

    """
    Represents a parsed selector.

    ``parsed_tree`` holds the root object of the parsed selector and
    ``pseudo_element`` the pseudo-element attached to it, if any.
    """

    def __init__(self, tree, pseudo_element=None):
        self.parsed_tree = tree
        if pseudo_element is not None and not isinstance(
                pseudo_element, FunctionalPseudoElement):
            # Plain identifiers are stored lower-cased (ASCII range only).
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self):
        pseudo = self.pseudo_element
        if not pseudo:
            suffix = ''
        elif isinstance(pseudo, FunctionalPseudoElement):
            # Functional pseudo-elements already render their own ``::``.
            # This branch used to be overwritten by the plain-identifier
            # formatting below because the two tests were not chained.
            suffix = repr(pseudo)
        else:
            suffix = '::%s' % pseudo
        return '%s[%r%s]' % (
            self.__class__.__name__, self.parsed_tree, suffix)

    def specificity(self):
        """Return the specificity_ of this selector as a tuple of 3 integers.

        The pseudo-element, when present, adds one to the last component.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        if self.pseudo_element:
            c += 1
        return a, b, c
102
103
class Class:

    """
    A class condition applied to a selector: ``selector.class_name``
    """
    def __init__(self, selector, class_name):
        self.class_name = class_name
        self.selector = selector

    def __repr__(self):
        own_name = self.__class__.__name__
        return '%s[%r.%s]' % (own_name, self.selector, self.class_name)

    def specificity(self):
        """Specificity of the wrapped selector with the middle count + 1."""
        ids, others, elements = self.selector.specificity()
        return ids, others + 1, elements
121
122
class FunctionalPseudoElement:

    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.
        It is stored lower-cased (ASCII range only).

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between versions.
        Use at your own risks.

    """
    def __init__(self, name, arguments):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self):
        return '%s[::%s(%s)]' % (
            self.__class__.__name__, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    def specificity(self):
        """Return the specificity contribution of this pseudo-element.

        Instances are created without a ``selector`` attribute, so the
        previous implementation (which read ``self.selector``) always
        raised ``AttributeError``.  A pseudo-element counts like one
        element, i.e. ``(0, 0, 1)``, which is the same amount
        ``Selector.specificity`` adds for any pseudo-element.
        """
        selector = getattr(self, 'selector', None)
        if selector is None:
            return 0, 0, 1
        # A ``selector`` attribute was attached from outside: keep the
        # historical formula for that case.
        a, b, c = selector.specificity()
        return a, b + 1, c
157
158
class Function:

    """
    Represents selector:name(expr)

    ``name`` is stored lower-cased (ASCII range only) and ``arguments``
    is the list of argument tokens.
    """

    # Flag flipped by parse_arguments().  It needs a default here:
    # without one, parse_arguments() raised AttributeError on its
    # first call because nothing ever created the attribute.
    arguments_parsed = False

    def __init__(self, selector, name, arguments):
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments
        # Cache for the parsed_arguments property, filled on first access.
        self._parsed_arguments = None

    def __repr__(self):
        return '%s[%r:%s(%s)]' % (
            self.__class__.__name__, self.selector, self.name,
            urepr([token.value for token in self.arguments]))

    def argument_types(self):
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    @property
    def parsed_arguments(self):
        """The arguments parsed by ``parse_series``, computed once.

        :raises ExpressionError: if ``parse_series`` rejects the
            arguments with a ``ValueError``.
        """
        if self._parsed_arguments is None:
            try:
                self._parsed_arguments = parse_series(self.arguments)
            except ValueError:
                # Wrap in a 1-tuple so the arguments are always formatted
                # as one value, whatever sequence type they are.
                raise ExpressionError(
                    "Invalid series: '%r'" % (self.arguments,))
        return self._parsed_arguments

    def parse_arguments(self):
        """Mark the arguments as parsed (idempotent).

        This only flips ``arguments_parsed``; the parsed series itself is
        produced lazily by the ``parsed_arguments`` property.
        """
        if not self.arguments_parsed:
            self.arguments_parsed = True

    def specificity(self):
        """Specificity of the wrapped selector with the middle count + 1."""
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c
195
196
class Pseudo:

    """
    A pseudo-class applied to a selector: ``selector:ident``
    """
    def __init__(self, selector, ident):
        # The identifier is kept lower-cased (ASCII range only).
        self.ident = ascii_lower(ident)
        self.selector = selector

    def __repr__(self):
        own_name = self.__class__.__name__
        return '%s[%r:%s]' % (own_name, self.selector, self.ident)

    def specificity(self):
        """Specificity of the wrapped selector with the middle count + 1."""
        ids, others, elements = self.selector.specificity()
        return ids, others + 1, elements
214
215
class Negation:

    """
    A negated sub-selector applied to a selector:
    ``selector:not(subselector)``
    """
    def __init__(self, selector, subselector):
        self.subselector = subselector
        self.selector = selector

    def __repr__(self):
        own_name = self.__class__.__name__
        return '%s[%r:not(%r)]' % (own_name, self.selector, self.subselector)

    def specificity(self):
        """Component-wise sum of both selectors' specificities."""
        outer_a, outer_b, outer_c = self.selector.specificity()
        inner_a, inner_b, inner_c = self.subselector.specificity()
        return outer_a + inner_a, outer_b + inner_b, outer_c + inner_c
233
234
class Attrib:

    """
    An attribute condition applied to a selector:
    ``selector[namespace|attrib operator value]``
    """
    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace, self.attrib = namespace, attrib
        self.operator, self.value = operator, value

    def __repr__(self):
        own_name = self.__class__.__name__
        qualified = self.attrib
        if self.namespace:
            qualified = '%s|%s' % (self.namespace, qualified)
        # A bare presence test has no operator/value part to show.
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (own_name, self.selector, qualified)
        return '%s[%r[%s %s %s]]' % (
            own_name, self.selector, qualified,
            self.operator, urepr(self.value))

    def specificity(self):
        """Specificity of the wrapped selector with the middle count + 1."""
        ids, others, elements = self.selector.specificity()
        return ids, others + 1, elements
264
265
class Element:

    """
    An element selector, optionally namespaced: ``namespace|element``

    An ``element`` of `None` stands for the universal selector '*'

    """
    def __init__(self, namespace=None, element=None):
        self.element = element
        self.namespace = namespace

    def __repr__(self):
        shown = self.element if self.element else '*'
        if self.namespace:
            shown = '%s|%s' % (self.namespace, shown)
        return '%s[%s]' % (self.__class__.__name__, shown)

    def specificity(self):
        """``(0, 0, 1)`` for a named element, ``(0, 0, 0)`` otherwise."""
        return (0, 0, 1) if self.element else (0, 0, 0)
289
290
class Hash:

    """
    An ID condition applied to a selector: ``selector#id``
    """
    def __init__(self, selector, id):
        self.id = id
        self.selector = selector

    def __repr__(self):
        own_name = self.__class__.__name__
        return '%s[%r#%s]' % (own_name, self.selector, self.id)

    def specificity(self):
        """Specificity of the wrapped selector with the first count + 1."""
        ids, others, elements = self.selector.specificity()
        return ids + 1, others, elements
308
309
class CombinedSelector:

    """
    Two selectors joined by a combinator: ``selector combinator subselector``
    """

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        # A single space combinator would be invisible in the output, so
        # it is spelled out instead.
        shown = '<followed>' if self.combinator == ' ' else self.combinator
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, shown, self.subselector)

    def specificity(self):
        """Component-wise sum of both selectors' specificities."""
        left_a, left_b, left_c = self.selector.specificity()
        right_a, right_b, right_c = self.subselector.specificity()
        return left_a + right_a, left_b + right_b, left_c + right_c
330
331
332# Parser
333
# Fast-path patterns used by parse() before falling back to the tokenizer.
# Each one must match the whole input, allowing surrounding whitespace.

# foo  -- a bare element name made of ASCII letters (group 1)
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')

# foo#bar or #bar  -- optional element name (group 1), id (group 2)
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')

# foo.bar or .bar  -- optional element name (group 1), class name (group 2);
# the class name has to start with an ASCII letter
_class_re = re.compile(
    r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')
343
344
def parse(css):
    """Parse a CSS *group of selectors*.

    Inputs consisting of a single ``element``, ``element#id`` / ``#id`` or
    ``element.class`` / ``.class`` are recognised with a regular expression
    and never reach the tokenizer.

    :param css:
        A *group of selectors* as a Unicode string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path: bare element name
    found = _el_re.match(css)
    if found:
        return [Selector(Element(element=found.group(1)))]

    # Fast path: optional element name followed by one #id or one .class
    for pattern, node_type in ((_id_re, Hash), (_class_re, Class)):
        found = pattern.match(css)
        if found is not None:
            tag = found.group(1) or None
            return [Selector(node_type(Element(element=tag), found.group(2)))]

    # General case: tokenize and run the full parser
    stream = TokenStream(tokenize(css), css)
    return list(parse_selector_group(stream))
373#    except SelectorSyntaxError:
374#        e = sys.exc_info()[1]
375#        message = "%s at %s -> %r" % (
376#            e, stream.used, stream.peek())
377#        e.msg = message
378#        e.args = tuple([message])
379#        raise
380
381
def parse_selector_group(stream):
    """Yield a :class:`Selector` for every comma-separated selector read
    from *stream*, stopping at the first one not followed by a comma."""
    stream.skip_whitespace()
    while True:
        tree, pseudo_element = parse_selector(stream)
        yield Selector(tree, pseudo_element)
        if stream.peek() != ('DELIM', ','):
            return
        # Consume the comma and any whitespace after it.
        stream.next()
        stream.skip_whitespace()
391
392
def parse_selector(stream):
    """Parse simple selectors chained by combinators.

    Reading stops at the end of input or at a comma.

    :raises: :class:`SelectorSyntaxError` when something follows a
        pseudo-element.
    :returns: ``(tree, pseudo_element)``
    """
    terminators = (('EOF', None), ('DELIM', ','))
    tree, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        upcoming = stream.peek()
        if upcoming in terminators:
            return tree, pseudo_element
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if upcoming.is_delim('+', '>', '~'):
            # An explicit combinator
            combinator = stream.next().value
            stream.skip_whitespace()
        else:
            # Nothing but whitespace separated the two simple selectors.
            combinator = ' '
        right_side, pseudo_element = parse_simple_selector(stream)
        tree = CombinedSelector(tree, combinator, right_side)
415
416
# Pseudo-elements that are also accepted with a single leading ':'
# (see the special case in parse_simple_selector).
special_pseudo_elements = (
    'first-line', 'first-letter', 'before', 'after')


def parse_simple_selector(stream, inside_negation=False):
    """Parse one sequence of simple selectors.

    The sequence is an optional ``element``, ``*``, ``namespace|element``
    or ``namespace|*`` followed by any number of ``#id``, ``.class``,
    ``[attrib]``, ``:pseudo``, ``:function(...)``, ``:not(...)`` parts and
    at most one trailing pseudo-element.

    :param stream: token stream to read from; leading whitespace is skipped.
    :param inside_negation: true while parsing the argument of ``:not()``.
        A ``)`` then ends the sequence and a nested ``:not()`` is rejected.
    :raises: :class:`SelectorSyntaxError` on unexpected tokens, on anything
        following a pseudo-element, on a pseudo-element inside ``:not()``,
        and when no token at all was consumed.
    :returns: ``(tree, pseudo_element)`` where ``pseudo_element`` is
        ``None``, a string or a :class:`FunctionalPseudoElement`.
    """
    stream.skip_whitespace()
    # Remembered to detect, at the end, that nothing was consumed.
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == 'IDENT' or peek == ('DELIM', '*'):
        # Provisionally treat the first name as a namespace; it is
        # re-interpreted as the element name when no '|' follows.
        if peek.type == 'IDENT':
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ('DELIM', '|'):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result = Element(namespace, element)
    pseudo_element = None
    while 1:
        peek = stream.peek()
        # Whitespace, end of input, a comma or a combinator end the
        # sequence, as does ')' while inside :not().
        if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
                inside_negation and peek == ('DELIM', ')')):
            break
        # Reaching this point with a pseudo-element already parsed means
        # more tokens follow it.
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.type == 'HASH':
            result = Hash(result, stream.next().value)
        elif peek == ('DELIM', '.'):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ('DELIM', '['):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ('DELIM', ':'):
            stream.next()
            if stream.peek() == ('DELIM', ':'):
                # '::name' or '::name(arguments)'
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ('DELIM', '('):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream))
                continue
            ident = stream.next_ident()
            if ident.lower() in special_pseudo_elements:
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = unicode_type(ident)
                continue
            if stream.peek() != ('DELIM', '('):
                # Plain pseudo-class without arguments
                result = Pseudo(result, ident)
                continue
            # Functional pseudo-class: consume '(' and following whitespace.
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == 'not':
                if inside_negation:
                    raise SelectorSyntaxError('Got nested :not()')
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True)
                # The token right after the argument; expected to be ')'.
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        'Got pseudo-element ::%s inside :not() at %s'
                        % (argument_pseudo_element, next.pos))
                if next != ('DELIM', ')'):
                    raise SelectorSyntaxError("Expected ')', got %s" % (next,))
                result = Negation(result, argument)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(
                "Expected selector, got %s" % (peek,))
    # Nothing was consumed since the start: there was no selector here.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(
            "Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element
501
502
def parse_arguments(stream):
    """Collect argument tokens from *stream* up to the closing ``)``.

    Whitespace between arguments is skipped.  Accepted arguments are
    IDENT, STRING and NUMBER tokens plus the ``+`` and ``-`` delimiters.

    :raises: :class:`SelectorSyntaxError` on any other token.
    :returns: the list of argument tokens, without the closing ``)``.
    """
    value_types = ('IDENT', 'STRING', 'NUMBER')
    signs = [('DELIM', '+'), ('DELIM', '-')]
    collected = []
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token == ('DELIM', ')'):
            return collected
        if token.type in value_types or token in signs:
            collected.append(token)
            continue
        raise SelectorSyntaxError(
            "Expected an argument, got %s" % (token,))
516
517
def parse_attrib(selector, stream):
    """Parse the inside of an attribute selector, the opening ``[`` having
    already been consumed, up to and including the closing ``]``.

    :raises: :class:`SelectorSyntaxError` on malformed input.
    :returns: an :class:`Attrib` wrapping *selector*.
    """
    pipe = ('DELIM', '|')
    equals = ('DELIM', '=')
    closing = ('DELIM', ']')

    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    followed_by_pipe = stream.peek() == pipe
    # A '*' only makes sense as a namespace, so a '|' has to follow it.
    if attrib is None and not followed_by_pipe:
        raise SelectorSyntaxError(
            "Expected '|', got %s" % (stream.peek(),))

    namespace = op = None
    if followed_by_pipe:
        stream.next()
        if stream.peek() == equals:
            # The '|' was the first half of the '|=' operator.
            stream.next()
            op = '|='
        else:
            # The '|' separated a namespace from the attribute name.
            namespace, attrib = attrib, stream.next_ident()

    if op is None:
        stream.skip_whitespace()
        token = stream.next()
        if token == closing:
            return Attrib(selector, namespace, attrib, 'exists', None)
        if token == equals:
            op = '='
        elif token.is_delim('^', '$', '*', '~', '|', '!') and (
                stream.peek() == equals):
            op = token.value + '='
            stream.next()
        else:
            raise SelectorSyntaxError(
                "Operator expected, got %s" % (token,))

    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ('IDENT', 'STRING'):
        raise SelectorSyntaxError(
            "Expected string or ident, got %s" % (value,))

    stream.skip_whitespace()
    token = stream.next()
    if token != closing:
        raise SelectorSyntaxError(
            "Expected ']', got %s" % (token,))
    return Attrib(selector, namespace, attrib, op, value.value)
561
562
def parse_series(tokens):
    """
    Parses the arguments for :nth-child() and friends.

    The token values are concatenated and read as ``an+b``.  The keywords
    ``odd`` and ``even`` and the letter ``n`` are matched ASCII
    case-insensitively, so ``ODD`` and ``2N+1`` are accepted too.

    :param tokens: A sequence of tokens, each with ``type`` and ``value``
        attributes.  It is iterated twice.
    :raises: :class:`ValueError` if a STRING token is present or the
        concatenated text is not a valid series.
    :returns: ``(a, b)``

    """
    if any(token.type == 'STRING' for token in tokens):
        raise ValueError('String tokens not allowed in series.')
    # Only 'odd', 'even' and 'n' can contain letters in a valid series, so
    # lower-casing the whole text is enough to make them case-insensitive.
    s = ''.join(token.value for token in tokens).strip().lower()
    keywords = {'odd': (2, 1), 'even': (2, 0), 'n': (1, 0)}
    if s in keywords:
        return keywords[s]
    if 'n' not in s:
        # Just b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a == '-' or a == '+':
        # A bare sign stands for -1 / +1
        a = int(a + '1')
    else:
        a = int(a)
    b = int(b) if b else 0
    return (a, b)
596
597
598# Token objects
599
class Token(tuple):

    """
    A ``(type, value)`` pair that also remembers its position.

    Only the type and the value take part in tuple comparison; ``pos`` is
    stored as a plain instance attribute.
    """

    def __new__(cls, type_, value, pos):
        token = super(Token, cls).__new__(cls, (type_, value))
        token.pos = pos
        return token

    def __repr__(self):
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    @property
    def type(self):
        return self[0]

    @property
    def value(self):
        return self[1]

    def is_delim(self, *values):
        """True if this is a DELIM token whose value is one of *values*."""
        if self.type != 'DELIM':
            return False
        return self.value in values
615
616
class EOFToken(Token):

    """
    The end-of-input token: type ``'EOF'``, value ``None``.
    """

    def __new__(cls, pos):
        return super(EOFToken, cls).__new__(cls, 'EOF', None, pos)

    def __repr__(self):
        kind, position = self.type, self.pos
        return '<%s at %i>' % (kind, position)
624
625
626# Tokenizer
627
628
class TokenMacros:
    """Regular-expression fragments shared by the tokenizer patterns.

    ``_compile`` interpolates these attributes into a pattern with
    ``pattern % vars(TokenMacros)``, so a pattern refers to a fragment as
    ``%(name)s``.  The hex-digit classes only list lower-case letters; the
    patterns built from them are compiled case-insensitively.
    """
    # Backslash, 1 to 6 hex digits (captured), then optionally a CRLF pair
    # or a single whitespace character.
    unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
    # A unicode escape, or a backslash followed by one character that is
    # neither a newline/form feed nor a hex digit.
    escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
    # Inside strings a backslash may additionally precede a line break.
    string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
    # Any character outside the range \0-\177 (octal).
    nonascii = r'[^\0-\177]'
    # One character of a name, after the first one.
    nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
    # The first character of a name (digits and '-' excluded).
    nmstart = '[_a-z]|%s|%s' % (escape, nonascii)
636
637
def _compile(pattern):
    """Expand the ``%(name)s`` references to :class:`TokenMacros` fragments
    in *pattern*, compile it case-insensitively and return the bound
    ``match`` method of the resulting regular expression."""
    expanded = pattern % vars(TokenMacros)
    regex = re.compile(expanded, re.IGNORECASE)
    return regex.match
640
641
# Matchers used by tokenize().  Each is the bound ``match`` method of a
# case-insensitive compiled pattern (see _compile).
# A run of whitespace characters.
_match_whitespace = _compile(r'[ \t\r\n\f]+')
# An optionally signed integer or decimal number.
_match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
# '#' followed by at least one name character.
_match_hash = _compile('#(?:%(nmchar)s)+')
# An identifier: optional '-', a name-start character, then name characters.
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
# Body of a string literal, keyed by its quote character.  Both patterns
# can match the empty string, so a match is always found.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

# Bound ``sub`` methods used to resolve escapes in token values.
# Backslash followed by any one character (other than a newline).
_sub_simple_escape = re.compile(r'\\(.)').sub
# Backslash followed by hex digits, see TokenMacros.unicode_escape.
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
# Backslash followed by a line break.
_sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub
654
# Replacement callback returning group 1 of a match.
# Same as r'\1', but faster on CPython
try:
    # Python 2.6+
    _replace_simple = operator.methodcaller('group', 1)
except AttributeError:
    def _replace_simple(match):
        return match.group(1)
662
663
def _replace_unicode(match):
    """Replacement callback for unicode escapes: return the character whose
    hexadecimal code point is group 1 of *match*, or U+FFFD when that code
    point is above ``sys.maxunicode``."""
    codepoint = int(match.group(1), 16)
    in_range = codepoint <= sys.maxunicode
    return codepoint_to_chr(codepoint if in_range else 0xFFFD)
669
670
def unescape_ident(value):
    """Resolve the backslash escapes in *value*: unicode (hexadecimal)
    escapes first, then single-character escapes."""
    without_unicode_escapes = _sub_unicode_escape(_replace_unicode, value)
    return _sub_simple_escape(_replace_simple, without_unicode_escapes)
675
676
def tokenize(s):
    """Split the string *s* into tokens.

    Yields :class:`Token` objects of type ``S`` (a whitespace run, always
    with value ``' '``), ``IDENT``, ``HASH``, ``STRING``, ``NUMBER`` and
    ``DELIM`` (any other single character), each carrying the offset at
    which it starts, followed by one :class:`EOFToken`.  ``/* ... */``
    comments produce no token.

    :raises: :class:`SelectorSyntaxError` for an unclosed or invalid string.
    """
    end = len(s)
    pos = 0
    while pos < end:
        found = _match_whitespace(s, pos=pos)
        if found:
            yield Token('S', ' ', pos)
            pos = found.end()
            continue

        found = _match_ident(s, pos=pos)
        if found:
            yield Token('IDENT', unescape_ident(found.group()), pos)
            pos = found.end()
            continue

        found = _match_hash(s, pos=pos)
        if found:
            # Drop the leading '#' from the value.
            yield Token('HASH', unescape_ident(found.group()[1:]), pos)
            pos = found.end()
            continue

        quote = s[pos]
        match_string_body = _match_string_by_quote.get(quote)
        if match_string_body is not None:
            found = match_string_body(s, pos=pos + 1)
            assert found, 'Should have found at least an empty match'
            closing = found.end()
            if closing == end:
                raise SelectorSyntaxError('Unclosed string at %s' % pos)
            if s[closing] != quote:
                raise SelectorSyntaxError('Invalid string at %s' % pos)
            # Escaped line breaks are removed before other escapes are
            # resolved.
            body = _sub_newline_escape('', found.group())
            yield Token('STRING', unescape_ident(body), pos)
            pos = closing + 1
            continue

        found = _match_number(s, pos=pos)
        if found:
            yield Token('NUMBER', found.group(), pos)
            pos = found.end()
            continue

        if s.startswith('/*', pos):
            # Skip the comment; an unterminated one runs to the end.
            comment_end = s.find('*/', pos + 2)
            pos = end if comment_end == -1 else comment_end + 2
            continue

        yield Token('DELIM', s[pos], pos)
        pos += 1

    assert pos == end
    yield EOFToken(pos)
740
741
class TokenStream:

    """
    Reader over an iterable of tokens with one token of lookahead.

    ``used`` lists every token consumed through :meth:`next`, in order;
    peeking does not add to it.  ``source`` is kept for callers and is
    not used by the stream itself.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        # Lookahead token; only meaningful while _peeking is true.
        self.peeked = None
        self._peeking = False
        try:
            # Python 2
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        """Consume and return the next token, recording it in ``used``."""
        if self._peeking:
            self._peeking = False
            token = self.peeked
        else:
            token = self.next_token()
        self.used.append(token)
        return token

    def peek(self):
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        return self.peeked

    def next_ident(self):
        """Consume the next token and return its value.

        :raises: :class:`SelectorSyntaxError` if it is not an IDENT.
        """
        token = self.next()
        if token.type != 'IDENT':
            raise SelectorSyntaxError('Expected ident, got %s' % (token,))
        return token.value

    def next_ident_or_star(self):
        """Consume the next token; return an IDENT's value, or ``None``
        for ``*``.

        :raises: :class:`SelectorSyntaxError` for any other token.
        """
        token = self.next()
        if token.type == 'IDENT':
            return token.value
        if token == ('DELIM', '*'):
            return None
        raise SelectorSyntaxError(
            "Expected ident or '*', got %s" % (token,))

    def skip_whitespace(self):
        """Consume all whitespace tokens at the current position.

        The tokenizer emits one ``S`` token per whitespace run, but a
        comment between two runs (``a /* c */ > b``) leaves two ``S``
        tokens in a row.  Skipping only the first one made such input
        fail to parse.
        """
        while self.peek().type == 'S':
            self.next()
792