1"""CSS matcher."""
2from __future__ import unicode_literals
3from datetime import datetime
4from . import util
5import re
6from .import css_types as ct
7import unicodedata
8
# Empty tag pattern (whitespace okay)
# Matches a single character that is not a space, tab, carriage return, line feed, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches each run of characters that are not a space, tab, carriage return, line feed, or form feed.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
# These look backwards from the element being tested (ancestors and previous siblings).
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# Same combinators prefixed with ':' so they can be told apart from the backward looking ones.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs for XHTML and for the reserved `xml` prefix.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined bit masks built from the individual selector flags in `css_types`.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercase `dir` attribute value to a direction flag; `auto` maps to no flag (0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual values handled by `Inputs.parse_value`.
# Years require at least four digits; every other field is exactly two digits.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Calendar limits consulted by `Inputs.validate_day`.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
54
55
56class _FakeParent(object):
57    """
58    Fake parent class.
59
60    When we have a fragment with no `BeautifulSoup` document object,
61    we can't evaluate `nth` selectors properly.  Create a temporary
62    fake parent so we can traverse the root element as a child.
63    """
64
65    def __init__(self, element):
66        """Initialize."""
67
68        self.contents = [element]
69
70    def __len__(self):
71        """Length."""
72
73        return len(self.contents)
74
75
76class _DocumentNav(object):
77    """Navigate a Beautiful Soup document."""
78
79    @classmethod
80    def assert_valid_input(cls, tag):
81        """Check if valid input tag or document."""
82
83        # Fail on unexpected types.
84        if not cls.is_tag(tag):
85            raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag)))
86
87    @staticmethod
88    def is_doc(obj):
89        """Is `BeautifulSoup` object."""
90
91        import bs4
92        return isinstance(obj, bs4.BeautifulSoup)
93
94    @staticmethod
95    def is_tag(obj):
96        """Is tag."""
97
98        import bs4
99        return isinstance(obj, bs4.Tag)
100
101    @staticmethod
102    def is_comment(obj):
103        """Is comment."""
104
105        import bs4
106        return isinstance(obj, bs4.Comment)
107
108    @staticmethod
109    def is_declaration(obj):  # pragma: no cover
110        """Is declaration."""
111
112        import bs4
113        return isinstance(obj, bs4.Declaration)
114
115    @staticmethod
116    def is_cdata(obj):
117        """Is CDATA."""
118
119        import bs4
120        return isinstance(obj, bs4.CData)
121
122    @staticmethod
123    def is_processing_instruction(obj):  # pragma: no cover
124        """Is processing instruction."""
125
126        import bs4
127        return isinstance(obj, bs4.ProcessingInstruction)
128
129    @staticmethod
130    def is_navigable_string(obj):
131        """Is navigable string."""
132
133        import bs4
134        return isinstance(obj, bs4.NavigableString)
135
136    @staticmethod
137    def is_special_string(obj):
138        """Is special string."""
139
140        import bs4
141        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
142
143    @classmethod
144    def is_content_string(cls, obj):
145        """Check if node is content string."""
146
147        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
148
149    @staticmethod
150    def create_fake_parent(el):
151        """Create fake parent for a given element."""
152
153        return _FakeParent(el)
154
155    @staticmethod
156    def is_xml_tree(el):
157        """Check if element (or document) is from a XML tree."""
158
159        return el._is_xml
160
161    def is_iframe(self, el):
162        """Check if element is an `iframe`."""
163
164        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)
165
166    def is_root(self, el):
167        """
168        Return whether element is a root element.
169
170        We check that the element is the root of the tree (which we have already pre-calculated),
171        and we check if it is the root element under an `iframe`.
172        """
173
174        root = self.root and self.root is el
175        if not root:
176            parent = self.get_parent(el)
177            root = parent is not None and self.is_html and self.is_iframe(parent)
178        return root
179
180    def get_contents(self, el, no_iframe=False):
181        """Get contents or contents in reverse."""
182        if not no_iframe or not self.is_iframe(el):
183            for content in el.contents:
184                yield content
185
186    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
187        """Get children."""
188
189        if not no_iframe or not self.is_iframe(el):
190            last = len(el.contents) - 1
191            if start is None:
192                index = last if reverse else 0
193            else:
194                index = start
195            end = -1 if reverse else last + 1
196            incr = -1 if reverse else 1
197
198            if 0 <= index <= last:
199                while index != end:
200                    node = el.contents[index]
201                    index += incr
202                    if not tags or self.is_tag(node):
203                        yield node
204
205    def get_descendants(self, el, tags=True, no_iframe=False):
206        """Get descendants."""
207
208        if not no_iframe or not self.is_iframe(el):
209            next_good = None
210            for child in el.descendants:
211
212                if next_good is not None:
213                    if child is not next_good:
214                        continue
215                    next_good = None
216
217                is_tag = self.is_tag(child)
218
219                if no_iframe and is_tag and self.is_iframe(child):
220                    if child.next_sibling is not None:
221                        next_good = child.next_sibling
222                    else:
223                        last_child = child
224                        while self.is_tag(last_child) and last_child.contents:
225                            last_child = last_child.contents[-1]
226                        next_good = last_child.next_element
227                    yield child
228                    if next_good is None:
229                        break
230                    # Coverage isn't seeing this even though it's executed
231                    continue  # pragma: no cover
232
233                if not tags or is_tag:
234                    yield child
235
236    def get_parent(self, el, no_iframe=False):
237        """Get parent."""
238
239        parent = el.parent
240        if no_iframe and parent is not None and self.is_iframe(parent):
241            parent = None
242        return parent
243
244    @staticmethod
245    def get_tag_name(el):
246        """Get tag."""
247
248        return el.name
249
250    @staticmethod
251    def get_prefix_name(el):
252        """Get prefix."""
253
254        return el.prefix
255
256    @staticmethod
257    def get_uri(el):
258        """Get namespace `URI`."""
259
260        return el.namespace
261
262    @classmethod
263    def get_next(cls, el, tags=True):
264        """Get next sibling tag."""
265
266        sibling = el.next_sibling
267        while tags and not cls.is_tag(sibling) and sibling is not None:
268            sibling = sibling.next_sibling
269        return sibling
270
271    @classmethod
272    def get_previous(cls, el, tags=True):
273        """Get previous sibling tag."""
274
275        sibling = el.previous_sibling
276        while tags and not cls.is_tag(sibling) and sibling is not None:
277            sibling = sibling.previous_sibling
278        return sibling
279
280    @staticmethod
281    def has_html_ns(el):
282        """
283        Check if element has an HTML namespace.
284
285        This is a bit different than whether a element is treated as having an HTML namespace,
286        like we do in the case of `is_html_tag`.
287        """
288
289        ns = getattr(el, 'namespace') if el else None
290        return ns and ns == NS_XHTML
291
292    @staticmethod
293    def split_namespace(el, attr_name):
294        """Return namespace and attribute name without the prefix."""
295
296        return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
297
298    @staticmethod
299    def get_attribute_by_name(el, name, default=None):
300        """Get attribute by name."""
301
302        value = default
303        if el._is_xml:
304            try:
305                value = el.attrs[name]
306            except KeyError:
307                pass
308        else:
309            for k, v in el.attrs.items():
310                if util.lower(k) == name:
311                    value = v
312                    break
313        return value
314
315    @staticmethod
316    def iter_attributes(el):
317        """Iterate attributes."""
318
319        for k, v in el.attrs.items():
320            yield k, v
321
322    @classmethod
323    def get_classes(cls, el):
324        """Get classes."""
325
326        classes = cls.get_attribute_by_name(el, 'class', [])
327        if isinstance(classes, util.ustr):
328            classes = RE_NOT_WS.findall(classes)
329        return classes
330
331    def get_text(self, el, no_iframe=False):
332        """Get text."""
333
334        return ''.join(
335            [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
336        )
337
338
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day: it must exist in the given month of the given year (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week: it must be an ISO week number that exists in `year`.

        A year has either 52 or 53 ISO weeks.  December 28th always falls in the
        last ISO week of its own year, so its week number is the week count.
        (December 31st can not be used for this: it may belong to week 1 of the
        following year, in which case the year has 52 weeks, not 53.)
        """

        # The Gregorian calendar repeats exactly every 400 years, so fold the year
        # into the range `datetime` supports.  The value patterns accept years with
        # more than four digits, and years below 1000 must work as well.
        cycle_year = (year - 1) % 400 + 1
        max_week = datetime(cycle_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month (1 through 12)."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year (1 or greater)."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour (0 through 23)."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes (0 through 59)."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` selects the format (`date`, `month`, `week`, `time`,
        `datetime-local`, `number`, or `range`).  Returns a tuple of integers for
        the date and time types, a float for `number` and `range`, and `None` when
        the type is not handled or the value is malformed or out of range.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
438
439
440class _Match(object):
441    """Perform CSS matching."""
442
443    def __init__(self, selectors, scope, namespaces, flags):
444        """Initialize."""
445
446        self.assert_valid_input(scope)
447        self.tag = scope
448        self.cached_meta_lang = []
449        self.cached_default_forms = []
450        self.cached_indeterminate_forms = []
451        self.selectors = selectors
452        self.namespaces = {} if namespaces is None else namespaces
453        self.flags = flags
454        self.iframe_restrict = False
455
456        # Find the root element for the whole tree
457        doc = scope
458        parent = self.get_parent(doc)
459        while parent:
460            doc = parent
461            parent = self.get_parent(doc)
462        root = None
463        if not self.is_doc(doc):
464            root = doc
465        else:
466            for child in self.get_children(doc):
467                root = child
468                break
469
470        self.root = root
471        self.scope = scope if scope is not doc else root
472        self.has_html_namespace = self.has_html_ns(root)
473
474        # A document can be both XML and HTML (XHTML)
475        self.is_xml = self.is_xml_tree(doc)
476        self.is_html = not self.is_xml or self.has_html_namespace
477
478    def supports_namespaces(self):
479        """Check if namespaces are supported in the HTML type."""
480
481        return self.is_xml or self.has_html_namespace
482
483    def get_tag_ns(self, el):
484        """Get tag namespace."""
485
486        if self.supports_namespaces():
487            namespace = ''
488            ns = self.get_uri(el)
489            if ns:
490                namespace = ns
491        else:
492            namespace = NS_XHTML
493        return namespace
494
495    def is_html_tag(self, el):
496        """Check if tag is in HTML namespace."""
497
498        return self.get_tag_ns(el) == NS_XHTML
499
500    def get_tag(self, el):
501        """Get tag."""
502
503        name = self.get_tag_name(el)
504        return util.lower(name) if name is not None and not self.is_xml else name
505
506    def get_prefix(self, el):
507        """Get prefix."""
508
509        prefix = self.get_prefix_name(el)
510        return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
511
512    def find_bidi(self, el):
513        """Get directionality from element text."""
514
515        for node in self.get_children(el, tags=False):
516
517            # Analyze child text nodes
518            if self.is_tag(node):
519
520                # Avoid analyzing certain elements specified in the specification.
521                direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
522                if (
523                    self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
524                    not self.is_html_tag(node) or
525                    direction is not None
526                ):
527                    continue  # pragma: no cover
528
529                # Check directionality of this node's text
530                value = self.find_bidi(node)
531                if value is not None:
532                    return value
533
534                # Direction could not be determined
535                continue  # pragma: no cover
536
537            # Skip `doctype` comments, etc.
538            if self.is_special_string(node):
539                continue
540
541            # Analyze text nodes for directionality.
542            for c in node:
543                bidi = unicodedata.bidirectional(c)
544                if bidi in ('AL', 'R', 'L'):
545                    return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
546        return None
547
548    def match_attribute_name(self, el, attr, prefix):
549        """Match attribute name and return value if it exists."""
550
551        value = None
552        if self.supports_namespaces():
553            value = None
554            # If we have not defined namespaces, we can't very well find them, so don't bother trying.
555            if prefix:
556                ns = self.namespaces.get(prefix)
557                if ns is None and prefix != '*':
558                    return None
559            else:
560                ns = None
561
562            for k, v in self.iter_attributes(el):
563
564                # Get attribute parts
565                namespace, name = self.split_namespace(el, k)
566
567                # Can't match a prefix attribute as we haven't specified one to match
568                # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
569                if ns is None:
570                    if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
571                        value = v
572                        break
573                    # Coverage is not finding this even though it is executed.
574                    # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
575                    # Ignore the false positive message.
576                    continue  # pragma: no cover
577
578                # We can't match our desired prefix attribute as the attribute doesn't have a prefix
579                if namespace is None or ns != namespace and prefix != '*':
580                    continue
581
582                # The attribute doesn't match.
583                if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
584                    continue
585
586                value = v
587                break
588        else:
589            for k, v in self.iter_attributes(el):
590                if util.lower(attr) != util.lower(k):
591                    continue
592                value = v
593                break
594        return value
595
596    def match_namespace(self, el, tag):
597        """Match the namespace of the element."""
598
599        match = True
600        namespace = self.get_tag_ns(el)
601        default_namespace = self.namespaces.get('')
602        tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix, None)
603        # We must match the default namespace if one is not provided
604        if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
605            match = False
606        # If we specified `|tag`, we must not have a namespace.
607        elif (tag.prefix is not None and tag.prefix == '' and namespace):
608            match = False
609        # Verify prefix matches
610        elif (
611            tag.prefix and
612            tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
613        ):
614            match = False
615        return match
616
617    def match_attributes(self, el, attributes):
618        """Match attributes."""
619
620        match = True
621        if attributes:
622            for a in attributes:
623                value = self.match_attribute_name(el, a.attribute, a.prefix)
624                pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
625                if isinstance(value, list):
626                    value = ' '.join(value)
627                if value is None:
628                    match = False
629                    break
630                elif pattern is None:
631                    continue
632                elif pattern.match(value) is None:
633                    match = False
634                    break
635        return match
636
637    def match_tagname(self, el, tag):
638        """Match tag name."""
639
640        name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
641        return not (
642            name is not None and
643            name not in (self.get_tag(el), '*')
644        )
645
646    def match_tag(self, el, tag):
647        """Match the tag."""
648
649        match = True
650        if tag is not None:
651            # Verify namespace
652            if not self.match_namespace(el, tag):
653                match = False
654            if not self.match_tagname(el, tag):
655                match = False
656        return match
657
658    def match_past_relations(self, el, relation):
659        """Match past relationship."""
660
661        found = False
662        if relation[0].rel_type == REL_PARENT:
663            parent = self.get_parent(el, no_iframe=self.iframe_restrict)
664            while not found and parent:
665                found = self.match_selectors(parent, relation)
666                parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
667        elif relation[0].rel_type == REL_CLOSE_PARENT:
668            parent = self.get_parent(el, no_iframe=self.iframe_restrict)
669            if parent:
670                found = self.match_selectors(parent, relation)
671        elif relation[0].rel_type == REL_SIBLING:
672            sibling = self.get_previous(el)
673            while not found and sibling:
674                found = self.match_selectors(sibling, relation)
675                sibling = self.get_previous(sibling)
676        elif relation[0].rel_type == REL_CLOSE_SIBLING:
677            sibling = self.get_previous(el)
678            if sibling and self.is_tag(sibling):
679                found = self.match_selectors(sibling, relation)
680        return found
681
682    def match_future_child(self, parent, relation, recursive=False):
683        """Match future child."""
684
685        match = False
686        children = self.get_descendants if recursive else self.get_children
687        for child in children(parent, no_iframe=self.iframe_restrict):
688            match = self.match_selectors(child, relation)
689            if match:
690                break
691        return match
692
693    def match_future_relations(self, el, relation):
694        """Match future relationship."""
695
696        found = False
697        if relation[0].rel_type == REL_HAS_PARENT:
698            found = self.match_future_child(el, relation, True)
699        elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
700            found = self.match_future_child(el, relation)
701        elif relation[0].rel_type == REL_HAS_SIBLING:
702            sibling = self.get_next(el)
703            while not found and sibling:
704                found = self.match_selectors(sibling, relation)
705                sibling = self.get_next(sibling)
706        elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
707            sibling = self.get_next(el)
708            if sibling and self.is_tag(sibling):
709                found = self.match_selectors(sibling, relation)
710        return found
711
712    def match_relations(self, el, relation):
713        """Match relationship to other elements."""
714
715        found = False
716
717        if relation[0].rel_type.startswith(':'):
718            found = self.match_future_relations(el, relation)
719        else:
720            found = self.match_past_relations(el, relation)
721
722        return found
723
724    def match_id(self, el, ids):
725        """Match element's ID."""
726
727        found = True
728        for i in ids:
729            if i != self.get_attribute_by_name(el, 'id', ''):
730                found = False
731                break
732        return found
733
734    def match_classes(self, el, classes):
735        """Match element's classes."""
736
737        current_classes = self.get_classes(el)
738        found = True
739        for c in classes:
740            if c not in current_classes:
741                found = False
742                break
743        return found
744
745    def match_root(self, el):
746        """Match element as root."""
747
748        is_root = self.is_root(el)
749        if is_root:
750            sibling = self.get_previous(el, tags=False)
751            while is_root and sibling is not None:
752                if (
753                    self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
754                    self.is_cdata(sibling)
755                ):
756                    is_root = False
757                else:
758                    sibling = self.get_previous(sibling, tags=False)
759        if is_root:
760            sibling = self.get_next(el, tags=False)
761            while is_root and sibling is not None:
762                if (
763                    self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
764                    self.is_cdata(sibling)
765                ):
766                    is_root = False
767                else:
768                    sibling = self.get_next(sibling, tags=False)
769        return is_root
770
771    def match_scope(self, el):
772        """Match element as scope."""
773
774        return self.scope is el
775
776    def match_nth_tag_type(self, el, child):
777        """Match tag type for `nth` matches."""
778
779        return(
780            (self.get_tag(child) == self.get_tag(el)) and
781            (self.get_tag_ns(child) == self.get_tag_ns(el))
782        )
783
    def match_nth(self, el, nth):
        """
        Match `nth` elements.

        `nth` is a sequence of entries, each read here through its `selectors`,
        `last`, `a`, `b`, `n`, and `of_type` attributes.  The element must satisfy
        every entry: `matched` is reset for each entry and the loop stops at the
        first one that is not satisfied.  An empty `nth` matches.
        """

        matched = True

        for n in nth:
            matched = False
            # The element itself must satisfy the entry's selectors (the `of S` part), if any.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)
            if parent is None:
                # A parentless element is given a temporary parent so it can be traversed as a child.
                parent = self.create_fake_parent(el)
            last = n.last
            # NOTE(review): presumably `len(parent)` is the number of child nodes of any kind,
            # as it is for `_FakeParent` - confirm for Beautiful Soup tags.
            last_index = len(parent) - 1
            # When counting from the end, start at the last child and walk backwards.
            index = last_index if last else 0
            # Number of qualifying siblings seen so far; compared against the 1-based `idx`.
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            factor = -1 if last else 1
            # Target position: `a * count + b` when the entry is variable, otherwise the constant `a`.
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        # Previously too high and now too low: the in-bounds range was stepped over.
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        # Not getting any closer to the lower bound, so stop trying.
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        # Previously too low and now too high: the in-bounds range was stepped over.
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        # Not getting any closer to the upper bound, so stop trying.
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None
                # Evaluate while our child index is still in range.
                # `index` is advanced alongside the generator so a later pass resumes where this one stopped.
                for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                    index += factor
                    if not self.is_tag(child):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # Some other element holds this position; move on to the next target index.
                            break
                    if child is el:
                        break
                # Once the element itself has been reached, the answer for this entry is settled.
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                # The index did not change (a constant index, or `a` is zero), so there is nothing further to try.
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched
884
885    def match_empty(self, el):
886        """Check if element is empty (if requested)."""
887
888        is_empty = True
889        for child in self.get_children(el, tags=False):
890            if self.is_tag(child):
891                is_empty = False
892                break
893            elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
894                is_empty = False
895                break
896        return is_empty
897
898    def match_subselectors(self, el, selectors):
899        """Match selectors."""
900
901        match = True
902        for sel in selectors:
903            if not self.match_selectors(el, sel):
904                match = False
905        return match
906
907    def match_contains(self, el, contains):
908        """Match element if it contains text."""
909
910        match = True
911        content = None
912        for contain_list in contains:
913            if content is None:
914                content = self.get_text(el, no_iframe=self.is_html)
915            found = False
916            for text in contain_list.text:
917                if text in content:
918                    found = True
919                    break
920            if not found:
921                match = False
922        return match
923
924    def match_default(self, el):
925        """Match default."""
926
927        match = False
928
929        # Find this input's form
930        form = None
931        parent = self.get_parent(el, no_iframe=True)
932        while parent and form is None:
933            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
934                form = parent
935            else:
936                parent = self.get_parent(parent, no_iframe=True)
937
938        # Look in form cache to see if we've already located its default button
939        found_form = False
940        for f, t in self.cached_default_forms:
941            if f is form:
942                found_form = True
943                if t is el:
944                    match = True
945                break
946
947        # We didn't have the form cached, so look for its default button
948        if not found_form:
949            for child in self.get_descendants(form, no_iframe=True):
950                name = self.get_tag(child)
951                # Can't do nested forms (haven't figured out why we never hit this)
952                if name == 'form':  # pragma: no cover
953                    break
954                if name in ('input', 'button'):
955                    v = self.get_attribute_by_name(child, 'type', '')
956                    if v and util.lower(v) == 'submit':
957                        self.cached_default_forms.append([form, child])
958                        if el is child:
959                            match = True
960                        break
961        return match
962
963    def match_indeterminate(self, el):
964        """Match default."""
965
966        match = False
967        name = self.get_attribute_by_name(el, 'name')
968
969        def get_parent_form(el):
970            """Find this input's form."""
971            form = None
972            parent = self.get_parent(el, no_iframe=True)
973            while form is None:
974                if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
975                    form = parent
976                    break
977                last_parent = parent
978                parent = self.get_parent(parent, no_iframe=True)
979                if parent is None:
980                    form = last_parent
981                    break
982            return form
983
984        form = get_parent_form(el)
985
986        # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
987        found_form = False
988        for f, n, i in self.cached_indeterminate_forms:
989            if f is form and n == name:
990                found_form = True
991                if i is True:
992                    match = True
993                break
994
995        # We didn't have the form cached, so validate that the radio button is indeterminate
996        if not found_form:
997            checked = False
998            for child in self.get_descendants(form, no_iframe=True):
999                if child is el:
1000                    continue
1001                tag_name = self.get_tag(child)
1002                if tag_name == 'input':
1003                    is_radio = False
1004                    check = False
1005                    has_name = False
1006                    for k, v in self.iter_attributes(child):
1007                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
1008                            is_radio = True
1009                        elif util.lower(k) == 'name' and v == name:
1010                            has_name = True
1011                        elif util.lower(k) == 'checked':
1012                            check = True
1013                        if is_radio and check and has_name and get_parent_form(child) is form:
1014                            checked = True
1015                            break
1016                if checked:
1017                    break
1018            if not checked:
1019                match = True
1020            self.cached_indeterminate_forms.append([form, name, match])
1021
1022        return match
1023
    def match_lang(self, el, langs):
        """
        Match languages.

        The language of `el` is resolved by walking from `el` up through its
        parents until an element yields a truthy `lang` attribute (or a `lang`
        attribute in the XML namespace when namespaces are supported and the
        node is not in the HTML namespace). If nothing is found, a result
        previously cached for the same root in `self.cached_meta_lang` is used;
        failing that, for HTML documents the `head` is searched for a `meta`
        tag with `http-equiv="content-language"` and a `content` value, and the
        outcome (the language, or `False`) is cached for the root.

        `langs` is an iterable of pattern groups. The element matches only if
        every group has at least one pattern whose `match` accepts the resolved
        language. An empty `langs`, or no resolvable language, never matches.
        """

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        parent = el
        found_lang = None
        last = None
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                # Plain `lang` applies when namespaces are unsupported or the node is in the HTML namespace;
                # otherwise only a `lang` attribute in the XML namespace counts.
                # Names are lower-cased before comparing unless the document is XML.
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            # Ran out of parents: the last node visited becomes the root used for the cache and `meta` lookups.
            if parent is None:
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        # A cached `False` means "already searched, nothing found" and suppresses the search below,
        # since that search only runs while `found_lang` is still `None`.
        if not found_lang and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                if root is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
            # Find head
            # Descend `html` -> `head` starting from `parent` (the top-most node reached above).
            found = False
            for tag in ('html', 'head'):
                found = False
                for child in self.get_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found:
                for child in parent:
                    # NOTE(review): `is_html_tag` is applied to `parent` (the `head`) here, not to `child`.
                    if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            # Both attributes are needed; they may appear in either order.
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((root, found_lang))
                                break
                    if found_lang:
                        break
                # Remember the failed search so it is not repeated for this root.
                if not found_lang:
                    self.cached_meta_lang.append((root, False))

        # If we determined a language, compare.
        if found_lang:
            # Every pattern group must be satisfied by at least one of its patterns.
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if pattern.match(found_lang):
                        match = True
                if not match:
                    break

        return match
1109
1110    def match_dir(self, el, directionality):
1111        """Check directionality."""
1112
1113        # If we have to match both left and right, we can't match either.
1114        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
1115            return False
1116
1117        if el is None or not self.is_html_tag(el):
1118            return False
1119
1120        # Element has defined direction of left to right or right to left
1121        direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
1122        if direction not in (None, 0):
1123            return direction == directionality
1124
1125        # Element is the document element (the root) and no direction assigned, assume left to right.
1126        is_root = self.is_root(el)
1127        if is_root and direction is None:
1128            return ct.SEL_DIR_LTR == directionality
1129
1130        # If `input[type=telephone]` and no direction is assigned, assume left to right.
1131        name = self.get_tag(el)
1132        is_input = name == 'input'
1133        is_textarea = name == 'textarea'
1134        is_bdi = name == 'bdi'
1135        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
1136        if is_input and itype == 'tel' and direction is None:
1137            return ct.SEL_DIR_LTR == directionality
1138
1139        # Auto handling for text inputs
1140        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
1141            if is_textarea:
1142                value = []
1143                for node in self.get_contents(el, no_iframe=True):
1144                    if self.is_content_string(node):
1145                        value.append(node)
1146                value = ''.join(value)
1147            else:
1148                value = self.get_attribute_by_name(el, 'value', '')
1149            if value:
1150                for c in value:
1151                    bidi = unicodedata.bidirectional(c)
1152                    if bidi in ('AL', 'R', 'L'):
1153                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
1154                        return direction == directionality
1155                # Assume left to right
1156                return ct.SEL_DIR_LTR == directionality
1157            elif is_root:
1158                return ct.SEL_DIR_LTR == directionality
1159            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1160
1161        # Auto handling for `bdi` and other non text inputs.
1162        if (is_bdi and direction is None) or direction == 0:
1163            direction = self.find_bidi(el)
1164            if direction is not None:
1165                return direction == directionality
1166            elif is_root:
1167                return ct.SEL_DIR_LTR == directionality
1168            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1169
1170        # Match parents direction
1171        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1172
1173    def match_range(self, el, condition):
1174        """
1175        Match range.
1176
1177        Behavior is modeled after what we see in browsers. Browsers seem to evaluate
1178        if the value is out of range, and if not, it is in range. So a missing value
1179        will not evaluate out of range; therefore, value is in range. Personally, I
1180        feel like this should evaluate as neither in or out of range.
1181        """
1182
1183        out_of_range = False
1184
1185        itype = self.get_attribute_by_name(el, 'type').lower()
1186        mn = self.get_attribute_by_name(el, 'min', None)
1187        if mn is not None:
1188            mn = Inputs.parse_value(itype, mn)
1189        mx = self.get_attribute_by_name(el, 'max', None)
1190        if mx is not None:
1191            mx = Inputs.parse_value(itype, mx)
1192
1193        # There is no valid min or max, so we cannot evaluate a range
1194        if mn is None and mx is None:
1195            return False
1196
1197        value = self.get_attribute_by_name(el, 'value', None)
1198        if value is not None:
1199            value = Inputs.parse_value(itype, value)
1200        if value is not None:
1201            if itype in ("date", "datetime-local", "month", "week", "number", "range"):
1202                if mn is not None and value < mn:
1203                    out_of_range = True
1204                if not out_of_range and mx is not None and value > mx:
1205                    out_of_range = True
1206            elif itype == "time":
1207                if mn is not None and mx is not None and mn > mx:
1208                    # Time is periodic, so this is a reversed/discontinuous range
1209                    if value < mn and value > mx:
1210                        out_of_range = True
1211                else:
1212                    if mn is not None and value < mn:
1213                        out_of_range = True
1214                    if not out_of_range and mx is not None and value > mx:
1215                        out_of_range = True
1216
1217        return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
1218
1219    def match_defined(self, el):
1220        """
1221        Match defined.
1222
1223        `:defined` is related to custom elements in a browser.
1224
1225        - If the document is XML (not XHTML), all tags will match.
1226        - Tags that are not custom (don't have a hyphen) are marked defined.
1227        - If the tag has a prefix (without or without a namespace), it will not match.
1228
1229        This is of course requires the parser to provide us with the proper prefix and namespace info,
1230        if it doesn't, there is nothing we can do.
1231        """
1232
1233        name = self.get_tag(el)
1234        return (
1235            name.find('-') == -1 or
1236            name.find(':') != -1 or
1237            self.get_prefix(el) is not None
1238        )
1239
1240    def match_selectors(self, el, selectors):
1241        """Check if element matches one of the selectors."""
1242
1243        match = False
1244        is_not = selectors.is_not
1245        is_html = selectors.is_html
1246
1247        # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
1248        if is_html:
1249            namespaces = self.namespaces
1250            iframe_restrict = self.iframe_restrict
1251            self.namespaces = {'html': NS_XHTML}
1252            self.iframe_restrict = True
1253
1254        if not is_html or self.is_html:
1255            for selector in selectors:
1256                match = is_not
1257                # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
1258                if isinstance(selector, ct.SelectorNull):
1259                    continue
1260                # Verify tag matches
1261                if not self.match_tag(el, selector.tag):
1262                    continue
1263                # Verify tag is defined
1264                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
1265                    continue
1266                # Verify element is root
1267                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
1268                    continue
1269                # Verify element is scope
1270                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
1271                    continue
1272                # Verify `nth` matches
1273                if not self.match_nth(el, selector.nth):
1274                    continue
1275                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
1276                    continue
1277                # Verify id matches
1278                if selector.ids and not self.match_id(el, selector.ids):
1279                    continue
1280                # Verify classes match
1281                if selector.classes and not self.match_classes(el, selector.classes):
1282                    continue
1283                # Verify attribute(s) match
1284                if not self.match_attributes(el, selector.attributes):
1285                    continue
1286                # Verify ranges
1287                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
1288                    continue
1289                # Verify language patterns
1290                if selector.lang and not self.match_lang(el, selector.lang):
1291                    continue
1292                # Verify pseudo selector patterns
1293                if selector.selectors and not self.match_subselectors(el, selector.selectors):
1294                    continue
1295                # Verify relationship selectors
1296                if selector.relation and not self.match_relations(el, selector.relation):
1297                    continue
1298                # Validate that the current default selector match corresponds to the first submit button in the form
1299                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
1300                    continue
1301                # Validate that the unset radio button is among radio buttons with the same name in a form that are
1302                # also not set.
1303                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
1304                    continue
1305                # Validate element directionality
1306                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
1307                    continue
1308                # Validate that the tag contains the specified text.
1309                if not self.match_contains(el, selector.contains):
1310                    continue
1311                match = not is_not
1312                break
1313
1314        # Restore actual namespaces being used for external selector lists
1315        if is_html:
1316            self.namespaces = namespaces
1317            self.iframe_restrict = iframe_restrict
1318
1319        return match
1320
1321    def select(self, limit=0):
1322        """Match all tags under the targeted tag."""
1323
1324        if limit < 1:
1325            limit = None
1326
1327        for child in self.get_descendants(self.tag):
1328            if self.match(child):
1329                yield child
1330                if limit is not None:
1331                    limit -= 1
1332                    if limit < 1:
1333                        break
1334
1335    def closest(self):
1336        """Match closest ancestor."""
1337
1338        current = self.tag
1339        closest = None
1340        while closest is None and current is not None:
1341            if self.match(current):
1342                closest = current
1343            else:
1344                current = self.get_parent(current)
1345        return closest
1346
1347    def filter(self):  # noqa A001
1348        """Filter tag's children."""
1349
1350        return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
1351
1352    def match(self, el):
1353        """Match."""
1354
1355        return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
1356
1357
class CSSMatch(_DocumentNav, _Match):
    """
    The Beautiful Soup CSS match class.

    Defines no members of its own; it only combines `_DocumentNav` and `_Match`.
    """
1360
1361
class CommentsMatch(_DocumentNav):
    """Comments matcher."""

    def __init__(self, el):
        """Validate `el` and keep it as the node whose descendants are searched."""

        self.assert_valid_input(el)
        self.tag = el

    def get_comments(self, limit=0):
        """
        Get comments.

        Yields the comment nodes found among the descendants of the stored tag.
        A `limit` below 1 means no limit; otherwise iteration stops after
        `limit` comments.
        """

        remaining = None if limit < 1 else limit

        for node in self.get_descendants(self.tag, tags=False):
            if not self.is_comment(node):
                continue
            yield node
            if remaining is None:
                continue
            remaining -= 1
            if remaining < 1:
                return
1384
1385
class SoupSieve(ct.Immutable):
    """Compiled Soup Sieve selector matching object."""

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Store the compiled selector data through the immutable base class."""

        super(SoupSieve, self).__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _css_match(self, tag):
        """Build a `CSSMatch` for `tag` from this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag):
        """Check whether `tag` itself matches the selector."""

        return self._css_match(tag).match(tag)

    def closest(self, tag):
        """Match closest ancestor."""

        return self._css_match(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return self._css_match(iterable).filter()

        kept = []
        for node in iterable:
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
    def comments(self, tag, limit=0):
        """Get comments only."""

        return list(CommentsMatch(tag).get_comments(limit))

    @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
    def icomments(self, tag, limit=0):
        """Iterate comments only."""

        for node in CommentsMatch(tag).get_comments(limit):
            yield node

    def select_one(self, tag):
        """Select a single tag, or `None` when nothing matches."""

        found = self.select(tag, limit=1)
        if found:
            return found[0]
        return None

    def select(self, tag, limit=0):
        """Select the specified tags."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Iterate the specified tags."""

        # Kept as a generator so the matcher is only built once iteration starts.
        for node in self._css_match(tag).select(limit):
            yield node

    def __repr__(self):  # pragma: no cover
        """Representation."""

        template = "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})"
        return template.format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__
1470
1471
# Hand `SoupSieve` to the pickle registration helper in `css_types`
# (presumably so compiled selector objects can be pickled - confirm in `css_types`).
ct.pickle_register(SoupSieve)
1473