1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4try:
5    from collections.abc import Callable # Python 3.6
6except ImportError as e:
7    from collections import Callable
8import re
9import sys
10import warnings
11try:
12    import soupsieve
13except ImportError as e:
14    soupsieve = None
15    warnings.warn(
16        'The soupsieve package is not installed. CSS selectors cannot be used.'
17    )
18
19from bs4.formatter import (
20    Formatter,
21    HTMLFormatter,
22    XMLFormatter,
23)
24
# Name of an encoding. Presumably the default used when output is
# encoded; it is not referenced in this part of the file -- confirm
# against its users.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches one run of consecutive non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches one run of consecutive whitespace characters.
whitespace_re = re.compile(r"\s+")
33
34def _alias(attr):
35    """Alias one attribute name to another for backward compatibility"""
36    @property
37    def alias(self):
38        return getattr(self, attr)
39
40    @alias.setter
41    def alias(self):
42        return setattr(self, attr)
43    return alias
44
45
46# These encodings are recognized by Python (so PageElement.encode
47# could theoretically support them) but XML and HTML don't recognize
48# them (so they should not show up in an XML or HTML document as that
49# document's encoding).
50#
51# If an XML document is encoded in one of these encodings, no encoding
52# will be mentioned in the XML declaration. If an HTML document is
53# encoded in one of these encodings, and the HTML document has a
54# <meta> tag that mentions an encoding, the encoding will be given as
55# the empty string.
56#
57# Source:
58# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
73
74
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records how it was built.

    Besides its text, every instance carries three attributes:
    ``prefix`` (e.g. 'xml'), ``name`` (e.g. 'lang', or None when no
    name was given) and ``namespace`` (stored exactly as passed in).
    """

    def __new__(cls, prefix, name=None, namespace=None):
        # An empty or missing name denotes the default namespace, whose
        # name "has no value" per
        # https://www.w3.org/TR/xml-names/#defaulting -- so every falsy
        # name is normalized to None.
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = name
        attribute.namespace = namespace
        return attribute
97
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This str subclass adds no behavior of its own. In this module it is
    the common base class of CharsetMetaAttributeValue and
    ContentMetaAttributeValue.
    """
100
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        # Keep the text as parsed available under .original_value.
        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def encode(self, encoding):
        """Return the charset name to emit for the given output encoding.

        When an HTML document is being encoded to a given encoding, the
        value of a meta tag's 'charset' is the name of that encoding.

        :param encoding: Name of the encoding the document is being
            converted to.
        :return: The empty string if `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise `encoding` itself.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120
121
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 captures everything up to and including "charset=";
    # group 3 captures the charset value that gets replaced.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary: hand back a plain str.
            return str.__new__(str, original_value)

        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: Name of the encoding the document is being
            converted to.
        :return: The empty string if `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value
            with each charset value swapped for `encoding`.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
149
150
151class PageElement(object):
152    """Contains the navigational information for some part of the page:
153    that is, its current location in the parse tree.
154
155    NavigableString, Tag, etc. are all subclasses of PageElement.
156    """
157
158    def setup(self, parent=None, previous_element=None, next_element=None,
159              previous_sibling=None, next_sibling=None):
160        """Sets up the initial relations between this element and
161        other elements.
162
163        :param parent: The parent of this element.
164
165        :param previous_element: The element parsed immediately before
166            this one.
167
168        :param next_element: The element parsed immediately before
169            this one.
170
171        :param previous_sibling: The most recently encountered element
172            on the same level of the parse tree as this one.
173
174        :param previous_sibling: The next element to be encountered
175            on the same level of the parse tree as this one.
176        """
177        self.parent = parent
178
179        self.previous_element = previous_element
180        if previous_element is not None:
181            self.previous_element.next_element = self
182
183        self.next_element = next_element
184        if self.next_element is not None:
185            self.next_element.previous_element = self
186
187        self.next_sibling = next_sibling
188        if self.next_sibling is not None:
189            self.next_sibling.previous_sibling = self
190
191        if (previous_sibling is None
192            and self.parent is not None and self.parent.contents):
193            previous_sibling = self.parent.contents[-1]
194
195        self.previous_sibling = previous_sibling
196        if previous_sibling is not None:
197            self.previous_sibling.next_sibling = self
198
199    def format_string(self, s, formatter):
200        """Format the given string using the given formatter.
201
202        :param s: A string.
203        :param formatter: A Formatter object, or a string naming one of the standard formatters.
204        """
205        if formatter is None:
206            return s
207        if not isinstance(formatter, Formatter):
208            formatter = self.formatter_for_name(formatter)
209        output = formatter.substitute(s)
210        return output
211
212    def formatter_for_name(self, formatter):
213        """Look up or create a Formatter for the given identifier,
214        if necessary.
215
216        :param formatter: Can be a Formatter object (used as-is), a
217            function (used as the entity substitution hook for an
218            XMLFormatter or HTMLFormatter), or a string (used to look
219            up an XMLFormatter or HTMLFormatter in the appropriate
220            registry.
221        """
222        if isinstance(formatter, Formatter):
223            return formatter
224        if self._is_xml:
225            c = XMLFormatter
226        else:
227            c = HTMLFormatter
228        if isinstance(formatter, Callable):
229            return c(entity_substitution=formatter)
230        return c.REGISTRY[formatter]
231
232    @property
233    def _is_xml(self):
234        """Is this element part of an XML tree or an HTML tree?
235
236        This is used in formatter_for_name, when deciding whether an
237        XMLFormatter or HTMLFormatter is more appropriate. It can be
238        inefficient, but it should be called very rarely.
239        """
240        if self.known_xml is not None:
241            # Most of the time we will have determined this when the
242            # document is parsed.
243            return self.known_xml
244
245        # Otherwise, it's likely that this element was created by
246        # direct invocation of the constructor from within the user's
247        # Python code.
248        if self.parent is None:
249            # This is the top-level object. It should have .known_xml set
250            # from tree creation. If not, take a guess--BS is usually
251            # used on HTML markup.
252            return getattr(self, 'is_xml', False)
253        return self.parent._is_xml
254
255    nextSibling = _alias("next_sibling")  # BS3
256    previousSibling = _alias("previous_sibling")  # BS3
257
258    default = object()
    def _all_strings(self, strip=False, types=default):
        """Yield all strings of certain classes, possibly stripping them.

        This is implemented differently in Tag and NavigableString.

        :param strip: Whether the strings should be stripped.
        :param types: Selects which string classes are considered; the
            default is the class-level `default` sentinel object.
        :raises NotImplementedError: Always, in this base
            implementation.
        """
        raise NotImplementedError()
265
266    @property
267    def stripped_strings(self):
268        """Yield all strings in this PageElement, stripping them first.
269
270        :yield: A sequence of stripped strings.
271        """
272        for string in self._all_strings(True):
273            yield string
274
275    def get_text(self, separator="", strip=False,
276                 types=default):
277        """Get all child strings of this PageElement, concatenated using the
278        given separator.
279
280        :param separator: Strings will be concatenated using this separator.
281
282        :param strip: If True, strings will be stripped before being
283            concatenated.
284
285        :param types: A tuple of NavigableString subclasses. Any
286            strings of a subclass not found in this list will be
287            ignored. Although there are exceptions, the default
288            behavior in most cases is to consider only NavigableString
289            and CData objects. That means no comments, processing
290            instructions, etc.
291
292        :return: A string.
293        """
294        return separator.join([s for s in self._all_strings(
295                    strip, types=types)])
296    getText = get_text
297    text = property(get_text)
298
299    def replace_with(self, *args):
300        """Replace this PageElement with one or more PageElements, keeping the
301        rest of the tree the same.
302
303        :param args: One or more PageElements.
304        :return: `self`, no longer part of the tree.
305        """
306        if self.parent is None:
307            raise ValueError(
308                "Cannot replace one element with another when the "
309                "element to be replaced is not part of a tree.")
310        if len(args) == 1 and args[0] is self:
311            return
312        if any(x is self.parent for x in args):
313            raise ValueError("Cannot replace a Tag with its parent.")
314        old_parent = self.parent
315        my_index = self.parent.index(self)
316        self.extract(_self_index=my_index)
317        for idx, replace_with in enumerate(args, start=my_index):
318            old_parent.insert(idx, replace_with)
319        return self
320    replaceWith = replace_with  # BS3
321
322    def unwrap(self):
323        """Replace this PageElement with its contents.
324
325        :return: `self`, no longer part of the tree.
326        """
327        my_parent = self.parent
328        if self.parent is None:
329            raise ValueError(
330                "Cannot replace an element with its contents when that"
331                "element is not part of a tree.")
332        my_index = self.parent.index(self)
333        self.extract(_self_index=my_index)
334        for child in reversed(self.contents[:]):
335            my_parent.insert(my_index, child)
336        return self
337    replace_with_children = unwrap
338    replaceWithChildren = unwrap  # BS3
339
340    def wrap(self, wrap_inside):
341        """Wrap this PageElement inside another one.
342
343        :param wrap_inside: A PageElement.
344        :return: `wrap_inside`, occupying the position in the tree that used
345           to be occupied by `self`, and with `self` inside it.
346        """
347        me = self.replace_with(wrap_inside)
348        wrap_inside.append(me)
349        return wrap_inside
350
    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        The element is removed from its parent's .contents, and the
        element and sibling links around it are closed up so that its
        former neighbors point at each other.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                # No index supplied: look this element up in the parent.
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        # NOTE: _last_descendant() reads self.next_sibling, so it has
        # to run before the sibling links are cleared further down.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the outward element links of the detached subtree.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Close the gap in the sibling chain.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
388
389    def _last_descendant(self, is_initialized=True, accept_self=True):
390        """Finds the last element beneath this object to be parsed.
391
392        :param is_initialized: Has `setup` been called on this PageElement
393            yet?
394        :param accept_self: Is `self` an acceptable answer to the question?
395        """
396        if is_initialized and self.next_sibling is not None:
397            last_child = self.next_sibling.previous_element
398        else:
399            last_child = self
400            while isinstance(last_child, Tag) and last_child.contents:
401                last_child = last_child.contents[-1]
402        if not accept_self and last_child is self:
403            last_child = None
404        return last_child
405    # BS3: Not part of the API!
406    _lastRecursiveChild = _last_descendant
407
    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.contents` by the new PageElement. Values past the
           end are clamped to the end.
        :param new_child: A PageElement. A plain string is wrapped in a
           NavigableString first; a BeautifulSoup object contributes its
           children one at a time instead of itself.
        :raises ValueError: If `new_child` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported here rather than at the top of the module.
        # NOTE(review): presumably this avoids a circular import -- confirm.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        # Clamp positions beyond the end of the child list.
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            # Detach the element from wherever it currently lives.
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: nothing precedes it on this level, and the
            # element before it is this element itself.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            # Link after the child at position - 1 and after that
            # child's last descendant.
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        # The end of the inserted subtree is what links forward.
        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: there is no next sibling here.
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next
            # sibling; that sibling is the next element in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # Inserting before an existing child: link both ways.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        # Only now does the child list itself change.
        self.contents.insert(position, new_child)
489
490    def append(self, tag):
491        """Appends the given PageElement to the contents of this one.
492
493        :param tag: A PageElement.
494        """
495        self.insert(len(self.contents), tag)
496
497    def extend(self, tags):
498        """Appends the given PageElements to this one's contents.
499
500        :param tags: A list of PageElements.
501        """
502        if isinstance(tags, Tag):
503            # Calling self.append() on another tag's contents will change
504            # the list we're iterating over. Make a list that won't
505            # change.
506            tags = list(tags.contents)
507        for tag in tags:
508            self.append(tag)
509
510    def insert_before(self, *args):
511        """Makes the given element(s) the immediate predecessor of this one.
512
513        All the elements will have the same parent, and the given elements
514        will be immediately before this one.
515
516        :param args: One or more PageElements.
517        """
518        parent = self.parent
519        if parent is None:
520            raise ValueError(
521                "Element has no parent, so 'before' has no meaning.")
522        if any(x is self for x in args):
523                raise ValueError("Can't insert an element before itself.")
524        for predecessor in args:
525            # Extract first so that the index won't be screwed up if they
526            # are siblings.
527            if isinstance(predecessor, PageElement):
528                predecessor.extract()
529            index = parent.index(self)
530            parent.insert(index, predecessor)
531
532    def insert_after(self, *args):
533        """Makes the given element(s) the immediate successor of this one.
534
535        The elements will have the same parent, and the given elements
536        will be immediately after this one.
537
538        :param args: One or more PageElements.
539        """
540        # Do all error checking before modifying the tree.
541        parent = self.parent
542        if parent is None:
543            raise ValueError(
544                "Element has no parent, so 'after' has no meaning.")
545        if any(x is self for x in args):
546            raise ValueError("Can't insert an element after itself.")
547
548        offset = 0
549        for successor in args:
550            # Extract first so that the index won't be screwed up if they
551            # are siblings.
552            if isinstance(successor, PageElement):
553                successor.extract()
554            index = parent.index(self)
555            parent.insert(index+1+offset, successor)
556            offset += 1
557
558    def find_next(self, name=None, attrs={}, text=None, **kwargs):
559        """Find the first PageElement that matches the given criteria and
560        appears later in the document than this PageElement.
561
562        All find_* methods take a common set of arguments. See the online
563        documentation for detailed explanations.
564
565        :param name: A filter on tag name.
566        :param attrs: A dictionary of filters on attribute values.
567        :param text: A filter for a NavigableString with specific text.
568        :kwargs: A dictionary of filters on attribute values.
569        :return: A PageElement.
570        :rtype: bs4.element.Tag | bs4.element.NavigableString
571        """
572        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
573    findNext = find_next  # BS3
574
575    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
576                    **kwargs):
577        """Find all PageElements that match the given criteria and appear
578        later in the document than this PageElement.
579
580        All find_* methods take a common set of arguments. See the online
581        documentation for detailed explanations.
582
583        :param name: A filter on tag name.
584        :param attrs: A dictionary of filters on attribute values.
585        :param text: A filter for a NavigableString with specific text.
586        :param limit: Stop looking after finding this many results.
587        :kwargs: A dictionary of filters on attribute values.
588        :return: A ResultSet containing PageElements.
589        """
590        return self._find_all(name, attrs, text, limit, self.next_elements,
591                             **kwargs)
592    findAllNext = find_all_next  # BS3
593
594    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
595        """Find the closest sibling to this PageElement that matches the
596        given criteria and appears later in the document.
597
598        All find_* methods take a common set of arguments. See the
599        online documentation for detailed explanations.
600
601        :param name: A filter on tag name.
602        :param attrs: A dictionary of filters on attribute values.
603        :param text: A filter for a NavigableString with specific text.
604        :kwargs: A dictionary of filters on attribute values.
605        :return: A PageElement.
606        :rtype: bs4.element.Tag | bs4.element.NavigableString
607        """
608        return self._find_one(self.find_next_siblings, name, attrs, text,
609                             **kwargs)
610    findNextSibling = find_next_sibling  # BS3
611
612    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
613                           **kwargs):
614        """Find all siblings of this PageElement that match the given criteria
615        and appear later in the document.
616
617        All find_* methods take a common set of arguments. See the online
618        documentation for detailed explanations.
619
620        :param name: A filter on tag name.
621        :param attrs: A dictionary of filters on attribute values.
622        :param text: A filter for a NavigableString with specific text.
623        :param limit: Stop looking after finding this many results.
624        :kwargs: A dictionary of filters on attribute values.
625        :return: A ResultSet of PageElements.
626        :rtype: bs4.element.ResultSet
627        """
628        return self._find_all(name, attrs, text, limit,
629                              self.next_siblings, **kwargs)
630    findNextSiblings = find_next_siblings   # BS3
631    fetchNextSiblings = find_next_siblings  # BS2
632
633    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
634        """Look backwards in the document from this PageElement and find the
635        first PageElement that matches the given criteria.
636
637        All find_* methods take a common set of arguments. See the online
638        documentation for detailed explanations.
639
640        :param name: A filter on tag name.
641        :param attrs: A dictionary of filters on attribute values.
642        :param text: A filter for a NavigableString with specific text.
643        :kwargs: A dictionary of filters on attribute values.
644        :return: A PageElement.
645        :rtype: bs4.element.Tag | bs4.element.NavigableString
646        """
647        return self._find_one(
648            self.find_all_previous, name, attrs, text, **kwargs)
649    findPrevious = find_previous  # BS3
650
651    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
652                        **kwargs):
653        """Look backwards in the document from this PageElement and find all
654        PageElements that match the given criteria.
655
656        All find_* methods take a common set of arguments. See the online
657        documentation for detailed explanations.
658
659        :param name: A filter on tag name.
660        :param attrs: A dictionary of filters on attribute values.
661        :param text: A filter for a NavigableString with specific text.
662        :param limit: Stop looking after finding this many results.
663        :kwargs: A dictionary of filters on attribute values.
664        :return: A ResultSet of PageElements.
665        :rtype: bs4.element.ResultSet
666        """
667        return self._find_all(name, attrs, text, limit, self.previous_elements,
668                           **kwargs)
669    findAllPrevious = find_all_previous  # BS3
670    fetchPrevious = find_all_previous    # BS2
671
672    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
673        """Returns the closest sibling to this PageElement that matches the
674        given criteria and appears earlier in the document.
675
676        All find_* methods take a common set of arguments. See the online
677        documentation for detailed explanations.
678
679        :param name: A filter on tag name.
680        :param attrs: A dictionary of filters on attribute values.
681        :param text: A filter for a NavigableString with specific text.
682        :kwargs: A dictionary of filters on attribute values.
683        :return: A PageElement.
684        :rtype: bs4.element.Tag | bs4.element.NavigableString
685        """
686        return self._find_one(self.find_previous_siblings, name, attrs, text,
687                             **kwargs)
688    findPreviousSibling = find_previous_sibling  # BS3
689
690    def find_previous_siblings(self, name=None, attrs={}, text=None,
691                               limit=None, **kwargs):
692        """Returns all siblings to this PageElement that match the
693        given criteria and appear earlier in the document.
694
695        All find_* methods take a common set of arguments. See the online
696        documentation for detailed explanations.
697
698        :param name: A filter on tag name.
699        :param attrs: A dictionary of filters on attribute values.
700        :param text: A filter for a NavigableString with specific text.
701        :param limit: Stop looking after finding this many results.
702        :kwargs: A dictionary of filters on attribute values.
703        :return: A ResultSet of PageElements.
704        :rtype: bs4.element.ResultSet
705        """
706        return self._find_all(name, attrs, text, limit,
707                              self.previous_siblings, **kwargs)
708    findPreviousSiblings = find_previous_siblings   # BS3
709    fetchPreviousSiblings = find_previous_siblings  # BS2
710
711    def find_parent(self, name=None, attrs={}, **kwargs):
712        """Find the closest parent of this PageElement that matches the given
713        criteria.
714
715        All find_* methods take a common set of arguments. See the online
716        documentation for detailed explanations.
717
718        :param name: A filter on tag name.
719        :param attrs: A dictionary of filters on attribute values.
720        :kwargs: A dictionary of filters on attribute values.
721
722        :return: A PageElement.
723        :rtype: bs4.element.Tag | bs4.element.NavigableString
724        """
725        # NOTE: We can't use _find_one because findParents takes a different
726        # set of arguments.
727        r = None
728        l = self.find_parents(name, attrs, 1, **kwargs)
729        if l:
730            r = l[0]
731        return r
732    findParent = find_parent  # BS3
733
734    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
735        """Find all parents of this PageElement that match the given criteria.
736
737        All find_* methods take a common set of arguments. See the online
738        documentation for detailed explanations.
739
740        :param name: A filter on tag name.
741        :param attrs: A dictionary of filters on attribute values.
742        :param limit: Stop looking after finding this many results.
743        :kwargs: A dictionary of filters on attribute values.
744
745        :return: A PageElement.
746        :rtype: bs4.element.Tag | bs4.element.NavigableString
747        """
748        return self._find_all(name, attrs, None, limit, self.parents,
749                             **kwargs)
750    findParents = find_parents   # BS3
751    fetchParents = find_parents  # BS2
752
    @property
    def next(self):
        """The PageElement, if any, that was parsed just after this one.

        Read-only shorthand for the `next_element` attribute.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.next_element
761
    @property
    def previous(self):
        """The PageElement, if any, that was parsed just before this one.

        Read-only shorthand for the `previous_element` attribute.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.previous_element
770
771    #These methods do the real heavy lifting.
772
773    def _find_one(self, method, name, attrs, text, **kwargs):
774        r = None
775        l = method(name, attrs, text, 1, **kwargs)
776        if l:
777            r = l[0]
778        return r
779
780    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
781        "Iterates over a generator looking for things that match."
782
783        if text is None and 'string' in kwargs:
784            text = kwargs['string']
785            del kwargs['string']
786
787        if isinstance(name, SoupStrainer):
788            strainer = name
789        else:
790            strainer = SoupStrainer(name, attrs, text, **kwargs)
791
792        if text is None and not limit and not attrs and not kwargs:
793            if name is True or name is None:
794                # Optimization to find all tags.
795                result = (element for element in generator
796                          if isinstance(element, Tag))
797                return ResultSet(strainer, result)
798            elif isinstance(name, str):
799                # Optimization to find all tags with a given name.
800                if name.count(':') == 1:
801                    # This is a name with a prefix. If this is a namespace-aware document,
802                    # we need to match the local name against tag.name. If not,
803                    # we need to match the fully-qualified name against tag.name.
804                    prefix, local_name = name.split(':', 1)
805                else:
806                    prefix = None
807                    local_name = name
808                result = (element for element in generator
809                          if isinstance(element, Tag)
810                          and (
811                              element.name == name
812                          ) or (
813                              element.name == local_name
814                              and (prefix is None or element.prefix == prefix)
815                          )
816                )
817                return ResultSet(strainer, result)
818        results = ResultSet(strainer)
819        while True:
820            try:
821                i = next(generator)
822            except StopIteration:
823                break
824            if i:
825                found = strainer.search(i)
826                if found:
827                    results.append(found)
828                    if limit and len(results) >= limit:
829                        break
830        return results
831
832    #These generators can be used to navigate starting from both
833    #NavigableStrings and Tags.
834    @property
835    def next_elements(self):
836        """All PageElements that were parsed after this one.
837
838        :yield: A sequence of PageElements.
839        """
840        i = self.next_element
841        while i is not None:
842            yield i
843            i = i.next_element
844
845    @property
846    def next_siblings(self):
847        """All PageElements that are siblings of this one but were parsed
848        later.
849
850        :yield: A sequence of PageElements.
851        """
852        i = self.next_sibling
853        while i is not None:
854            yield i
855            i = i.next_sibling
856
857    @property
858    def previous_elements(self):
859        """All PageElements that were parsed before this one.
860
861        :yield: A sequence of PageElements.
862        """
863        i = self.previous_element
864        while i is not None:
865            yield i
866            i = i.previous_element
867
868    @property
869    def previous_siblings(self):
870        """All PageElements that are siblings of this one but were parsed
871        earlier.
872
873        :yield: A sequence of PageElements.
874        """
875        i = self.previous_sibling
876        while i is not None:
877            yield i
878            i = i.previous_sibling
879
880    @property
881    def parents(self):
882        """All PageElements that are parents of this PageElement.
883
884        :yield: A sequence of PageElements.
885        """
886        i = self.parent
887        while i is not None:
888            yield i
889            i = i.parent
890
891    @property
892    def decomposed(self):
893        """Check whether a PageElement has been decomposed.
894
895        :rtype: bool
896        """
897        return getattr(self, '_decomposed', False) or False
898
899    # Old non-property versions of the generators, for backwards
900    # compatibility with BS3.
901    def nextGenerator(self):
902        return self.next_elements
903
904    def nextSiblingGenerator(self):
905        return self.next_siblings
906
907    def previousGenerator(self):
908        return self.previous_elements
909
910    def previousSiblingGenerator(self):
911        return self.previous_siblings
912
913    def parentGenerator(self):
914        return self.parents
915
916
class NavigableString(str, PageElement):
    """A Python Unicode string that lives inside a parse tree.

    Parsing the markup <b>penguin</b> produces one NavigableString,
    holding the text "penguin".
    """

    # Text that output_ready() places before and after the formatted
    # string.
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            extra_args = ()
        else:
            # Not text yet: have str decode it using the default encoding.
            extra_args = (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *extra_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Build a string of the same class and contents by passing
        this one to its own class's constructor; the copy is not
        connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Supply the plain-str value as the sole argument to __new__."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raises AttributeError: for any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Format the string and wrap it in this class's PREFIX and
        SUFFIX.

        :param formatter: A Formatter object, or a string naming one of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no name.

        Having the property lets code like this run over a mixture of
        Tag and NavigableString objects without crashing:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse any attempt to give a NavigableString a name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of an acceptable class, possibly
        stripping it first.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, the string will be stripped before being
            yielded.

        :param types: A NavigableString subclass, or a tuple of them.
            If this string's exact class isn't among them, nothing is
            yielded. None accepts any class. By default, the classes
            considered are Tag.DEFAULT_INTERESTING_STRING_TYPES.

        :yield: A sequence that either contains this string, or is empty.

        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.DEFAULT_INTERESTING_STRING_TYPES

        # Bail out when the caller wants specific string classes and
        # this string's exact class is not one of them.
        own_class = type(self)
        if types is not None:
            if isinstance(types, type):
                # A single class was requested.
                acceptable = own_class is types
            else:
                # A collection of classes was requested.
                acceptable = own_class in types
            if not acceptable:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)
1032
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    This is an abstract base for special kinds of strings such as
    comments (the Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Wrap the raw string in this subclass's prefix and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        if formatter is not None:
            # Called purely for its side effects; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
1059
class CData(PreformattedString):
    """A CDATA block."""
    # Delimiters that PreformattedString.output_ready() places around
    # the string's text.
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
1064
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction."""

    # Delimiters that PreformattedString.output_ready() places around
    # the string's text. Note the closing delimiter is a bare '>'.
    PREFIX = '<?'
    SUFFIX = '>'
1070
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction."""
    # Same opening delimiter as the parent class, but the closing
    # delimiter is '?>' rather than '>'.
    PREFIX = '<?'
    SUFFIX = '?>'
1075
class Comment(PreformattedString):
    """An HTML or XML comment."""
    # Delimiters that PreformattedString.output_ready() places around
    # the string's text.
    PREFIX = '<!--'
    SUFFIX = '-->'
1080
1081
class Declaration(PreformattedString):
    """An XML declaration."""
    # Delimiters that PreformattedString.output_ready() places around
    # the string's text.
    PREFIX = '<?'
    SUFFIX = '?>'
1086
1087
class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build the document type declaration that corresponds to a
        root element name, a public ID and a system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        root_name = name or ''
        if pub_id is not None:
            # PUBLIC form; the system ID, when present, follows the
            # public ID.
            identifiers = ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                identifiers = identifiers + ' "%s"' % system_id
        elif system_id is not None:
            # SYSTEM form: only a system ID was given.
            identifiers = ' SYSTEM "%s"' % system_id
        else:
            # Neither identifier was given: the name stands alone.
            return Doctype(root_name)
        return Doctype(root_name + identifiers)
1115
1116
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    It adds no behavior of its own; the separate class exists so that
    embedded stylesheets can be told apart from textual content.
    """
1124
1125
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    It adds no behavior of its own; the separate class exists so that
    executable code can be told apart from textual content.
    """
1133
1134
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    It adds no behavior of its own; the separate class exists so that
    such strings can be told apart from the main body of the document.
    """
1142
1143
1144class Tag(PageElement):
1145    """Represents an HTML or XML tag that is part of a parse tree, along
1146    with its attributes and contents.
1147
1148    When Beautiful Soup parses the markup <b>penguin</b>, it will
1149    create a Tag object representing the <b> tag.
1150    """
1151
    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None,
                 is_xml=None, sourceline=None, sourcepos=None,
                 can_be_empty_element=None, cdata_list_attributes=None,
                 preserve_whitespace_tags=None,
                 interesting_string_types=None,
    ):
        """Basic constructor.

        When a `builder` is supplied, it decides the values of
        can_be_empty_element, cdata_list_attributes,
        preserve_whitespace_tags and interesting_string_types; the
        arguments of the same names are only used when there is no
        builder.

        :param parser: A BeautifulSoup object.
        :param builder: A TreeBuilder.
        :param name: The name of the tag.
        :param namespace: The URI of this Tag's XML namespace, if any.
        :param prefix: The prefix for this Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values.
        :param parent: The PageElement to use as this Tag's parent.
        :param previous: The PageElement that was parsed immediately before
            this tag.
        :param is_xml: If True, this is an XML tag. Otherwise, this is an
            HTML tag.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param can_be_empty_element: If True, this tag should be
            represented as <tag/>. If False, this tag should be represented
            as <tag></tag>.
        :param cdata_list_attributes: A list of attributes whose values should
            be treated as CDATA if they ever show up on this tag.
        :param preserve_whitespace_tags: A list of tag names whose contents
            should have their whitespace preserved.
        :param interesting_string_types: This is a NavigableString
            subclass or a tuple of them. When iterating over this
            Tag's strings in methods like Tag.strings or Tag.get_text,
            these are the types of strings that are interesting enough
            to be considered. The default is to consider
            NavigableString and CData the only interesting string
            subtypes.
        :raises ValueError: If `name` is None.
        """
        if parser is None:
            self.parser_class = None
        else:
            # We don't actually store the parser object: that lets extracted
            # chunks be garbage-collected.
            self.parser_class = parser.__class__
        if name is None:
            raise ValueError("No value provided for new tag's name.")
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
        # Record the source position only when the builder (if there is
        # one) stores line numbers and at least one of the two values
        # was actually supplied. Otherwise neither attribute is set.
        if ((not builder or builder.store_line_numbers)
            and (sourceline is not None or sourcepos is not None)):
            self.sourceline = sourceline
            self.sourcepos = sourcepos
        # Normalize `attrs`: None becomes a fresh dict; a non-empty
        # mapping is handed to the builder for CDATA-list processing
        # when the builder has such attributes configured; everything
        # else is copied with dict().
        if attrs is None:
            attrs = {}
        elif attrs:
            if builder is not None and builder.cdata_list_attributes:
                attrs = builder._replace_cdata_list_attribute_values(
                    self.name, attrs)
            else:
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)

        # If possible, determine ahead of time whether this tag is an
        # XML tag.
        if builder:
            self.known_xml = builder.is_xml
        else:
            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

        if builder is None:
            # In the absence of a TreeBuilder, use whatever values were
            # passed in here. They're probably None, unless this is a copy of some
            # other tag.
            self.can_be_empty_element = can_be_empty_element
            self.cdata_list_attributes = cdata_list_attributes
            self.preserve_whitespace_tags = preserve_whitespace_tags
            self.interesting_string_types = interesting_string_types
        else:
            # Set up any substitutions for this tag, such as the charset in a META tag.
            builder.set_up_substitutions(self)

            # Ask the TreeBuilder whether this tag might be an empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)

            # Keep track of the list of attributes of this tag that
            # might need to be treated as a list.
            #
            # For performance reasons, we store the whole data structure
            # rather than asking the question of every tag. Asking would
            # require building a new data structure every time, and
            # (unlike can_be_empty_element), we almost never need
            # to check this.
            self.cdata_list_attributes = builder.cdata_list_attributes

            # Keep track of the names that might cause this tag to be treated as a
            # whitespace-preserved tag.
            self.preserve_whitespace_tags = builder.preserve_whitespace_tags

            if self.name in builder.string_containers:
                # This sort of tag uses a special string container
                # subclass for most of its strings. Record that
                # subclass as the interesting string type, so it is
                # what gets considered when this tag's strings are
                # iterated over.
                self.interesting_string_types = builder.string_containers[self.name]
            else:
                self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
1263
1264    parserClass = _alias("parser_class")  # BS3
1265
1266    def __copy__(self):
1267        """A copy of a Tag is a new Tag, unconnected to the parse tree.
1268        Its contents are a copy of the old Tag's contents.
1269        """
1270        clone = type(self)(
1271            None, self.builder, self.name, self.namespace,
1272            self.prefix, self.attrs, is_xml=self._is_xml,
1273            sourceline=self.sourceline, sourcepos=self.sourcepos,
1274            can_be_empty_element=self.can_be_empty_element,
1275            cdata_list_attributes=self.cdata_list_attributes,
1276            preserve_whitespace_tags=self.preserve_whitespace_tags
1277        )
1278        for attr in ('can_be_empty_element', 'hidden'):
1279            setattr(clone, attr, getattr(self, attr))
1280        for child in self.contents:
1281            clone.append(child.__copy__())
1282        return clone
1283
1284    @property
1285    def is_empty_element(self):
1286        """Is this tag an empty-element tag? (aka a self-closing tag)
1287
1288        A tag that has contents is never an empty-element tag.
1289
1290        A tag that has no contents may or may not be an empty-element
1291        tag. It depends on the builder used to create the tag. If the
1292        builder has a designated list of empty-element tags, then only
1293        a tag whose name shows up in that list is considered an
1294        empty-element tag.
1295
1296        If the builder has no designated list of empty-element tags,
1297        then any tag with no contents is an empty-element tag.
1298        """
1299        return len(self.contents) == 0 and self.can_be_empty_element
1300    isSelfClosing = is_empty_element  # BS3
1301
1302    @property
1303    def string(self):
1304        """Convenience property to get the single string within this
1305        PageElement.
1306
1307        TODO It might make sense to have NavigableString.string return
1308        itself.
1309
1310        :return: If this element has a single string child, return
1311         value is that string. If this element has one child tag,
1312         return value is the 'string' attribute of the child tag,
1313         recursively. If this element is itself a string, has no
1314         children, or has more than one child, return value is None.
1315        """
1316        if len(self.contents) != 1:
1317            return None
1318        child = self.contents[0]
1319        if isinstance(child, NavigableString):
1320            return child
1321        return child.string
1322
1323    @string.setter
1324    def string(self, string):
1325        """Replace this PageElement's contents with `string`."""
1326        self.clear()
1327        self.append(string.__class__(string))
1328
1329    DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
1330    def _all_strings(self, strip=False, types=PageElement.default):
1331        """Yield all strings of certain classes, possibly stripping them.
1332
1333        :param strip: If True, all strings will be stripped before being
1334            yielded.
1335
1336        :param types: A tuple of NavigableString subclasses. Any strings of
1337            a subclass not found in this list will be ignored. By
1338            default, the subclasses considered are the ones found in
1339            self.interesting_string_types. If that's not specified,
1340            only NavigableString and CData objects will be
1341            considered. That means no comments, processing
1342            instructions, etc.
1343
1344        :yield: A sequence of strings.
1345
1346        """
1347        if types is self.default:
1348            types = self.interesting_string_types
1349
1350        for descendant in self.descendants:
1351            if (types is None and not isinstance(descendant, NavigableString)):
1352                continue
1353            descendant_type = type(descendant)
1354            if isinstance(types, type):
1355                if descendant_type is not types:
1356                    # We're not interested in strings of this type.
1357                    continue
1358            elif types is not None and descendant_type not in types:
1359                # We're not interested in strings of this type.
1360                continue
1361            if strip:
1362                descendant = descendant.strip()
1363                if len(descendant) == 0:
1364                    continue
1365            yield descendant
1366    strings = property(_all_strings)
1367
1368    def decompose(self):
1369        """Recursively destroys this PageElement and its children.
1370
1371        This element will be removed from the tree and wiped out; so
1372        will everything beneath it.
1373
1374        The behavior of a decomposed PageElement is undefined and you
1375        should never use one for anything, but if you need to _check_
1376        whether an element has been decomposed, you can use the
1377        `decomposed` property.
1378        """
1379        self.extract()
1380        i = self
1381        while i is not None:
1382            n = i.next_element
1383            i.__dict__.clear()
1384            i.contents = []
1385            i._decomposed = True
1386            i = n
1387
1388    def clear(self, decompose=False):
1389        """Wipe out all children of this PageElement by calling extract()
1390           on them.
1391
1392        :param decompose: If this is True, decompose() (a more
1393            destructive method) will be called instead of extract().
1394        """
1395        if decompose:
1396            for element in self.contents[:]:
1397                if isinstance(element, Tag):
1398                    element.decompose()
1399                else:
1400                    element.extract()
1401        else:
1402            for element in self.contents[:]:
1403                element.extract()
1404
1405    def smooth(self):
1406        """Smooth out this element's children by consolidating consecutive
1407        strings.
1408
1409        This makes pretty-printed output look more natural following a
1410        lot of operations that modified the tree.
1411        """
1412        # Mark the first position of every pair of children that need
1413        # to be consolidated.  Do this rather than making a copy of
1414        # self.contents, since in most cases very few strings will be
1415        # affected.
1416        marked = []
1417        for i, a in enumerate(self.contents):
1418            if isinstance(a, Tag):
1419                # Recursively smooth children.
1420                a.smooth()
1421            if i == len(self.contents)-1:
1422                # This is the last item in .contents, and it's not a
1423                # tag. There's no chance it needs any work.
1424                continue
1425            b = self.contents[i+1]
1426            if (isinstance(a, NavigableString)
1427                and isinstance(b, NavigableString)
1428                and not isinstance(a, PreformattedString)
1429                and not isinstance(b, PreformattedString)
1430            ):
1431                marked.append(i)
1432
1433        # Go over the marked positions in reverse order, so that
1434        # removing items from .contents won't affect the remaining
1435        # positions.
1436        for i in reversed(marked):
1437            a = self.contents[i]
1438            b = self.contents[i+1]
1439            b.extract()
1440            n = NavigableString(a+b)
1441            a.replace_with(n)
1442
1443    def index(self, element):
1444        """Find the index of a child by identity, not value.
1445
1446        Avoids issues with tag.contents.index(element) getting the
1447        index of equal elements.
1448
1449        :param element: Look for this PageElement in `self.contents`.
1450        """
1451        for i, child in enumerate(self.contents):
1452            if child is element:
1453                return i
1454        raise ValueError("Tag.index: element not in tag")
1455
1456    def get(self, key, default=None):
1457        """Returns the value of the 'key' attribute for the tag, or
1458        the value given for 'default' if it doesn't have that
1459        attribute."""
1460        return self.attrs.get(key, default)
1461
1462    def get_attribute_list(self, key, default=None):
1463        """The same as get(), but always returns a list.
1464
1465        :param key: The attribute to look for.
1466        :param default: Use this value if the attribute is not present
1467            on this PageElement.
1468        :return: A list of values, probably containing only a single
1469            value.
1470        """
1471        value = self.get(key, default)
1472        if not isinstance(value, list):
1473            value = [value]
1474        return value
1475
1476    def has_attr(self, key):
1477        """Does this PageElement have an attribute with the given name?"""
1478        return key in self.attrs
1479
1480    def __hash__(self):
1481        return str(self).__hash__()
1482
1483    def __getitem__(self, key):
1484        """tag[key] returns the value of the 'key' attribute for the Tag,
1485        and throws an exception if it's not there."""
1486        return self.attrs[key]
1487
1488    def __iter__(self):
1489        "Iterating over a Tag iterates over its contents."
1490        return iter(self.contents)
1491
1492    def __len__(self):
1493        "The length of a Tag is the length of its list of contents."
1494        return len(self.contents)
1495
1496    def __contains__(self, x):
1497        return x in self.contents
1498
1499    def __bool__(self):
1500        "A tag is non-None even if it has no contents."
1501        return True
1502
1503    def __setitem__(self, key, value):
1504        """Setting tag[key] sets the value of the 'key' attribute for the
1505        tag."""
1506        self.attrs[key] = value
1507
1508    def __delitem__(self, key):
1509        "Deleting tag[key] deletes all 'key' attributes for the tag."
1510        self.attrs.pop(key, None)
1511
1512    def __call__(self, *args, **kwargs):
1513        """Calling a Tag like a function is the same as calling its
1514        find_all() method. Eg. tag('a') returns a list of all the A tags
1515        found within this tag."""
1516        return self.find_all(*args, **kwargs)
1517
1518    def __getattr__(self, tag):
1519        """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1520        #print("Getattr %s.%s" % (self.__class__, tag))
1521        if len(tag) > 3 and tag.endswith('Tag'):
1522            # BS3: soup.aTag -> "soup.find("a")
1523            tag_name = tag[:-3]
1524            warnings.warn(
1525                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
1526                    name=tag_name
1527                )
1528            )
1529            return self.find(tag_name)
1530        # We special case contents to avoid recursion.
1531        elif not tag.startswith("__") and not tag == "contents":
1532            return self.find(tag)
1533        raise AttributeError(
1534            "'%s' object has no attribute '%s'" % (self.__class__, tag))
1535
1536    def __eq__(self, other):
1537        """Returns true iff this Tag has the same name, the same attributes,
1538        and the same contents (recursively) as `other`."""
1539        if self is other:
1540            return True
1541        if (not hasattr(other, 'name') or
1542            not hasattr(other, 'attrs') or
1543            not hasattr(other, 'contents') or
1544            self.name != other.name or
1545            self.attrs != other.attrs or
1546            len(self) != len(other)):
1547            return False
1548        for i, my_child in enumerate(self.contents):
1549            if my_child != other.contents[i]:
1550                return False
1551        return True
1552
1553    def __ne__(self, other):
1554        """Returns true iff this Tag is not identical to `other`,
1555        as defined in __eq__."""
1556        return not self == other
1557
1558    def __repr__(self, encoding="unicode-escape"):
1559        """Renders this PageElement as a string.
1560
1561        :param encoding: The encoding to use (Python 2 only).
1562        :return: Under Python 2, a bytestring; under Python 3,
1563            a Unicode string.
1564        """
1565        if PY3K:
1566            # "The return value must be a string object", i.e. Unicode
1567            return self.decode()
1568        else:
1569            # "The return value must be a string object", i.e. a bytestring.
1570            # By convention, the return value of __repr__ should also be
1571            # an ASCII string.
1572            return self.encode(encoding)
1573
1574    def __unicode__(self):
1575        """Renders this PageElement as a Unicode string."""
1576        return self.decode()
1577
1578    def __str__(self):
1579        """Renders this PageElement as a generic string.
1580
1581        :return: Under Python 2, a UTF-8 bytestring; under Python 3,
1582            a Unicode string.
1583        """
1584        if PY3K:
1585            return self.decode()
1586        else:
1587            return self.encode()
1588
    # Under Python 3, rebind __str__ and __repr__ to __unicode__ so that
    # all three conversions share one implementation (a plain decode()
    # call). The separate definitions above stay in effect only when
    # PY3K is false.
    if PY3K:
        __str__ = __repr__ = __unicode__
1591
1592    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1593               indent_level=None, formatter="minimal",
1594               errors="xmlcharrefreplace"):
1595        """Render a bytestring representation of this PageElement and its
1596        contents.
1597
1598        :param encoding: The destination encoding.
1599        :param indent_level: Each line of the rendering will be
1600            indented this many spaces. Used internally in
1601            recursive calls while pretty-printing.
1602        :param formatter: A Formatter object, or a string naming one of
1603            the standard formatters.
1604        :param errors: An error handling strategy such as
1605            'xmlcharrefreplace'. This value is passed along into
1606            encode() and its value should be one of the constants
1607            defined by Python.
1608        :return: A bytestring.
1609
1610        """
1611        # Turn the data structure into Unicode, then encode the
1612        # Unicode.
1613        u = self.decode(indent_level, encoding, formatter)
1614        return u.encode(encoding, errors)
1615
1616    def decode(self, indent_level=None,
1617               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1618               formatter="minimal"):
1619        """Render a Unicode representation of this PageElement and its
1620        contents.
1621
1622        :param indent_level: Each line of the rendering will be
1623             indented this many spaces. Used internally in
1624             recursive calls while pretty-printing.
1625        :param eventual_encoding: The tag is destined to be
1626            encoded into this encoding. This method is _not_
1627            responsible for performing that encoding. This information
1628            is passed in so that it can be substituted in if the
1629            document contains a <META> tag that mentions the document's
1630            encoding.
1631        :param formatter: A Formatter object, or a string naming one of
1632            the standard formatters.
1633        """
1634
1635        # First off, turn a non-Formatter `formatter` into a Formatter
1636        # object. This will stop the lookup from happening over and
1637        # over again.
1638        if not isinstance(formatter, Formatter):
1639            formatter = self.formatter_for_name(formatter)
1640        attributes = formatter.attributes(self)
1641        attrs = []
1642        for key, val in attributes:
1643            if val is None:
1644                decoded = key
1645            else:
1646                if isinstance(val, list) or isinstance(val, tuple):
1647                    val = ' '.join(val)
1648                elif not isinstance(val, str):
1649                    val = str(val)
1650                elif (
1651                        isinstance(val, AttributeValueWithCharsetSubstitution)
1652                        and eventual_encoding is not None
1653                ):
1654                    val = val.encode(eventual_encoding)
1655
1656                text = formatter.attribute_value(val)
1657                decoded = (
1658                    str(key) + '='
1659                    + formatter.quoted_attribute_value(text))
1660            attrs.append(decoded)
1661        close = ''
1662        closeTag = ''
1663
1664        prefix = ''
1665        if self.prefix:
1666            prefix = self.prefix + ":"
1667
1668        if self.is_empty_element:
1669            close = formatter.void_element_close_prefix or ''
1670        else:
1671            closeTag = '</%s%s>' % (prefix, self.name)
1672
1673        pretty_print = self._should_pretty_print(indent_level)
1674        space = ''
1675        indent_space = ''
1676        if indent_level is not None:
1677            indent_space = (' ' * (indent_level - 1))
1678        if pretty_print:
1679            space = indent_space
1680            indent_contents = indent_level + 1
1681        else:
1682            indent_contents = None
1683        contents = self.decode_contents(
1684            indent_contents, eventual_encoding, formatter
1685        )
1686
1687        if self.hidden:
1688            # This is the 'document root' object.
1689            s = contents
1690        else:
1691            s = []
1692            attribute_string = ''
1693            if attrs:
1694                attribute_string = ' ' + ' '.join(attrs)
1695            if indent_level is not None:
1696                # Even if this particular tag is not pretty-printed,
1697                # we should indent up to the start of the tag.
1698                s.append(indent_space)
1699            s.append('<%s%s%s%s>' % (
1700                    prefix, self.name, attribute_string, close))
1701            if pretty_print:
1702                s.append("\n")
1703            s.append(contents)
1704            if pretty_print and contents and contents[-1] != "\n":
1705                s.append("\n")
1706            if pretty_print and closeTag:
1707                s.append(space)
1708            s.append(closeTag)
1709            if indent_level is not None and closeTag and self.next_sibling:
1710                # Even if this particular tag is not pretty-printed,
1711                # we're now done with the tag, and we should add a
1712                # newline if appropriate.
1713                s.append("\n")
1714            s = ''.join(s)
1715        return s
1716
1717    def _should_pretty_print(self, indent_level):
1718        """Should this tag be pretty-printed?
1719
1720        Most of them should, but some (such as <pre> in HTML
1721        documents) should not.
1722        """
1723        return (
1724            indent_level is not None
1725            and (
1726                not self.preserve_whitespace_tags
1727                or self.name not in self.preserve_whitespace_tags
1728            )
1729        )
1730
1731    def prettify(self, encoding=None, formatter="minimal"):
1732        """Pretty-print this PageElement as a string.
1733
1734        :param encoding: The eventual encoding of the string. If this is None,
1735            a Unicode string will be returned.
1736        :param formatter: A Formatter object, or a string naming one of
1737            the standard formatters.
1738        :return: A Unicode string (if encoding==None) or a bytestring
1739            (otherwise).
1740        """
1741        if encoding is None:
1742            return self.decode(True, formatter=formatter)
1743        else:
1744            return self.encode(encoding, True, formatter=formatter)
1745
1746    def decode_contents(self, indent_level=None,
1747                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1748                       formatter="minimal"):
1749        """Renders the contents of this tag as a Unicode string.
1750
1751        :param indent_level: Each line of the rendering will be
1752           indented this many spaces. Used internally in
1753           recursive calls while pretty-printing.
1754
1755        :param eventual_encoding: The tag is destined to be
1756           encoded into this encoding. decode_contents() is _not_
1757           responsible for performing that encoding. This information
1758           is passed in so that it can be substituted in if the
1759           document contains a <META> tag that mentions the document's
1760           encoding.
1761
1762        :param formatter: A Formatter object, or a string naming one of
1763            the standard Formatters.
1764        """
1765        # First off, turn a string formatter into a Formatter object. This
1766        # will stop the lookup from happening over and over again.
1767        if not isinstance(formatter, Formatter):
1768            formatter = self.formatter_for_name(formatter)
1769
1770        pretty_print = (indent_level is not None)
1771        s = []
1772        for c in self:
1773            text = None
1774            if isinstance(c, NavigableString):
1775                text = c.output_ready(formatter)
1776            elif isinstance(c, Tag):
1777                s.append(c.decode(indent_level, eventual_encoding,
1778                                  formatter))
1779            preserve_whitespace = (
1780                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
1781            )
1782            if text and indent_level and not preserve_whitespace:
1783                text = text.strip()
1784            if text:
1785                if pretty_print and not preserve_whitespace:
1786                    s.append(" " * (indent_level - 1))
1787                s.append(text)
1788                if pretty_print and not preserve_whitespace:
1789                    s.append("\n")
1790        return ''.join(s)
1791
1792    def encode_contents(
1793        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1794        formatter="minimal"):
1795        """Renders the contents of this PageElement as a bytestring.
1796
1797        :param indent_level: Each line of the rendering will be
1798           indented this many spaces. Used internally in
1799           recursive calls while pretty-printing.
1800
1801        :param eventual_encoding: The bytestring will be in this encoding.
1802
1803        :param formatter: A Formatter object, or a string naming one of
1804            the standard Formatters.
1805
1806        :return: A bytestring.
1807        """
1808        contents = self.decode_contents(indent_level, encoding, formatter)
1809        return contents.encode(encoding)
1810
1811    # Old method for BS3 compatibility
1812    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1813                       prettyPrint=False, indentLevel=0):
1814        """Deprecated method for BS3 compatibility."""
1815        if not prettyPrint:
1816            indentLevel = None
1817        return self.encode_contents(
1818            indent_level=indentLevel, encoding=encoding)
1819
1820    #Soup methods
1821
1822    def find(self, name=None, attrs={}, recursive=True, text=None,
1823             **kwargs):
1824        """Look in the children of this PageElement and find the first
1825        PageElement that matches the given criteria.
1826
1827        All find_* methods take a common set of arguments. See the online
1828        documentation for detailed explanations.
1829
1830        :param name: A filter on tag name.
1831        :param attrs: A dictionary of filters on attribute values.
1832        :param recursive: If this is True, find() will perform a
1833            recursive search of this PageElement's children. Otherwise,
1834            only the direct children will be considered.
1835        :param limit: Stop looking after finding this many results.
1836        :kwargs: A dictionary of filters on attribute values.
1837        :return: A PageElement.
1838        :rtype: bs4.element.Tag | bs4.element.NavigableString
1839        """
1840        r = None
1841        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1842        if l:
1843            r = l[0]
1844        return r
1845    findChild = find #BS2
1846
1847    def find_all(self, name=None, attrs={}, recursive=True, text=None,
1848                 limit=None, **kwargs):
1849        """Look in the children of this PageElement and find all
1850        PageElements that match the given criteria.
1851
1852        All find_* methods take a common set of arguments. See the online
1853        documentation for detailed explanations.
1854
1855        :param name: A filter on tag name.
1856        :param attrs: A dictionary of filters on attribute values.
1857        :param recursive: If this is True, find_all() will perform a
1858            recursive search of this PageElement's children. Otherwise,
1859            only the direct children will be considered.
1860        :param limit: Stop looking after finding this many results.
1861        :kwargs: A dictionary of filters on attribute values.
1862        :return: A ResultSet of PageElements.
1863        :rtype: bs4.element.ResultSet
1864        """
1865        generator = self.descendants
1866        if not recursive:
1867            generator = self.children
1868        return self._find_all(name, attrs, text, limit, generator, **kwargs)
1869    findAll = find_all       # BS3
1870    findChildren = find_all  # BS2
1871
1872    #Generator methods
1873    @property
1874    def children(self):
1875        """Iterate over all direct children of this PageElement.
1876
1877        :yield: A sequence of PageElements.
1878        """
1879        # return iter() to make the purpose of the method clear
1880        return iter(self.contents)  # XXX This seems to be untested.
1881
1882    @property
1883    def descendants(self):
1884        """Iterate over all children of this PageElement in a
1885        breadth-first sequence.
1886
1887        :yield: A sequence of PageElements.
1888        """
1889        if not len(self.contents):
1890            return
1891        stopNode = self._last_descendant().next_element
1892        current = self.contents[0]
1893        while current is not stopNode:
1894            yield current
1895            current = current.next_element
1896
1897    # CSS selector code
1898    def select_one(self, selector, namespaces=None, **kwargs):
1899        """Perform a CSS selection operation on the current element.
1900
1901        :param selector: A CSS selector.
1902
1903        :param namespaces: A dictionary mapping namespace prefixes
1904           used in the CSS selector to namespace URIs. By default,
1905           Beautiful Soup will use the prefixes it encountered while
1906           parsing the document.
1907
1908        :param kwargs: Keyword arguments to be passed into SoupSieve's
1909           soupsieve.select() method.
1910
1911        :return: A Tag.
1912        :rtype: bs4.element.Tag
1913        """
1914        value = self.select(selector, namespaces, 1, **kwargs)
1915        if value:
1916            return value[0]
1917        return None
1918
1919    def select(self, selector, namespaces=None, limit=None, **kwargs):
1920        """Perform a CSS selection operation on the current element.
1921
1922        This uses the SoupSieve library.
1923
1924        :param selector: A string containing a CSS selector.
1925
1926        :param namespaces: A dictionary mapping namespace prefixes
1927           used in the CSS selector to namespace URIs. By default,
1928           Beautiful Soup will use the prefixes it encountered while
1929           parsing the document.
1930
1931        :param limit: After finding this number of results, stop looking.
1932
1933        :param kwargs: Keyword arguments to be passed into SoupSieve's
1934           soupsieve.select() method.
1935
1936        :return: A ResultSet of Tags.
1937        :rtype: bs4.element.ResultSet
1938        """
1939        if namespaces is None:
1940            namespaces = self._namespaces
1941
1942        if limit is None:
1943            limit = 0
1944        if soupsieve is None:
1945            raise NotImplementedError(
1946                "Cannot execute CSS selectors because the soupsieve package is not installed."
1947            )
1948
1949        results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
1950
1951        # We do this because it's more consistent and because
1952        # ResultSet.__getattr__ has a helpful error message.
1953        return ResultSet(None, results)
1954
1955    # Old names for backwards compatibility
1956    def childGenerator(self):
1957        """Deprecated generator."""
1958        return self.children
1959
1960    def recursiveChildGenerator(self):
1961        """Deprecated generator."""
1962        return self.descendants
1963
1964    def has_key(self, key):
1965        """Deprecated method. This was kind of misleading because has_key()
1966        (attributes) was different from __in__ (contents).
1967
1968        has_key() is gone in Python 3, anyway.
1969        """
1970        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
1971                key))
1972        return self.has_attr(key)
1973
1974# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The key
            'class_' is treated as a filter on the 'class' attribute.
        """
        # NOTE: the shared `attrs={}` default is never mutated below; it
        # is copied before being updated and is otherwise only read.
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Merge into a copy so the caller's dict is untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into the form _matches() works with.

        :param value: A filter value as supplied by the caller.
        :return: `value` itself if it is a Unicode string, a callable,
            an object with a `match` attribute (such as a compiled
            regular expression), a boolean or None; the decoded text
            for a bytestring; a list of normalized items for any other
            iterable; otherwise `str(value)`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter rendered as a string when one is set
            (truthy); otherwise the name and attribute filters joined
            by '|'.
        """
        if self.text:
            # The text filter may be a compiled regular expression, a
            # list, a callable or True, but __str__ must return an
            # actual string -- so convert explicitly.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also serves as the
            attribute source and `markup_attrs` is ignored).
        :param markup_attrs: The attributes as found in some markup:
            either an object with a `get` method (such as a dictionary)
            or an iterable of (key, value) pairs.

        :return: A truthy value if the prospective tag would match this
            SoupStrainer -- the Tag that was passed in, the tag name,
            or whatever a callable name filter returned; a falsy value
            (None or False) otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is given the raw name and attributes,
        # but only when we were not handed an actual Tag.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        # Build the attribute lookup lazily, only when
                        # there is at least one attribute filter.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # A text filter, when present, must additionally match the
        # `string` attribute of whatever was found.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None when nothing matched.
        :raise Exception: If `markup` is not an iterable, a Tag or a
            string.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Test one piece of markup against one search criterion.

        :param markup: What is being tested: a Tag, a string, None, or
            a list/tuple of values (a multi-valued attribute).
        :param match_against: The criterion: True, a callable, a
            string, an object with a `search` method (such as a
            compiled regular expression), or an iterable of criteria.
        :param already_tried: The set of criteria already visited while
            working through an iterable `match_against`; used
            internally by recursive calls.
        :return: A truthy value on a match and a falsy one otherwise.
            This is not always a bool: the result of a callable or of
            a regular expression search is returned as-is.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            else:
                return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
2237
2238
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer; stored as `self.source`.
        :param result: An iterable of PageElements used to populate
            the list.
        """
        list.__init__(self, result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        :param key: The name of the attribute that was not found.
        :raise AttributeError: Always.
        """
        explanation = (
            "ResultSet object has no attribute '%s'. "
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        )
        raise AttributeError(explanation % key)
2256