1try:
2    from collections.abc import Callable # Python 3.6
3except ImportError as e:
4    from collections import Callable
5import sys
6from collections import OrderedDict
7
8from pdb import set_trace
9import re
10import warnings
11from sigil_bs4.dammit import EntitySubstitution
12
# Encoding used by NavigableString.__new__ to decode a value that is not
# already a str.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more whitespace characters.  Written as a raw
# string so that ``\s`` reaches the regex engine as-is instead of relying on
# Python passing an unrecognised string escape through unchanged (which
# triggers a warning on current interpreters).
whitespace_re = re.compile(r"\s+")
16
# Tag-name classification tables.
# NOTE(review): none of these tuples is referenced in the part of the file
# reviewed here; the one-line descriptions below are inferred from the
# constant names only -- confirm against the serializer that consumes them.

# Presumably inline-level tags that should not force a line break.
NON_BREAKING_INLINE_TAGS = ("a","abbr","acronym","b","bdo","big","br",
    "button","cite","code","del","dfn","em","font","i","image","img",
    "input","ins","kbd","label","map","mark", "nobr","object","q",
    "ruby","rt","s","samp","select","small","span","strike","strong",
    "sub","sup","textarea","tt","u","var","wbr","mbp:nu")

# Presumably tags whose text content must keep its whitespace untouched.
PRESERVE_WHITESPACE_TAGS = ("code","pre","textarea","script","style")

# Presumably tags that never take content or a closing tag.
VOID_TAGS = ("area","base","basefont","bgsound","br","col","command",
    "embed","event-source","frame","hr","img","input","keygen",
    "link","meta","param","source","spacer","track","wbr",
    "mbp:pagebreak")

# Presumably tags whose contents skip entity substitution; same members as
# HTMLAwareEntitySubstitution.cdata_containing_tags.
NO_ENTITY_SUB_TAGS = ("script", "style")

# Presumably tags that get dedicated treatment during output.
SPECIAL_HANDLING_TAGS = ("html", "body")

# Presumably block/container tags that give a document its structure.
STRUCTURAL_TAGS = ("article","aside","blockquote","body","canvas",
    "colgroup","div","dl","figure","footer","head","header","hr","html",
    "ol","section","table","tbody","tfoot","thead","td","th","tr","ul")

# Presumably further (non-inline) tags that directly hold text.
OTHER_TEXTHOLDING_TAGS = ("address","caption","dd","div","dt","h1","h2",
    "h3","h4","h5","h6","legend","li","option","p","td","th","title")

# Presumably container tags found in ebook XML files (the names match OPF
# and NCX element names) -- TODO confirm.
EBOOK_XML_PARENT_TAGS = ("package","metadata","manifest","spine","guide","ncx",
                         "head","doctitle","docauthor","navmap", "navpoint",
                          "navlabel", "pagelist", "pagetarget")
44
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that forwards both reads and writes to the
    attribute named ``attr`` on the same instance, so an old attribute
    spelling keeps working next to the new one.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; it must
        # accept it and pass it on, otherwise every assignment through
        # the alias raises TypeError.
        setattr(self, attr, value)
    return alias
55
56
class NamespacedAttribute(str):
    """An attribute name that remembers its prefix, local name and namespace.

    The string value is ``prefix`` alone when ``name`` is None, ``name``
    alone when ``prefix`` is None, and ``prefix:name`` otherwise.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
72
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to ``str`` itself; it serves as the common
    parent of the attribute-value classes in this module that override
    ``encode()`` to substitute a character-set name.
    """
75
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these objects
    as the value of the 'charset' attribute.  The text seen at parse time
    is kept in ``original_value``.
    """

    def __new__(cls, original_value):
        instance = super().__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return ``encoding`` itself: the whole value is the charset name."""
        return encoding
90
91
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects,
    provided the value actually contains a ``charset=`` declaration.
    """

    # group(1) is everything up to and including "charset="; group(3) is the
    # declared charset name.  A raw string keeps ``\s`` from being read as
    # an (invalid) string escape.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Build the stand-in, or a plain ``str`` when no charset is present."""
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset name replaced.

        Every ``charset=...`` occurrence is rewritten to name ``encoding``;
        the rest of the value is left as it was.
        """
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
117
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that leaves <script> and <style> contents alone.

    A NavigableString whose direct parent is named in
    ``cdata_containing_tags`` is returned unchanged; every other input is
    passed to the underlying EntitySubstitution function.
    """

    cdata_containing_tags = {"script", "style"}

    preformatted_tags = {"pre"}

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return ``ns`` untouched inside a CDATA-style tag, else ``f(ns)``."""
        shielded = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags
        )
        return ns if shielded else f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """HTML entity substitution, skipped for script/style text."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """XML entity substitution, skipped for script/style text."""
        substitute = EntitySubstitution.substitute_xml_containing_entities
        return cls._substitute_if_appropriate(ns, substitute)
152
153class PageElement(object):
154    """Contains the navigational information for some part of the page
155    (either a tag or a piece of text)"""
156
    # There are four possible kinds of value for the "formatter" argument passed in
158    # to methods like encode() and prettify():
159    #
160    # "html" - All Unicode characters with corresponding HTML entities
161    #   are converted to those entities on output.
162    # "minimal" - Bare ampersands and angle brackets are converted to
163    #   XML entities: &amp; &lt; &gt;
164    # None - The null formatter. Unicode characters are never
165    #   converted to entities.  This is not recommended, but it's
166    #   faster than "minimal".
167    # A function - This function will be called on every string that
168    #  needs to undergo entity substitution.
169    #
170
171    # In an HTML document, the default "html" and "minimal" functions
172    # will leave the contents of <script> and <style> tags alone. For
173    # an XML document, all tags will be given the same treatment.
174
    # Formatter name -> substitution function, consulted by
    # _formatter_for_name() when the element is not in an XML tree.  The
    # None key maps to None, which format_string() treats as "return the
    # string unchanged".
    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    # Same mapping for elements in an XML tree; these entries use the plain
    # EntitySubstitution functions rather than the HTML-aware wrappers.
    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml_containing_entities,
        None : None
        }
186
187    def format_string(self, s, formatter='minimal'):
188        """Format the given string using the given formatter."""
189        if not isinstance(formatter, Callable):
190            formatter = self._formatter_for_name(formatter)
191        if formatter is None:
192            output = s
193        else:
194            output = formatter(s)
195        return output
196
197    @property
198    def _is_xml(self):
199        """Is this element part of an XML tree or an HTML tree?
200
201        This is used when mapping a formatter name ("minimal") to an
202        appropriate function (one that performs entity-substitution on
203        the contents of <script> and <style> tags, or not). It's
204        inefficient, but it should be called very rarely.
205        """
206        if self.parent is None:
207            # This is the top-level object. It should have .is_xml set
208            # from tree creation. If not, take a guess--BS is usually
209            # used on HTML markup.
210            return getattr(self, 'is_xml', False)
211        return self.parent._is_xml
212
213    def _formatter_for_name(self, name):
214        "Look up a formatter function based on its name and the tree."
215        if self._is_xml:
216            return self.XML_FORMATTERS.get(
217                name, EntitySubstitution.substitute_xml)
218        else:
219            return self.HTML_FORMATTERS.get(
220                name, HTMLAwareEntitySubstitution.substitute_xml)
221
222    def setup(self, parent=None, previous_element=None, next_element=None,
223              previous_sibling=None, next_sibling=None):
224        """Sets up the initial relations between this element and
225        other elements."""
226        self.parent = parent
227
228        self.previous_element = previous_element
229        if previous_element is not None:
230            self.previous_element.next_element = self
231
232        self.next_element = next_element
233        if self.next_element:
234            self.next_element.previous_element = self
235
236        self.next_sibling = next_sibling
237        if self.next_sibling:
238            self.next_sibling.previous_sibling = self
239
240        if (not previous_sibling
241            and self.parent is not None and self.parent.contents):
242            previous_sibling = self.parent.contents[-1]
243
244        self.previous_sibling = previous_sibling
245        if previous_sibling:
246            self.previous_sibling.next_sibling = self
247
    # camelCase spellings of the sibling links, built with _alias() so they
    # forward to the snake_case attributes.
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
250
251    def replace_with(self, replace_with):
252        if not self.parent:
253            raise ValueError(
254                "Cannot replace one element with another when the"
255                "element to be replaced is not part of a tree.")
256        if replace_with is self:
257            return
258        if replace_with is self.parent:
259            raise ValueError("Cannot replace a Tag with its parent.")
260        old_parent = self.parent
261        my_index = self.parent.index(self)
262        self.extract()
263        old_parent.insert(my_index, replace_with)
264        return self
265    replaceWith = replace_with  # BS3
266
267    def unwrap(self):
268        my_parent = self.parent
269        if not self.parent:
270            raise ValueError(
271                "Cannot replace an element with its contents when that"
272                "element is not part of a tree.")
273        my_index = self.parent.index(self)
274        self.extract()
275        for child in reversed(self.contents[:]):
276            my_parent.insert(my_index, child)
277        return self
278    replace_with_children = unwrap
279    replaceWithChildren = unwrap  # BS3
280
281    def wrap(self, wrap_inside):
282        me = self.replace_with(wrap_inside)
283        wrap_inside.append(me)
284        return wrap_inside
285
286    def extract(self):
287        """Destructively rips this element out of the tree."""
288        if self.parent is not None:
289            del self.parent.contents[self.parent.index(self)]
290
291        #Find the two elements that would be next to each other if
292        #this element (and any children) hadn't been parsed. Connect
293        #the two.
294        last_child = self._last_descendant()
295        next_element = last_child.next_element
296
297        if (self.previous_element is not None and
298            self.previous_element != next_element):
299            self.previous_element.next_element = next_element
300        if next_element is not None and next_element != self.previous_element:
301            next_element.previous_element = self.previous_element
302        self.previous_element = None
303        last_child.next_element = None
304
305        self.parent = None
306        if (self.previous_sibling is not None
307            and self.previous_sibling != self.next_sibling):
308            self.previous_sibling.next_sibling = self.next_sibling
309        if (self.next_sibling is not None
310            and self.next_sibling != self.previous_sibling):
311            self.next_sibling.previous_sibling = self.previous_sibling
312        self.previous_sibling = self.next_sibling = None
313        return self
314
315    def _last_descendant(self, is_initialized=True, accept_self=True):
316        "Finds the last element beneath this object to be parsed."
317        if is_initialized and self.next_sibling:
318            last_child = self.next_sibling.previous_element
319        else:
320            last_child = self
321            while isinstance(last_child, Tag) and last_child.contents:
322                last_child = last_child.contents[-1]
323        if not accept_self and last_child == self:
324            last_child = None
325        return last_child
326    # BS3: Not part of the API!
327    _lastRecursiveChild = _last_descendant
328
    def insert(self, position, new_child):
        """Insert ``new_child`` into ``self.contents`` at index ``position``.

        A plain ``str`` is first wrapped in a NavigableString.  A child that
        already has a parent is extracted from it first.  The parent,
        sibling and element links of ``new_child`` and of its new
        neighbours are then rewritten, and finally the child is stored in
        ``self.contents``.

        ``position`` is clamped to ``len(self.contents)``.

        Raises ValueError when ``new_child`` is this object itself.
        """
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            # Detach from the current parent (whichever object that is)
            # before relinking.
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: no previous sibling, and the element that
            # precedes it is this container itself.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            # Otherwise it follows the child before it as a sibling, and
            # follows that child's last descendant as an element.
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        # The forward element link leaves from the end of new_child's
        # subtree, not from new_child itself.
        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: there is no next sibling.
            new_child.next_sibling = None

            # Walk up from this container to find the nearest ancestor (or
            # this container) that has a next sibling; that sibling is the
            # element that follows the new child's subtree.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # Inserting before an existing child: that child becomes both
            # the next sibling and the next element.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        # Complete the backward half of the forward element link.
        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
392
393    def append(self, tag):
394        """Appends the given tag to the contents of this tag."""
395        self.insert(len(self.contents), tag)
396
397    def insert_before(self, predecessor):
398        """Makes the given element the immediate predecessor of this one.
399
400        The two elements will have the same parent, and the given element
401        will be immediately before this one.
402        """
403        if self is predecessor:
404            raise ValueError("Can't insert an element before itself.")
405        parent = self.parent
406        if parent is None:
407            raise ValueError(
408                "Element has no parent, so 'before' has no meaning.")
409        # Extract first so that the index won't be screwed up if they
410        # are siblings.
411        if isinstance(predecessor, PageElement):
412            predecessor.extract()
413        index = parent.index(self)
414        parent.insert(index, predecessor)
415
416    def insert_after(self, successor):
417        """Makes the given element the immediate successor of this one.
418
419        The two elements will have the same parent, and the given element
420        will be immediately after this one.
421        """
422        if self is successor:
423            raise ValueError("Can't insert an element after itself.")
424        parent = self.parent
425        if parent is None:
426            raise ValueError(
427                "Element has no parent, so 'after' has no meaning.")
428        # Extract first so that the index won't be screwed up if they
429        # are siblings.
430        if isinstance(successor, PageElement):
431            successor.extract()
432        index = parent.index(self)
433        parent.insert(index+1, successor)
434
435    def find_next(self, name=None, attrs=OrderedDict(), text=None, **kwargs):
436        """Returns the first item that matches the given criteria and
437        appears after this Tag in the document."""
438        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
439    findNext = find_next  # BS3
440
441    def find_all_next(self, name=None, attrs=OrderedDict(), text=None, limit=None,
442                    **kwargs):
443        """Returns all items that match the given criteria and appear
444        after this Tag in the document."""
445        return self._find_all(name, attrs, text, limit, self.next_elements,
446                             **kwargs)
447    findAllNext = find_all_next  # BS3
448
449    def find_next_sibling(self, name=None, attrs=OrderedDict(), text=None, **kwargs):
450        """Returns the closest sibling to this Tag that matches the
451        given criteria and appears after this Tag in the document."""
452        return self._find_one(self.find_next_siblings, name, attrs, text,
453                             **kwargs)
454    findNextSibling = find_next_sibling  # BS3
455
456    def find_next_siblings(self, name=None, attrs=OrderedDict(), text=None, limit=None,
457                           **kwargs):
458        """Returns the siblings of this Tag that match the given
459        criteria and appear after this Tag in the document."""
460        return self._find_all(name, attrs, text, limit,
461                              self.next_siblings, **kwargs)
462    findNextSiblings = find_next_siblings   # BS3
463    fetchNextSiblings = find_next_siblings  # BS2
464
465    def find_previous(self, name=None, attrs=OrderedDict(), text=None, **kwargs):
466        """Returns the first item that matches the given criteria and
467        appears before this Tag in the document."""
468        return self._find_one(
469            self.find_all_previous, name, attrs, text, **kwargs)
470    findPrevious = find_previous  # BS3
471
472    def find_all_previous(self, name=None, attrs=OrderedDict(), text=None, limit=None,
473                        **kwargs):
474        """Returns all items that match the given criteria and appear
475        before this Tag in the document."""
476        return self._find_all(name, attrs, text, limit, self.previous_elements,
477                           **kwargs)
478    findAllPrevious = find_all_previous  # BS3
479    fetchPrevious = find_all_previous    # BS2
480
481    def find_previous_sibling(self, name=None, attrs=OrderedDict(), text=None, **kwargs):
482        """Returns the closest sibling to this Tag that matches the
483        given criteria and appears before this Tag in the document."""
484        return self._find_one(self.find_previous_siblings, name, attrs, text,
485                             **kwargs)
486    findPreviousSibling = find_previous_sibling  # BS3
487
488    def find_previous_siblings(self, name=None, attrs=OrderedDict(), text=None,
489                               limit=None, **kwargs):
490        """Returns the siblings of this Tag that match the given
491        criteria and appear before this Tag in the document."""
492        return self._find_all(name, attrs, text, limit,
493                              self.previous_siblings, **kwargs)
494    findPreviousSiblings = find_previous_siblings   # BS3
495    fetchPreviousSiblings = find_previous_siblings  # BS2
496
497    def find_parent(self, name=None, attrs=OrderedDict(), **kwargs):
498        """Returns the closest parent of this Tag that matches the given
499        criteria."""
500        # NOTE: We can't use _find_one because findParents takes a different
501        # set of arguments.
502        r = None
503        l = self.find_parents(name, attrs, 1, **kwargs)
504        if l:
505            r = l[0]
506        return r
507    findParent = find_parent  # BS3
508
509    def find_parents(self, name=None, attrs=OrderedDict(), limit=None, **kwargs):
510        """Returns the parents of this Tag that match the given
511        criteria."""
512
513        return self._find_all(name, attrs, None, limit, self.parents,
514                             **kwargs)
515    findParents = find_parents   # BS3
516    fetchParents = find_parents  # BS2
517
518    @property
519    def next(self):
520        return self.next_element
521
522    @property
523    def previous(self):
524        return self.previous_element
525
526    #These methods do the real heavy lifting.
527
528    def _find_one(self, method, name, attrs, text, **kwargs):
529        r = None
530        l = method(name, attrs, text, 1, **kwargs)
531        if l:
532            r = l[0]
533        return r
534
535    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
536        "Iterates over a generator looking for things that match."
537
538        if text is None and 'string' in kwargs:
539            text = kwargs['string']
540            del kwargs['string']
541
542        if isinstance(name, SoupStrainer):
543            strainer = name
544        else:
545            strainer = SoupStrainer(name, attrs, text, **kwargs)
546
547        if text is None and not limit and not attrs and not kwargs:
548            if name is True or name is None:
549                # Optimization to find all tags.
550                result = (element for element in generator
551                          if isinstance(element, Tag))
552                return ResultSet(strainer, result)
553            elif isinstance(name, str):
554                # Optimization to find all tags with a given name.
555                result = (element for element in generator
556                          if isinstance(element, Tag)
557                            and element.name == name)
558                return ResultSet(strainer, result)
559        results = ResultSet(strainer)
560        while True:
561            try:
562                i = next(generator)
563            except StopIteration:
564                break
565            if i:
566                found = strainer.search(i)
567                if found:
568                    results.append(found)
569                    if limit and len(results) >= limit:
570                        break
571        return results
572
573    #These generators can be used to navigate starting from both
574    #NavigableStrings and Tags.
575    @property
576    def next_elements(self):
577        i = self.next_element
578        while i is not None:
579            yield i
580            i = i.next_element
581
582    @property
583    def next_siblings(self):
584        i = self.next_sibling
585        while i is not None:
586            yield i
587            i = i.next_sibling
588
589    @property
590    def previous_elements(self):
591        i = self.previous_element
592        while i is not None:
593            yield i
594            i = i.previous_element
595
596    @property
597    def previous_siblings(self):
598        i = self.previous_sibling
599        while i is not None:
600            yield i
601            i = i.previous_sibling
602
603    @property
604    def parents(self):
605        i = self.parent
606        while i is not None:
607            yield i
608            i = i.parent
609
610    # Methods for supporting CSS selectors.
611
    # Matches a whole string that is a bare tag name: one alphanumeric
    # character followed by any number of alphanumerics, '-', '.', ':' or '_'.
    tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

    # The diagram below shows the overall shape of an attribute selector.
    # The pattern actually compiled further down differs from it slightly:
    # it uses named groups (tag, attribute, operator, value), the tag part
    # is optional, and attribute names may contain '-'.
    #
    # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---------------------------/  \---/\-------------/    \-------/
    #     |                              |         |               |
    #     |                              |         |           The value
    #     |                              |    ~,|,^,$,* or =
    #     |                           Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )
625
626    def _attr_value_as_string(self, value, default=None):
627        """Force an attribute value into a string representation.
628
629        A multi-valued attribute will be converted into a
630        space-separated stirng.
631        """
632        value = self.get(value, default)
633        if isinstance(value, list) or isinstance(value, tuple):
634            value =" ".join(value)
635        return value
636
637    def _tag_name_matches_and(self, function, tag_name):
638        if not tag_name:
639            return function
640        else:
641            def _match(tag):
642                return tag.name == tag_name and function(tag)
643            return _match
644
645    def _attribute_checker(self, operator, attribute, value=''):
646        """Create a function that performs a CSS selector operation.
647
648        Takes an operator, attribute and optional value. Returns a
649        function that will return True for elements that match that
650        combination.
651        """
652        if operator == '=':
653            # string representation of `attribute` is equal to `value`
654            return lambda el: el._attr_value_as_string(attribute) == value
655        elif operator == '~':
656            # space-separated list representation of `attribute`
657            # contains `value`
658            def _includes_value(element):
659                attribute_value = element.get(attribute, [])
660                if not isinstance(attribute_value, list):
661                    attribute_value = attribute_value.split()
662                return value in attribute_value
663            return _includes_value
664        elif operator == '^':
665            # string representation of `attribute` starts with `value`
666            return lambda el: el._attr_value_as_string(
667                attribute, '').startswith(value)
668        elif operator == '$':
669            # string represenation of `attribute` ends with `value`
670            return lambda el: el._attr_value_as_string(
671                attribute, '').endswith(value)
672        elif operator == '*':
673            # string representation of `attribute` contains `value`
674            return lambda el: value in el._attr_value_as_string(attribute, '')
675        elif operator == '|':
676            # string representation of `attribute` is either exactly
677            # `value` or starts with `value` and then a dash.
678            def _is_or_starts_with_dash(element):
679                attribute_value = element._attr_value_as_string(attribute, '')
680                return (attribute_value == value or attribute_value.startswith(
681                        value + '-'))
682            return _is_or_starts_with_dash
683        else:
684            return lambda el: el.has_attr(attribute)
685
686    # Old non-property versions of the generators, for backwards
687    # compatibility with BS3.
688    def nextGenerator(self):
689        return self.next_elements
690
691    def nextSiblingGenerator(self):
692        return self.next_siblings
693
694    def previousGenerator(self):
695        return self.previous_elements
696
697    def previousSiblingGenerator(self):
698        return self.previous_siblings
699
700    def parentGenerator(self):
701        return self.parents
702
703
class NavigableString(str, PageElement):
    """A piece of text that is both a ``str`` and a navigable PageElement."""

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with the
        string in DEFAULT_OUTPUT_ENCODING.  A value that is not already a
        str is therefore decoded with that encoding, so that non-ASCII
        characters are handled.
        """
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string of the same class and text that is not
        connected to the parse tree."""
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """Make ``text.string`` return the text itself.

        Kept for backwards compatibility for Navigable*String; for CData*
        it gives the string without the CData wrapper.  Every other
        missing attribute raises AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted text wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None for a string node."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
755
class PreformattedString(NavigableString):
    """A NavigableString that is output exactly as stored.

    The text is still handed to the formatter, so that any side effects
    are triggered, but whatever the formatter returns is discarded.
    """

    def output_ready(self, formatter="minimal"):
        """Call the formatter on the text, then return the unformatted
        text between PREFIX and SUFFIX."""
        self.format_string(self, formatter)
        opened = self.PREFIX + self
        return opened + self.SUFFIX
768
class CData(PreformattedString):
    """A preformatted string output between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
773
class ProcessingInstruction(PreformattedString):
    """A preformatted string output between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
778
class Comment(PreformattedString):
    """A preformatted string output between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
783
784
class Declaration(PreformattedString):
    """A preformatted string output between '<!' and '!>'."""
    # NOTE(review): the '!>' terminator looks unusual -- markup declarations
    # normally end with a bare '>'.  Confirm the intended output before
    # relying on it.
    PREFIX = '<!'
    SUFFIX = '!>'
788
789
class Doctype(PreformattedString):
    """A preformatted string output between '<!DOCTYPE ' and '>' plus a
    newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system ids.

        A public id adds ``PUBLIC "<pub_id>"``, followed on a new line by
        the quoted system id when there is one.  A system id without a
        public id adds ``SYSTEM "<system_id>"``.
        """
        value = name or ''
        if pub_id is None:
            if system_id is not None:
                value += ' SYSTEM "%s"' % system_id
        else:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += '\n "%s"' % system_id

        return Doctype(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
806
807
808class Tag(PageElement):
809
810    """Represents a found HTML tag with its attributes and contents."""
811
812    def __init__(self, parser=None, builder=None, name=None, namespace=None,
813                 prefix=None, attrs=None, parent=None, previous=None):
814        "Basic constructor."
815
816        if parser is None:
817            self.parser_class = None
818        else:
819            # We don't actually store the parser object: that lets extracted
820            # chunks be garbage-collected.
821            self.parser_class = parser.__class__
822        if name is None:
823            raise ValueError("No value provided for new tag's name.")
824        self.name = name
825        self.namespace = namespace
826        self.prefix = prefix
827        if attrs is None:
828            attrs = OrderedDict()
829        elif attrs:
830            if builder is not None and builder.cdata_list_attributes:
831                attrs = builder._replace_cdata_list_attribute_values(
832                    self.name, attrs)
833            else:
834                attrs = OrderedDict(attrs)
835        else:
836            attrs = OrderedDict(attrs)
837        self.attrs = attrs
838        self.contents = []
839        self.setup(parent, previous)
840        self.hidden = False
841
842        # Set up any substitutions, such as the charset in a META tag.
843        if builder is not None:
844            builder.set_up_substitutions(self)
845            self.can_be_empty_element = builder.can_be_empty_element(name)
846        else:
847            self.can_be_empty_element = False
848
    parserClass = _alias("parser_class")  # BS3 compatibility: camelCase alias for parser_class
850
851    def __copy__(self):
852        """A copy of a Tag is a new Tag, unconnected to the parse tree.
853        Its contents are a copy of the old Tag's contents.
854        """
855        clone = type(self)(None, self.builder, self.name, self.namespace,
856                           self.nsprefix, self.attrs)
857        for attr in ('can_be_empty_element', 'hidden'):
858            setattr(clone, attr, getattr(self, attr))
859        for child in self.contents:
860            clone.append(child.__copy__())
861        return clone
862
863    @property
864    def is_empty_element(self):
865        """Is this tag an empty-element tag? (aka a self-closing tag)
866
867        A tag that has contents is never an empty-element tag.
868
869        A tag that has no contents may or may not be an empty-element
870        tag. It depends on the builder used to create the tag. If the
871        builder has a designated list of empty-element tags, then only
872        a tag whose name shows up in that list is considered an
873        empty-element tag.
874
875        If the builder has no designated list of empty-element tags,
876        then any tag with no contents is an empty-element tag.
877        """
878        return len(self.contents) == 0 and self.can_be_empty_element
    isSelfClosing = is_empty_element  # BS3 compatibility: camelCase alias for is_empty_element
880
881    @property
882    def is_non_breaking_inline_tag(self):
883        # used only for pretty printing of html to prevent returns after tags
884        # from introducing spaces where none are desired
885        return self.name in NON_BREAKING_INLINE_TAGS and not self._is_xml
886
887    @property
888    def string(self):
889        """Convenience property to get the single string within this tag.
890
891        :Return: If this tag has a single string child, return value
892         is that string. If this tag has no children, or more than one
893         child, return value is None. If this tag has one child tag,
894         return value is the 'string' attribute of the child tag,
895         recursively.
896        """
897        if len(self.contents) != 1:
898            return None
899        child = self.contents[0]
900        if isinstance(child, NavigableString):
901            return child
902        return child.string
903
904    @string.setter
905    def string(self, string):
906        self.clear()
907        self.append(string.__class__(string))
908
909    def _all_strings(self, strip=False, types=(NavigableString, CData)):
910        """Yield all strings of certain classes, possibly stripping them.
911
912        By default, yields only NavigableString and CData objects. So
913        no comments, processing instructions, etc.
914        """
915        for descendant in self.descendants:
916            if (
917                (types is None and not isinstance(descendant, NavigableString))
918                or
919                (types is not None and type(descendant) not in types)):
920                continue
921            if strip:
922                descendant = descendant.strip()
923                if len(descendant) == 0:
924                    continue
925            yield descendant
926
    # Read-only property: reading tag.strings calls _all_strings() with
    # its default arguments.
    strings = property(_all_strings)
928
929    @property
930    def stripped_strings(self):
931        for string in self._all_strings(True):
932            yield string
933
934    def get_text(self, separator="", strip=False,
935                 types=(NavigableString, CData)):
936        """
937        Get all child strings, concatenated using the given separator.
938        """
939        return separator.join([s for s in self._all_strings(
940                    strip, types=types)])
    # Aliases: getText is the camelCase spelling of get_text, and .text
    # exposes get_text() (with its default arguments) as a read-only
    # property.
    getText = get_text
    text = property(get_text)
943
944    def decompose(self):
945        """Recursively destroys the contents of this tree."""
946        self.extract()
947        i = self
948        while i is not None:
949            next = i.next_element
950            i.__dict__.clear()
951            i.contents = []
952            i = next
953
954    def clear(self, decompose=False):
955        """
956        Extract all children. If decompose is True, decompose instead.
957        """
958        if decompose:
959            for element in self.contents[:]:
960                if isinstance(element, Tag):
961                    element.decompose()
962                else:
963                    element.extract()
964        else:
965            for element in self.contents[:]:
966                element.extract()
967
968    def index(self, element):
969        """
970        Find the index of a child by identity, not value. Avoids issues with
971        tag.contents.index(element) getting the index of equal elements.
972        """
973        for i, child in enumerate(self.contents):
974            if child is element:
975                return i
976        raise ValueError("Tag.index: element not in tag")
977
    def get(self, key, default=None):
        """Return the value of the tag's 'key' attribute, or 'default'
        (None unless given) when the tag has no such attribute."""
        return self.attrs.get(key, default)
983
    def has_attr(self, key):
        """Return True if the tag has an attribute named 'key'."""
        return key in self.attrs
986
987    def __hash__(self):
988        return str(self).__hash__()
989
    def __getitem__(self, key):
        """tag[key] returns the value of the tag's 'key' attribute.

        A missing attribute raises whatever the underlying attribute
        mapping raises for a missing key."""
        return self.attrs[key]
994
    def __iter__(self):
        "Iterating over a tag iterates over its direct children (its contents list)."
        return iter(self.contents)
998
    def __len__(self):
        "The length of a tag is the number of its direct children."
        return len(self.contents)
1002
    def __contains__(self, x):
        "``x in tag`` tests membership in the tag's list of direct children."
        return x in self.contents
1005
    def __bool__(self):
        """A tag is always truthy, even when it has no contents.

        Without this, truth testing would fall back to __len__ and an
        empty tag would be falsy."""
        return True
1009
    def __nonzero__(self):
        "Python 2 spelling of __bool__: a tag is always truthy."
        return True
1013
    def __setitem__(self, key, value):
        """tag[key] = value sets the tag's 'key' attribute, adding it
        when absent and overwriting it otherwise."""
        self.attrs[key] = value
1018
1019    def __delitem__(self, key):
1020        "Deleting tag[key] deletes all 'key' attributes for the tag."
1021        self.attrs.pop(key, None)
1022
    def __call__(self, *args, **kwargs):
        """Calling a tag is shorthand for calling its find_all() method
        with the same arguments. E.g. tag('a') is tag.find_all('a'),
        i.e. all the A tags found within this tag."""
        return self.find_all(*args, **kwargs)
1028
1029    def __getattr__(self, tag):
1030        #print "Getattr %s.%s" % (self.__class__, tag)
1031        if len(tag) > 3 and tag.endswith('Tag'):
1032            # BS3: soup.aTag -> "soup.find("a")
1033            tag_name = tag[:-3]
1034            warnings.warn(
1035                '.%sTag is deprecated, use .find("%s") instead.' % (
1036                    tag_name, tag_name))
1037            return self.find(tag_name)
1038        # We special case contents to avoid recursion.
1039        elif not tag.startswith("__") and not tag=="contents":
1040            return self.find(tag)
1041        raise AttributeError(
1042            "'%s' object has no attribute '%s'" % (self.__class__, tag))
1043
1044    def __eq__(self, other):
1045        """Returns true iff this tag has the same name, the same attributes,
1046        and the same contents (recursively) as the given tag."""
1047        if self is other:
1048            return True
1049        if (not hasattr(other, 'name') or
1050            not hasattr(other, 'attrs') or
1051            not hasattr(other, 'contents') or
1052            self.name != other.name or
1053            self.attrs != other.attrs or
1054            len(self) != len(other)):
1055            return False
1056        for i, my_child in enumerate(self.contents):
1057            if my_child != other.contents[i]:
1058                return False
1059        return True
1060
1061    def __ne__(self, other):
1062        """Returns true iff this tag is not identical to the other tag,
1063        as defined in __eq__."""
1064        return not self == other
1065
    def __repr__(self, encoding="unicode-escape"):
        """Renders this tag as a string, exactly as decode() does with
        its default arguments.

        The ``encoding`` parameter is accepted but not used."""
        # "The return value must be a string object", i.e. Unicode
        return self.decode()
1070
    def __str__(self):
        """Render this tag as markup, using decode() with its defaults."""
        return self.decode()
1073
1074    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1075               indent_level=None, formatter="minimal",
1076               errors="xmlcharrefreplace", indent_chars=" "):
1077        # Turn the data structure into Unicode, then encode the
1078        # Unicode.
1079        u = self.decode(indent_level, encoding, formatter, indent_chars)
1080        return u.encode(encoding, errors)
1081
1082    def _should_pretty_print(self, indent_level):
1083        """Should this tag be pretty-printed?"""
1084        return (
1085            indent_level is not None and
1086            ((self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1087              and self.name not in NON_BREAKING_INLINE_TAGS)
1088             or self._is_xml))
1089
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal", indent_chars=" "):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None renders the markup without added
           indentation or newlines. Otherwise the opening tag is
           indented by ``indent_level - 1`` copies of ``indent_chars``
           and, when this tag is pretty-printed, its contents are
           rendered with ``indent_level + 1``.

        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

        :param formatter: Either a callable, used as is, or a value
           that is resolved once through ``_formatter_for_name``.

        :param indent_chars: The string repeated to build indentation.
        """

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        if not isinstance(formatter, Callable):
            formatter = self._formatter_for_name(formatter)

        # Build one 'key="value"' fragment per attribute, in sorted key
        # order.
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # A None value is written as a bare attribute name.
                    decoded = key
                else:
                    if isinstance(val, list) or isinstance(val, tuple):
                        # Multi-valued attribute: join with spaces.
                        val = ' '.join(val)
                    elif not isinstance(val, str):
                        val = str(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        str(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)
        close = ''
        closeTag = ''

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        # An empty-element tag is written as <name/> with no closing tag;
        # every other tag gets an explicit closing tag.
        if self.is_empty_element:
            close = '/'
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        space = ''
        indent_space = ''
        if indent_level is not None:
            indent_space = (indent_chars * (indent_level - 1))
        if pretty_print:
            space = indent_space
            indent_contents = indent_level + 1
        else:
            # Children of a tag that is not pretty-printed are rendered
            # without any indentation.
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter, indent_chars)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            if indent_level is not None:
                # Even if this particular tag is not pretty-printed,
                # we should indent up to the start of the tag.
                s.append(indent_space)
            s.append('<%s%s%s%s>' % (
                    prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            # Make sure the closing tag starts on a line of its own.
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if indent_level is not None and closeTag and self.next_sibling:
                # Even if this particular tag is not pretty-printed,
                # we're now done with the tag, and we should add a
                # newline if appropriate.
                s.append("\n")
            s = ''.join(s)
        return s
1182
1183    def prettify(self, encoding=None, formatter="minimal", indent_chars=" "):
1184        if encoding is None:
1185            return self.decode(True, formatter=formatter, indent_chars=indent_chars)
1186        else:
1187            return self.encode(encoding, True, formatter=formatter, indent_chars=indent_chars)
1188
1189    def decode_contents(self, indent_level=None,
1190                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1191                       formatter="minimal", indent_chars=" "):
1192        """Renders the contents of this tag as a Unicode string.
1193
1194        :param indent_level: Each line of the rendering will be
1195           indented this many spaces.
1196
1197        :param eventual_encoding: The tag is destined to be
1198           encoded into this encoding. This method is _not_
1199           responsible for performing that encoding. This information
1200           is passed in so that it can be substituted in if the
1201           document contains a <META> tag that mentions the document's
1202           encoding.
1203
1204        :param formatter: The output formatter responsible for converting
1205           entities to Unicode characters.
1206        """
1207        # First off, turn a string formatter into a function. This
1208        # will stop the lookup from happening over and over again.
1209        if not isinstance(formatter, Callable):
1210            formatter = self._formatter_for_name(formatter)
1211
1212        pretty_print = (indent_level is not None)
1213        s = []
1214        for c in self:
1215            text = None
1216            if isinstance(c, NavigableString):
1217                text = c.output_ready(formatter)
1218            elif isinstance(c, Tag):
1219                s.append(c.decode(indent_level, eventual_encoding, formatter, indent_chars))
1220            if text and indent_level and not self.name == 'pre':
1221                text = text.strip()
1222            if text:
1223                if pretty_print and not self.name == 'pre':
1224                    s.append(indent_chars * (indent_level - 1))
1225                s.append(text)
1226                if pretty_print and not self.name == 'pre':
1227                    s.append("\n")
1228        return ''.join(s)
1229
    def decodexml(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal", indent_chars=" "):
        """Render this tag and its contents as indented XML text.

        Tags whose lower-cased name is in EBOOK_XML_PARENT_TAGS are laid
        out with their children on separate, deeper-indented lines.

        NOTE: rendering can modify the tree -- a tag that may be an
        empty element and whose only content is whitespace has its
        ``contents`` list replaced by an empty list.

        :param indent_level: The opening tag is indented by
           ``indent_level - 1`` copies of ``indent_chars``.
        :param eventual_encoding: Destination encoding; only used for
           charset substitution in attribute values, no encoding is
           performed here.
        :param formatter: A callable, or a value resolved once through
           ``_formatter_for_name``.
        :param indent_chars: The string repeated to build indentation.
        """

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        if not isinstance(formatter, Callable):
            formatter = self._formatter_for_name(formatter)

        is_xmlparent = self.name.lower() in EBOOK_XML_PARENT_TAGS
        # Build one 'key="value"' fragment per attribute, in sorted key
        # order.
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # A None value is written as a bare attribute name.
                    decoded = key
                else:
                    if isinstance(val, list) or isinstance(val, tuple):
                        val = ' '.join(val)
                    elif not isinstance(val, str):
                        val = str(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        str(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        # for pure xml, a self closing tag with only whitespace
        # "contents" should be treated as empty
        if self.can_be_empty_element:
            tagcontents = self.string
            if tagcontents is not None and len(tagcontents.strip()) == 0:
                # Side effect: the whitespace child is dropped from this
                # tag's contents list for good, not just for this render.
                self.contents = []

        close = ''
        closeTag = ''
        if self.is_empty_element:
            close = '/'
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

        indent_space = (indent_chars * (indent_level - 1))
        # Only parent-style tags and the hidden root push their children
        # one level deeper.
        indent_contents = indent_level
        if is_xmlparent or self.hidden:
            indent_contents = indent_level + 1

        contents = self.decodexml_contents(indent_contents, eventual_encoding, formatter, indent_chars)
        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            s.append(indent_space)
            s.append('<%s%s%s%s>' % (prefix, self.name, attribute_string, close))
            if is_xmlparent:
                s.append("\n")
            s.append(contents)
            # 'and' binds tighter than 'or': a newline is added either
            # when a parent-style tag has contents not already ending in
            # one, or whenever this tag is an empty element.
            if contents and contents[-1] != "\n" and is_xmlparent or self.is_empty_element:
                s.append("\n")
            if closeTag and is_xmlparent:
                s.append(indent_space)
            s.append(closeTag)
            if closeTag and self.next_sibling:
                s.append("\n")
            s = ''.join(s)
        return s
1306
1307    def decodexml_contents(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1308                        formatter="minimal", indent_chars=" "):
1309        """Renders the contents of this tag as a Unicode string.
1310        """
1311        # First off, turn a string formatter into a function. This
1312        # will stop the lookup from happening over and over again.
1313        if not isinstance(formatter, Callable):
1314            formatter = self._formatter_for_name(formatter)
1315
1316        is_xmlparent = self.name.lower() in EBOOK_XML_PARENT_TAGS
1317        s = []
1318        for c in self:
1319            text = None
1320            if isinstance(c, NavigableString):
1321                text = c.output_ready(formatter)
1322            elif isinstance(c, Tag):
1323                val = c.decodexml(indent_level, eventual_encoding, formatter, indent_chars)
1324                s.append(val)
1325            if text:
1326                text = text.strip()
1327            if text:
1328                if is_xmlparent and len(s) == 0:
1329                    s.append(indent_chars * (indent_level - 1))
1330                s.append(text)
1331        return ''.join(s)
1332
1333    def serialize_xhtml(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
1334        # First off, turn a string formatter into a function. This
1335        # will stop the lookup from happening over and over again.
1336        if not isinstance(formatter, Callable):
1337            formatter = self._formatter_for_name(formatter)
1338
1339        prefix = ''
1340        close = ''
1341        closeTag = ''
1342        attrs = []
1343        if self.attrs:
1344            for key, val in sorted(self.attrs.items()):
1345                if val is None:
1346                    ntext = key
1347                else:
1348                    if isinstance(val, list) or isinstance(val, tuple):
1349                        val = ' '.join(val)
1350                    elif not isinstance(val, str):
1351                        val = str(val)
1352                    elif (isinstance(val, AttributeValueWithCharsetSubstitution) and
1353                          eventual_encoding is not None):
1354                        val = val.encode(eventual_encoding)
1355                    text = self.format_string(val, formatter)
1356                    ntext = (str(key) + '=' + EntitySubstitution.quoted_attribute_value(text))
1357                attrs.append(ntext)
1358
1359        contents = self.serialize_xhtml_contents(eventual_encoding, formatter)
1360
1361        in_xml_ns = self.namespace != 'http://www.w3.org/1999/xhtml'
1362        testcontents = contents.strip()
1363
1364        if self.prefix:
1365            prefix = self.prefix + ":"
1366
1367        if self.name in VOID_TAGS or (in_xml_ns and testcontents==""):
1368            close = '/'
1369        else:
1370            closeTag = '</%s%s>' % (prefix, self.name)
1371
1372        # strip extraneous whitespace before the primary closing tag
1373        if self.name in SPECIAL_HANDLING_TAGS:
1374            contents = contents.strip()
1375            contents += "\n"
1376
1377        if self.hidden:
1378            # This is the 'document root' object.
1379            s = contents
1380        else:
1381            s = []
1382            attribute_string = ''
1383            if attrs:
1384                attribute_string = ' ' + ' '.join(attrs)
1385            s.append('<%s%s%s%s>' % (prefix, self.name, attribute_string, close))
1386            if self.name in SPECIAL_HANDLING_TAGS:
1387                s.append("\n")
1388            s.append(contents)
1389            s.append(closeTag)
1390            if self.name in SPECIAL_HANDLING_TAGS:
1391                s.append("\n")
1392            s = ''.join(s)
1393        return s
1394
1395    def serialize_xhtml_contents(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
1396
1397        # First off, turn a string formatter into a function. This
1398        # will stop the lookup from happening over and over again.
1399        if not isinstance(formatter, Callable):
1400            formatter = self._formatter_for_name(formatter)
1401
1402        s = []
1403        for c in self:
1404            text = None
1405            if isinstance(c, Comment):
1406                text = Comment(c).output_ready(formatter)
1407                s.append(text)
1408            elif isinstance(c, CData):
1409                text = CData(c).output_ready(formatter)
1410                s.append(text)
1411            elif isinstance(c, NavigableString):
1412                text = c.output_ready(formatter)
1413                s.append(text)
1414            elif isinstance(c, Tag):
1415                s.append(c.serialize_xhtml(eventual_encoding, formatter))
1416        return ''.join(s)
1417
1418    def prettyprint_xhtml(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1419               formatter="minimal", indent_chars=" "):
1420
1421        # First off, turn a string formatter into a function. This
1422        # will stop the lookup from happening over and over again.
1423        if not isinstance(formatter, Callable):
1424            formatter = self._formatter_for_name(formatter)
1425
1426        is_structural = self.name in STRUCTURAL_TAGS
1427        is_inline = self.name in NON_BREAKING_INLINE_TAGS
1428
1429        # build attribute string
1430        attribs = []
1431        atts = ""
1432        if self.attrs:
1433            for key, val in sorted(self.attrs.items()):
1434                if val is None:
1435                    decoded = key
1436                else:
1437                    if isinstance(val, list) or isinstance(val, tuple):
1438                        val = ' '.join(val)
1439                    elif not isinstance(val, str):
1440                        val = str(val)
1441                    elif (
1442                        isinstance(val, AttributeValueWithCharsetSubstitution)
1443                        and eventual_encoding is not None):
1444                        val = val.encode(eventual_encoding)
1445
1446                    text = self.format_string(val, formatter)
1447                    decoded = (
1448                        str(key) + '='
1449                        + EntitySubstitution.quoted_attribute_value(text))
1450                attribs.append(decoded)
1451            atts = " " + " ".join(attribs)
1452
1453
1454        # get tag content
1455        contents=""
1456        is_void_tag = self.name in VOID_TAGS
1457        if not is_void_tag:
1458            if is_structural:
1459                contents = self.prettyprint_xhtml_contents(indent_level+1, eventual_encoding, formatter, indent_chars)
1460            else:
1461                contents = self.prettyprint_xhtml_contents(indent_level, eventual_encoding, formatter, indent_chars)
1462
1463        if self.hidden:
1464            # This is the 'document root' object.
1465            return contents
1466
1467        in_xml_ns = self.namespace != 'http://www.w3.org/1999/xhtml'
1468        testcontents = contents.strip()
1469        single = self.name in VOID_TAGS or (in_xml_ns and testcontents == "")
1470
1471        prefix = ''
1472        if self.prefix:
1473            prefix = self.prefix + ":"
1474
1475        is_keepwhitespace = self.name in PRESERVE_WHITESPACE_TAGS
1476        if not is_keepwhitespace and not is_inline:
1477            contents = contents.rstrip()
1478
1479        indent_space = (indent_chars * (indent_level - 1))
1480
1481        # handle self-closed tags with no content first
1482        if single:
1483            selfclosetag = '<%s%s%s/>' % (prefix, self.name, atts)
1484            if is_inline:
1485                # always add newline after br tags when they are children of structural tags
1486                if (self.name == "br") and self.parent.name in STRUCTURAL_TAGS:
1487                    selfclosetag += "\n"
1488                return selfclosetag
1489            return indent_space + selfclosetag + "\n"
1490
1491        # handle the general case
1492        starttag = '<%s%s%s>' % (prefix, self.name, atts)
1493        closetag = '</%s%s>' % (prefix, self.name)
1494        results = ""
1495        if is_structural:
1496            results = indent_space + starttag
1497            if contents != "":
1498                results += "\n" + contents + "\n" + indent_space
1499            results += closetag + "\n"
1500        elif is_inline:
1501            results = starttag
1502            results += contents
1503            results += closetag
1504        else:
1505            results = indent_space + starttag
1506            if not is_keepwhitespace:
1507                contents = contents.lstrip()
1508            results += contents
1509            results += closetag + "\n"
1510        return results
1511
    def prettyprint_xhtml_contents(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        formatter="minimal", indent_chars=" "):
        """Renders the contents of this tag as a Unicode string, laid
        out for pretty-printed XHTML.

        The children are walked in order while ``last_char`` tracks the
        last character emitted so far; it decides whether whitespace,
        indentation or a newline has to be inserted before the next
        piece.

        :param indent_level: Text and inline tags that start a line
           inside a structural tag are indented by ``indent_level - 1``
           copies of ``indent_chars``; child tags are rendered with this
           same level.
        :param eventual_encoding: Passed through to child tags.
        :param formatter: A callable, or a value resolved once through
           ``_formatter_for_name``.
        :param indent_chars: The string repeated to build indentation.
        """
        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        if not isinstance(formatter, Callable):
            formatter = self._formatter_for_name(formatter)

        is_structural = self.name in STRUCTURAL_TAGS
        is_inline = self.name in NON_BREAKING_INLINE_TAGS
        is_keepwhitespace = self.name in PRESERVE_WHITESPACE_TAGS
        indent_space = (indent_chars * (indent_level - 1))
        # "x" is just a placeholder for "some non-whitespace character".
        last_char = "x"
        contains_block_tags = False

        # Inside a structural tag or the document root the contents are
        # treated as starting on a fresh line.
        if is_structural or self.hidden:
            last_char = "\n"

        s = []

        for c in self:
            text = None
            # Comment and CData are tested before NavigableString because
            # both are NavigableString subclasses.
            if isinstance(c, Comment):
                text = Comment(c).output_ready(formatter)
                s.append(text)
            elif isinstance(c, CData):
                text = CData(c).output_ready(formatter)
                s.append(text)
            elif isinstance(c, NavigableString):
                text = c.output_ready(formatter)
                tval = text
                is_whitespace = (tval.strip() == "")

                # handle pure whitespace differently
                if is_whitespace:
                    if is_keepwhitespace:
                        s.append(text)
                    elif is_inline or self.name in OTHER_TEXTHOLDING_TAGS:
                        # Collapse the run to one space, unless the
                        # output already ends in whitespace.
                        if last_char not in " \t\v\f\r\n":
                            s.append(" ")
                        else:
                            s.append("")
                    else:
                        # ignore this whitespace
                        s.append("")

                # handle all other text
                else:
                    if is_structural and last_char == "\n":
                        s.append(indent_space)
                        text = text.lstrip()
                    s.append(text)

            # handle tags
            elif isinstance(c, Tag):
                val = c.prettyprint_xhtml(indent_level, eventual_encoding, formatter, indent_chars)
                # track if contains block tags and append newline and prepend newline if needed
                if not c.name in NON_BREAKING_INLINE_TAGS:
                    contains_block_tags = True
                    if last_char != "\n":
                        s.append("\n")
                        last_char = "\n"
                # if child of a structural tag is inline and follows a newline, indent it properly
                if is_structural and c.name in NON_BREAKING_INLINE_TAGS and last_char == '\n':
                    s.append(indent_space)
                    val = val.lstrip()
                s.append(val)

            else:
                s.append("")

            # update last_char (every branch above appended something, so
            # s is never empty here; empty pieces leave last_char alone)
            last_element = s[-1]
            if last_element != "":
                last_char = last_element[-1:]

        # after processing all children, handle inline tags that contain block level tags
        if is_inline and contains_block_tags:
            if last_char != "\n":
                s.append("\n")
            s.append(indent_space)

        return ''.join(s)
1596
1597    def encode_contents(
1598        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1599        formatter="minimal", indent_chars=" "):
1600        """Renders the contents of this tag as a bytestring.
1601
1602        :param indent_level: Each line of the rendering will be
1603           indented this many spaces.
1604
1605        :param eventual_encoding: The bytestring will be in this encoding.
1606
1607        :param formatter: The output formatter responsible for converting
1608           entities to Unicode characters.
1609        """
1610
1611        contents = self.decode_contents(indent_level, encoding, formatter, indent_chars)
1612        return contents.encode(encoding)
1613
1614    # Old method for BS3 compatibility
1615    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1616                       prettyPrint=False, indentLevel=0):
1617        if not prettyPrint:
1618            indentLevel = None
1619        return self.encode_contents(
1620            indent_level=indentLevel, encoding=encoding)
1621
1622    #Soup methods
1623
1624    def find(self, name=None, attrs=OrderedDict(), recursive=True, text=None,
1625             **kwargs):
1626        """Return only the first child of this Tag matching the given
1627        criteria."""
1628        r = None
1629        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1630        if l:
1631            r = l[0]
1632        return r
1633    findChild = find
1634
1635    def find_all(self, name=None, attrs=OrderedDict(), recursive=True, text=None,
1636                 limit=None, **kwargs):
1637        """Extracts a list of Tag objects that match the given
1638        criteria.  You can specify the name of the Tag and any
1639        attributes you want the Tag to have.
1640
1641        The value of a key-value pair in the 'attrs' map can be a
1642        string, a list of strings, a regular expression object, or a
1643        callable that takes a string and returns whether or not the
1644        string matches for some custom definition of 'matches'. The
1645        same is true of the tag name."""
1646
1647        generator = self.descendants
1648        if not recursive:
1649            generator = self.children
1650        return self._find_all(name, attrs, text, limit, generator, **kwargs)
1651    findAll = find_all       # BS3
1652    findChildren = find_all  # BS2
1653
1654    #Generator methods
1655    @property
1656    def children(self):
1657        # return iter() to make the purpose of the method clear
1658        return iter(self.contents)  # XXX This seems to be untested.
1659
1660    @property
1661    def descendants(self):
1662        if not len(self.contents):
1663            return
1664        stopNode = self._last_descendant().next_element
1665        current = self.contents[0]
1666        while current is not stopNode:
1667            yield current
1668            current = current.next_element
1669
1670    # CSS selector code
1671
1672    _selector_combinators = ['>', '+', '~']
1673    _select_debug = False
1674    def select_one(self, selector):
1675        """Perform a CSS selection operation on the current element."""
1676        value = self.select(selector, limit=1)
1677        if value:
1678            return value[0]
1679        return None
1680
1681    def select(self, selector, _candidate_generator=None, limit=None):
1682        """Perform a CSS selection operation on the current element."""
1683
1684        # Remove whitespace directly after the grouping operator ','
1685        # then split into tokens.
1686        tokens = re.sub(',[\s]*',',', selector).split()
1687        current_context = [self]
1688
1689        if tokens[-1] in self._selector_combinators:
1690            raise ValueError(
1691                'Final combinator "%s" is missing an argument.' % tokens[-1])
1692
1693        if self._select_debug:
1694            print('Running CSS selector "%s"' % selector)
1695
1696        for index, token_group in enumerate(tokens):
1697            new_context = []
1698            new_context_ids = set([])
1699
1700            # Grouping selectors, ie: p,a
1701            grouped_tokens = token_group.split(',')
1702            if '' in grouped_tokens:
1703                raise ValueError('Invalid group selection syntax: %s' % token_group)
1704
1705            if tokens[index-1] in self._selector_combinators:
1706                # This token was consumed by the previous combinator. Skip it.
1707                if self._select_debug:
1708                    print('  Token was consumed by the previous combinator.')
1709                continue
1710
1711            for token in grouped_tokens:
1712                if self._select_debug:
1713                    print(' Considering token "%s"' % token)
1714                recursive_candidate_generator = None
1715                tag_name = None
1716
1717                # Each operation corresponds to a checker function, a rule
1718                # for determining whether a candidate matches the
1719                # selector. Candidates are generated by the active
1720                # iterator.
1721                checker = None
1722
1723                m = self.attribselect_re.match(token)
1724                if m is not None:
1725                    # Attribute selector
1726                    tag_name, attribute, operator, value = m.groups()
1727                    checker = self._attribute_checker(operator, attribute, value)
1728
1729                elif '#' in token:
1730                    # ID selector
1731                    tag_name, tag_id = token.split('#', 1)
1732                    def id_matches(tag):
1733                        return tag.get('id', None) == tag_id
1734                    checker = id_matches
1735
1736                elif '.' in token:
1737                    # Class selector
1738                    tag_name, klass = token.split('.', 1)
1739                    classes = set(klass.split('.'))
1740                    def classes_match(candidate):
1741                        return classes.issubset(candidate.get('class', []))
1742                    checker = classes_match
1743
1744                elif ':' in token:
1745                    # Pseudo-class
1746                    tag_name, pseudo = token.split(':', 1)
1747                    if tag_name == '':
1748                        raise ValueError(
1749                            "A pseudo-class must be prefixed with a tag name.")
1750                    pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1751                    found = []
1752                    if pseudo_attributes is None:
1753                        pseudo_type = pseudo
1754                        pseudo_value = None
1755                    else:
1756                        pseudo_type, pseudo_value = pseudo_attributes.groups()
1757                    if pseudo_type == 'nth-of-type':
1758                        try:
1759                            pseudo_value = int(pseudo_value)
1760                        except:
1761                            raise NotImplementedError(
1762                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1763                        if pseudo_value < 1:
1764                            raise ValueError(
1765                                'nth-of-type pseudo-class value must be at least 1.')
1766                        class Counter(object):
1767                            def __init__(self, destination):
1768                                self.count = 0
1769                                self.destination = destination
1770
1771                            def nth_child_of_type(self, tag):
1772                                self.count += 1
1773                                if self.count == self.destination:
1774                                    return True
1775                                if self.count > self.destination:
1776                                    # Stop the generator that's sending us
1777                                    # these things.
1778                                    raise StopIteration()
1779                                return False
1780                        checker = Counter(pseudo_value).nth_child_of_type
1781                    else:
1782                        raise NotImplementedError(
1783                            'Only the following pseudo-classes are implemented: nth-of-type.')
1784
1785                elif token == '*':
1786                    # Star selector -- matches everything
1787                    pass
1788                elif token == '>':
1789                    # Run the next token as a CSS selector against the
1790                    # direct children of each tag in the current context.
1791                    recursive_candidate_generator = lambda tag: tag.children
1792                elif token == '~':
1793                    # Run the next token as a CSS selector against the
1794                    # siblings of each tag in the current context.
1795                    recursive_candidate_generator = lambda tag: tag.next_siblings
1796                elif token == '+':
1797                    # For each tag in the current context, run the next
1798                    # token as a CSS selector against the tag's next
1799                    # sibling that's a tag.
1800                    def next_tag_sibling(tag):
1801                        yield tag.find_next_sibling(True)
1802                    recursive_candidate_generator = next_tag_sibling
1803
1804                elif self.tag_name_re.match(token):
1805                    # Just a tag name.
1806                    tag_name = token
1807                else:
1808                    raise ValueError(
1809                        'Unsupported or invalid CSS selector: "%s"' % token)
1810                if recursive_candidate_generator:
1811                    # This happens when the selector looks like  "> foo".
1812                    #
1813                    # The generator calls select() recursively on every
1814                    # member of the current context, passing in a different
1815                    # candidate generator and a different selector.
1816                    #
1817                    # In the case of "> foo", the candidate generator is
1818                    # one that yields a tag's direct children (">"), and
1819                    # the selector is "foo".
1820                    next_token = tokens[index+1]
1821                    def recursive_select(tag):
1822                        if self._select_debug:
1823                            print('    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1824                            print('-' * 40)
1825                        for i in tag.select(next_token, recursive_candidate_generator):
1826                            if self._select_debug:
1827                                print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1828                            yield i
1829                        if self._select_debug:
1830                            print('-' * 40)
1831                    _use_candidate_generator = recursive_select
1832                elif _candidate_generator is None:
1833                    # By default, a tag's candidates are all of its
1834                    # children. If tag_name is defined, only yield tags
1835                    # with that name.
1836                    if self._select_debug:
1837                        if tag_name:
1838                            check = "[any]"
1839                        else:
1840                            check = tag_name
1841                        print('   Default candidate generator, tag name="%s"' % check)
1842                    if self._select_debug:
1843                        # This is redundant with later code, but it stops
1844                        # a bunch of bogus tags from cluttering up the
1845                        # debug log.
1846                        def default_candidate_generator(tag):
1847                            for child in tag.descendants:
1848                                if not isinstance(child, Tag):
1849                                    continue
1850                                if tag_name and not child.name == tag_name:
1851                                    continue
1852                                yield child
1853                        _use_candidate_generator = default_candidate_generator
1854                    else:
1855                        _use_candidate_generator = lambda tag: tag.descendants
1856                else:
1857                    _use_candidate_generator = _candidate_generator
1858
1859                count = 0
1860                for tag in current_context:
1861                    if self._select_debug:
1862                        print("    Running candidate generator on %s %s" % (
1863                            tag.name, repr(tag.attrs)))
1864                    for candidate in _use_candidate_generator(tag):
1865                        if not isinstance(candidate, Tag):
1866                            continue
1867                        if tag_name and candidate.name != tag_name:
1868                            continue
1869                        if checker is not None:
1870                            try:
1871                                result = checker(candidate)
1872                            except StopIteration:
1873                                # The checker has decided we should no longer
1874                                # run the generator.
1875                                break
1876                        if checker is None or result:
1877                            if self._select_debug:
1878                                print("     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1879                            if id(candidate) not in new_context_ids:
1880                                # If a tag matches a selector more than once,
1881                                # don't include it in the context more than once.
1882                                new_context.append(candidate)
1883                                new_context_ids.add(id(candidate))
1884                                if limit and len(new_context) >= limit:
1885                                    break
1886                        elif self._select_debug:
1887                            print("     FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1888
1889
1890            current_context = new_context
1891
1892        if self._select_debug:
1893            print("Final verdict:")
1894            for i in current_context:
1895                print(" %s %s" % (i.name, i.attrs))
1896        return current_context
1897
1898    # Old names for backwards compatibility
1899    def childGenerator(self):
1900        return self.children
1901
1902    def recursiveChildGenerator(self):
1903        return self.descendants
1904
1905    def has_key(self, key):
1906        """This was kind of misleading because has_key() (attributes)
1907        was different from __in__ (contents). has_key() is gone in
1908        Python 3, anyway."""
1909        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
1910                key))
1911        return self.has_attr(key)
1912
1913# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer stores three normalized criteria: `name`, `attrs` (an
    OrderedDict of attribute name to criterion) and `text`. Each
    criterion may be a string, a callable, an object with a `match`
    attribute (such as a compiled regular expression), a boolean, None,
    or a list of those.
    """

    def __init__(self, name=None, attrs=OrderedDict(), text=None, **kwargs):
        """Store the search criteria in normalized form.

        :param name: Criterion for the tag name.

        :param attrs: A dict of attribute criteria. Any non-dict value
           is treated as a criterion for the 'class' attribute.

        :param text: Criterion for the text.

        :param kwargs: Additional attribute criteria. `class_` is an
           alias for 'class' and overrides a non-dict `attrs`.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs['class_']
            del kwargs['class_']

        if kwargs:
            if attrs:
                # Work on a copy so that neither the caller's dict nor
                # the shared default argument is modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = OrderedDict()
        for key, value in list(attrs.items()):
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into the form _matches() expects.

        Strings, callables, objects with a `match` attribute, booleans
        and None are returned unchanged. Bytestrings are decoded as
        UTF-8. Any other iterable becomes a list whose members are
        normalized the same way, except that nested iterables are kept
        as they are. Everything else is converted with str().
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """Describe the strainer: its text criterion if one is set,
        otherwise "name|attrs"."""
        if self.text:
            # The text criterion may be a compiled regular expression,
            # True, a list or a callable; __str__ must still return a
            # string, so convert explicitly.
            return str(self.text)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs=OrderedDict()):
        """Check a tag against the name, attribute and text criteria.

        :param markup_name: Either a Tag or a tag name. When a Tag is
           given it is also used as the source of attribute values.

        :param markup_attrs: The attributes that go with a tag name:
           an object with a `get` method, or an iterable of
           (key, value) pairs.

        :return: The Tag (or `markup_name`, when no Tag was given) if
           everything matches, otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is called with the raw name and
        # attributes when there is no Tag object to hand to it.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = OrderedDict()
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Check a piece of markup against this strainer.

        :param markup: A Tag, a string (including NavigableString), or
           an iterable of elements.

        :return: The matching element, or None. For an iterable, the
           first NavigableString element that matches.

        :raise Exception: If `markup` is none of the supported kinds.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Decide whether `markup` satisfies the criterion `match_against`.

        :param markup: A Tag, a string, None, or a list/tuple of values
           (for a multi-valued attribute).

        :param match_against: A normalized criterion: True, a callable,
           a string, an object with `match`/`search` methods, or an
           iterable of acceptable strings.

        :return: A truthy value on a match and a falsy one otherwise.
           The value is not always a bool: a callable's own return
           value and a regular expression's match object are passed
           through.
        """
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, str)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                #
                # split() returns a list, so compare against a list;
                # a tuple would otherwise never compare equal.
                return (whitespace_re.split(match_against) == list(markup))
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, str):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No rule applies (for example a None or False criterion
        # against real markup): no match.
        return None
2099
2100
class ResultSet(list):
    """A list of results that also remembers where it came from.

    It behaves like an ordinary list; the only addition is the
    `source` attribute, which holds the SoupStrainer that created it.
    """

    def __init__(self, source, result=()):
        """Fill the list from `result` and record `source`."""
        super().__init__(result)
        self.source = source
2107