1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43Here, have some legalese:
44
45Copyright (c) 2004-2010, Leonard Richardson
46
47All rights reserved.
48
49Redistribution and use in source and binary forms, with or without
50modification, are permitted provided that the following conditions are
51met:
52
53  * Redistributions of source code must retain the above copyright
54    notice, this list of conditions and the following disclaimer.
55
56  * Redistributions in binary form must reproduce the above
57    copyright notice, this list of conditions and the following
58    disclaimer in the documentation and/or other materials provided
59    with the distribution.
60
  * Neither the name of the Beautiful Soup Consortium and All
62    Night Kosher Bakery nor the names of its contributors may be
63    used to endorse or promote products derived from this software
64    without specific prior written permission.
65
66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78"""
79from __future__ import generators
80from __future__ import print_function
81
# Package metadata, kept at module level for introspection tools.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "New-style BSD"
86
87import codecs
88import types
89import re
90import sys
91
# Python 2/3 compatibility shim: give the rest of the module a single set
# of names that work on both interpreter lines.
if sys.version_info >= (3, 0):
    xrange = range
    text_type = str
    binary_type = bytes
    basestring = str
    # The rest of this module still calls the Python 2 builtin unichr()
    # (entity conversion in Tag._convertEntities); alias it so those
    # call sites do not raise NameError on Python 3.
    unichr = chr
else:
    text_type = unicode
    binary_type = str
100
101try:
102  from htmlentitydefs import name2codepoint
103except ImportError:
104  name2codepoint = {}
105try:
106    set
107except NameError:
108    from sets import Set as set
109
110try:
111    import sgmllib
112except ImportError:
113    from lib.utils import sgmllib
114
115try:
116    import markupbase
117except ImportError:
118    import _markupbase as markupbase
119
#These hacks make Beautiful Soup able to parse XML with namespaces:
#both patched regexes widen the set of characters the stock parsers
#accept in tag/declaration names to include '.' and ':' (used by
#namespaced names like 'xsl:template').
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used when rendering a parse tree back out as a byte string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
125
126def _match_css_class(str):
127    """Build a RE to match the given CSS class."""
128    return re.compile(r"(^|.*\s)%s($|\s)" % str)
129
130# First, the classes that represent markup elements.
131
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element keeps five links: parent, previous/next (document
    order, i.e. parse order) and previousSibling/nextSibling (same
    parent). insert() and extract() below are responsible for keeping
    all five consistent."""

    def _invert(h):
        "Cheap function to invert a hash."
        # Deliberately not a method (no self): it is called once, at
        # class-definition time, to build the reverse entity table.
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # The five predefined XML entities and the characters they stand for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    # Reverse mapping: special character -> entity name (without '&;').
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        'previous' is the element parsed immediately before this one in
        document order, which is not necessarily a sibling."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # Being appended after existing children: link up to the
            # current last child as siblings.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts 'replaceWith'
        in its place."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
                  and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                # NOTE(review): 'if index and ...' skips this
                # adjustment when the sibling sits at index 0, even
                # though extracting it would still shift this element
                # down by one — looks like an off-by-one; confirm
                # before changing.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, preserving
        their document order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert the children in reverse at the same index so they end
        # up in their original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree.

        Detaches the element (and its whole subtree) from its parent
        and from both sibling chains, then repairs the document-order
        previous/next chain around the hole. Returns self so calls can
        be chained."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        # Terminate the subtree's chain so walks inside it stay inside.
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        # I.e. the deepest last descendant: keep taking the last child
        # until we hit an element with no contents.
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents at
        'position', rewiring parent, sibling, and document-order links.

        Plain strings are promoted to NavigableString. A child that
        already lives in a tree is extracted from it first (with the
        target index adjusted if it was an earlier child of this very
        element)."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position =  min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            # Inserting at the front: the parent itself is the
            # document-order predecessor.
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            # The document-order predecessor is the deepest last
            # descendant of the preceding sibling.
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the next element in document order
            # is the nearest ancestor's next sibling (if any).
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        # (The shared {} default is never mutated by these methods.)
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                           **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        # NOTE(review): **kwargs is accepted but never forwarded to
        # findParents below — keyword criteria are silently ignored
        # here; confirm whether that is intended.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Run the matching find-all variant with limit=1 and unwrap the
        # single result (or None).
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = next(g)
            except StopIteration:
                break
            if i:
                # strainer.search presumably returns the matching
                # element or a falsy value — see SoupStrainer.search.
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Note: each generator advances first and then yields, so the final
    #value yielded is always None; _findAll filters that out.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in 'str' with the
        given encoding (defaulting to utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if 'encoding' is None."""
        if isinstance(s, text_type):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, binary_type):
            # NOTE(review): on Python 2 str.encode() round-trips via an
            # implicit ASCII decode; on Python 3 bytes has no encode()
            # at all — confirm this branch only sees Python 2 str.
            s = s.encode(encoding or "utf8")
        else:
            # Any other object: stringify, then recurse to encode.
            s  = self.toEncoding(str(s), encoding or "utf8")
        return s

    # Matches a bare '<' or '>', or an '&' that does not begin a
    # decimal, hex, or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
458
459
class NavigableString(text_type, PageElement):
    """A chunk of text: a unicode string that also carries the
    PageElement navigation API."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, text_type):
            # Byte string: decode it using the default output encoding.
            return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return text_type.__new__(cls, value)

    def __getnewargs__(self):
        # Pickle support: rebuild from the encoded string form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        # Python 2: decode the encoded byte rendering back to unicode.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Substitute outgoing XML entities, then optionally encode.
        escaped = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
        return escaped.encode(encoding) if encoding else escaped
496
class CData(NavigableString):
    """A CDATA section: renders its text inside a <![CDATA[...]]> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
501
class ProcessingInstruction(NavigableString):
    """A processing instruction: rendered inside <?...?>, with the
    %SOUP-ENCODING% placeholder substituted if present."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = self
        if "%SOUP-ENCODING%" in rendered:
            rendered = self.substituteEncoding(rendered, encoding)
        return "<?%s?>" % self.toEncoding(rendered, encoding)
508
class Comment(NavigableString):
    """An HTML/XML comment: rendered inside <!-- -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
512
class Declaration(NavigableString):
    """A declaration (e.g. a DOCTYPE): rendered inside <!...>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
516
517class Tag(PageElement):
518
519    """Represents a found HTML tag with its attributes and contents."""
520
    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped.

        'match' is a regex match object whose group 1 is the entity
        body (the text between '&' and ';')."""
        try:
            x = match.group(1)
            if self.convertHTMLEntities and x in name2codepoint:
                # Named HTML entity, e.g. 'nbsp'.
                # NOTE(review): unichr() is a Python 2 builtin; confirm
                # the py3 compatibility shim at the top of the module
                # provides it before relying on this path on Python 3.
                return unichr(name2codepoint[x])
            elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
                if self.convertXMLEntities:
                    return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
                else:
                    # Keep the entity as-is.
                    return u'&%s;' % x
            elif len(x) > 0 and x[0] == '#':
                # Handle numeric entities
                if len(x) > 1 and x[1] == 'x':
                    # Hexadecimal form, e.g. '#x20AC'.
                    return unichr(int(x[2:], 16))
                else:
                    # Decimal form, e.g. '#8364'.
                    return unichr(int(x[1:]))

            elif self.escapeUnrecognizedEntities:
                # Unknown entity: escape the leading ampersand.
                return u'&amp;%s;' % x

        except ValueError:  # e.g. ValueError: unichr() arg not in range(0x10000)
            pass

        # Fallback: pass the entity through unchanged.
        return u'&%s;' % x
549
550    def __init__(self, parser, name, attrs=None, parent=None,
551                 previous=None):
552        "Basic constructor."
553
554        # We don't actually store the parser object: that lets extracted
555        # chunks be garbage-collected
556        self.parserClass = parser.__class__
557        self.isSelfClosing = parser.isSelfClosingTag(name)
558        self.name = name
559        if attrs is None:
560            attrs = []
561        elif isinstance(attrs, dict):
562            attrs = attrs.items()
563        self.attrs = attrs
564        self.contents = []
565        self.setup(parent, previous)
566        self.hidden = False
567        self.containsSubstitutions = False
568        self.convertHTMLEntities = parser.convertHTMLEntities
569        self.convertXMLEntities = parser.convertXMLEntities
570        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
571
572        # Convert any HTML, XML, or numeric entities in the attribute values.
573        # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8
574        convert = lambda k_val: (k_val[0],
575                                 re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
576                                     self._convertEntities,
577                                     k_val[1]))
578        self.attrs = map(convert, self.attrs)
579
580    def getString(self):
581        if (len(self.contents) == 1
582            and isinstance(self.contents[0], NavigableString)):
583            return self.contents[0]
584
    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    # 'tag.string' reads via getString and writes via setString.
    string = property(getString, setString)
591
592    def getText(self, separator=u""):
593        if not len(self.contents):
594            return u""
595        stopNode = self._lastRecursiveChild().next
596        strings = []
597        current = self.contents[0]
598        while current is not stopNode:
599            if isinstance(current, NavigableString):
600                strings.append(current.strip())
601            current = current.next
602        return separator.join(strings)
603
604    text = property(getText)
605
606    def get(self, key, default=None):
607        """Returns the value of the 'key' attribute for the tag, or
608        the value given for 'default' if it doesn't have that
609        attribute."""
610        return self._getAttrMap().get(key, default)
611
612    def clear(self):
613        """Extract all children."""
614        for child in self.contents[:]:
615            child.extract()
616
617    def index(self, element):
618        for i, child in enumerate(self.contents):
619            if child is element:
620                return i
621        raise ValueError("Tag.index: element not in tag")
622
623    def has_key(self, key):
624        return self._getAttrMap().has_key(key)
625
626    def __getitem__(self, key):
627        """tag[key] returns the value of the 'key' attribute for the tag,
628        and throws an exception if it's not there."""
629        return self._getAttrMap()[key]
630
631    def __iter__(self):
632        "Iterating over a tag iterates over its contents."
633        return iter(self.contents)
634
    def __len__(self):
        "The length of a tag is the length of its list of contents."
        # NOTE(review): on Python 3 bool(tag) falls back to __len__
        # unless a __bool__ is defined, so an empty Tag would evaluate
        # falsy there — confirm this is intended.
        return len(self.contents)
638
    def __contains__(self, x):
        """True if 'x' is a direct child of this tag (plain list
        membership test: identity or equality)."""
        return x in self.contents
641
642    def __nonzero__(self):
643        "A tag is non-None even if it has no contents."
644        return True
645
    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        # Make sure the cached attribute map exists, then update it.
        self._getAttrMap()
        self.attrMap[key] = value
        # Update every matching pair in the ordered attribute list
        # (bad HTML can define the same attribute more than once).
        found = False
        for i in xrange(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        # NOTE(review): redundant — self.attrMap[key] was already set
        # above; confirm before removing.
        self._getAttrMap()[key] = value
659
660    def __delitem__(self, key):
661        "Deleting tag[key] deletes all 'key' attributes for the tag."
662        for item in self.attrs:
663            if item[0] == key:
664                self.attrs.remove(item)
665                #We don't break because bad HTML can define the same
666                #attribute multiple times.
667            self._getAttrMap()
668            if self.attrMap.has_key(key):
669                del self.attrMap[key]
670
    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        # Every positional and keyword argument is forwarded unchanged.
        return self.findAll(*args, **kwargs)
676
677    def __getattr__(self, tag):
678        #print "Getattr %s.%s" % (self.__class__, tag)
679        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
680            return self.find(tag[:-3])
681        elif tag.find('__') != 0:
682            return self.find(tag)
683        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
684
685    def __eq__(self, other):
686        """Returns true iff this tag has the same name, the same attributes,
687        and the same contents (recursively) as the given tag.
688
689        NOTE: right now this will return false if two tags have the
690        same attributes in a different order. Should this be fixed?"""
691        if other is self:
692            return True
693        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
694            return False
695        for i in xrange(0, len(self.contents)):
696            if self.contents[i] != other.contents[i]:
697                return False
698        return True
699
    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        # Defined explicitly because Python 2 does not derive != from ==.
        return not self == other
704
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        # (Non-standard: __repr__ normally takes no extra arguments;
        # the encoding parameter is kept for internal callers.)
        return self.__str__(encoding)
708
    def __unicode__(self):
        """Unicode rendering (Python 2): __str__ with encoding=None
        returns a unicode string rather than encoded bytes."""
        return self.__str__(None)
711
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        # Render each attribute as name="value" (or name='value'),
        # escaping as needed.
        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): '&squot;' is not a standard
                            # XML entity — confirm downstream consumers
                            # expect it before changing.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        # Pretty-printing state. 'space' is only defined (and only
        # used) when prettyPrint is true; note indentLevel=0 yields
        # ' ' * -1 == '' for the outermost tag.
        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the soup object itself) render only
            # their contents, with no enclosing markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
796
797    def decompose(self):
798        """Recursively destroys the contents of this tree."""
799        self.extract()
800        if len(self.contents) == 0:
801            return
802        current = self.contents[0]
803        while current is not None:
804            next = current.next
805            if isinstance(current, Tag):
806                del current.contents[:]
807            current.parent = None
808            current.previous = None
809            current.previousSibling = None
810            current.next = None
811            current.nextSibling = None
812            current = next
813
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as an indented, one-tag-per-line string.

        Delegates to __str__ with prettyPrint enabled. If encoding is
        None the result is a Unicode string; otherwise it is encoded."""
        return self.__str__(encoding, True)
816
817    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
818                       prettyPrint=False, indentLevel=0):
819        """Renders the contents of this tag as a string in the given
820        encoding. If encoding is None, returns a Unicode string.."""
821        s=[]
822        for c in self:
823            text = None
824            if isinstance(c, NavigableString):
825                text = c.__str__(encoding)
826            elif isinstance(c, Tag):
827                s.append(c.__str__(encoding, prettyPrint, indentLevel))
828            if text and prettyPrint:
829                text = text.strip()
830            if text:
831                if prettyPrint:
832                    s.append(" " * (indentLevel-1))
833                s.append(text)
834                if prettyPrint:
835                    s.append("\n")
836
837        return ''.join(s)
838
839    #Soup methods
840
841    def find(self, name=None, attrs={}, recursive=True, text=None,
842             **kwargs):
843        """Return only the first child of this Tag matching the given
844        criteria."""
845        r = None
846        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
847        if l:
848            r = l[0]
849        return r
850    findChild = find
851
852    def findAll(self, name=None, attrs={}, recursive=True, text=None,
853                limit=None, **kwargs):
854        """Extracts a list of Tag objects that match the given
855        criteria.  You can specify the name of the Tag and any
856        attributes you want the Tag to have.
857
858        The value of a key-value pair in the 'attrs' map can be a
859        string, a list of strings, a regular expression object, or a
860        callable that takes a string and returns whether or not the
861        string matches for some custom definition of 'matches'. The
862        same is true of the tag name."""
863        generator = self.recursiveChildGenerator
864        if not recursive:
865            generator = self.childGenerator
866        return self._findAll(name, attrs, text, limit, generator, **kwargs)
867    findChildren = findAll
868
869    # Pre-3.x compatibility methods
870    first = find
871    fetch = findAll
872
    def fetchText(self, text=None, recursive=True, limit=None):
        """Pre-3.x compatibility shortcut: findAll() restricted to
        text-node matching."""
        return self.findAll(text=text, recursive=recursive, limit=limit)
875
    def firstText(self, text=None, recursive=True):
        """Pre-3.x compatibility shortcut: find() restricted to
        text-node matching."""
        return self.find(text=text, recursive=recursive)
878
879    #Private methods
880
881    def _getAttrMap(self):
882        """Initializes a map representation of this tag's attributes,
883        if not already initialized."""
884        if not getattr(self, 'attrMap'):
885            self.attrMap = {}
886            for (key, value) in self.attrs:
887                self.attrMap[key] = value
888        return self.attrMap
889
890    #Generator methods
891    def childGenerator(self):
892        # Just use the iterator from the contents
893        return iter(self.contents)
894
895    def recursiveChildGenerator(self):
896        if not len(self.contents):
897            return  # Note: https://stackoverflow.com/a/30217723 (PEP 479)
898        stopNode = self._lastRecursiveChild().next
899        current = self.contents[0]
900        while current is not stopNode:
901            yield current
902            current = current.next
903
904
905# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    name  -- matched against a tag's name; may be a string, a list of
             strings, a regular expression object, or a callable.
    attrs -- a map of attribute name to match criterion. A plain string
             is shorthand for matching the CSS class.
    text  -- matched against text (NavigableString) content.
    Extra keyword arguments are folded into 'attrs'.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        if isinstance(attrs, basestring):
            # A bare string is shorthand for a CSS class match.
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                # Copy before updating so the caller's dict (possibly
                # the shared default) is never mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag (either a Tag object passed as markupName, or a
        name string plus an attribute list) against this strainer's
        criteria. Returns the matched object, or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name is invoked with (name, attrs) rather than
        # being matched against the name string -- but only when we
        # were given a name/attrs pair, not a full Tag.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the attributes.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches this strainer against an arbitrary piece of markup:
        a list of elements, a Tag, or a text string. Returns the
        matched object, or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s" \
                  % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Matches a single value (a tag name, attribute value, or text
        string) against a single criterion, which may be True (match
        anything non-None), a callable, a regexp, a list, a dict, or a
        string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = text_type(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # dict-like. dict.has_key() was removed in Python 3;
                # 'in' is the equivalent containment test.
                result = matchAgainst in markup
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce the criterion to the markup's string type so
                # the equality test below compares like with like.
                if isinstance(markup, text_type):
                    matchAgainst = text_type(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
1025
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialize *this* list instance. (The previous code called
        # list.__init__([]), which initialized a throwaway list and
        # left self untouched -- harmless but wrong.)
        list.__init__(self)
        # The SoupStrainer that produced these results.
        self.source = source
1032
1033# Now, some helper functions.
1034
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__') and not isinstance(portion, str):
            #It's a list. Map each item to the default.
            #(Strings must be excluded explicitly: under Python 3 a str
            #has __iter__, and without the isinstance check a scalar
            #tag name like 'noscript' would be exploded into one map
            #entry per character. Python 2 strings have no __iter__
            #attribute, so the extra check changes nothing there.)
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built
1053
1054# Now, the parser classes.
1055
class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Parsing-behavior tables; subclasses (e.g. BeautifulSoup)
    # override these with markup-specific knowledge.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (regexp, replacement) pairs applied to the markup before parsing,
    # to fix constructs known to choke sgmllib.
    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        sgmllib.SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts the stored markup to Unicode, applies the massage
        regexes, then feeds the result to the SGML parser."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, text_type):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True (the default) means "use the
                # class's standard massage list".
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        sgmllib.SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return sgmllib.SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Resets the parser state, re-initializing this object as an
        empty '[document]' root tag."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # The root tag itself is never rendered in output.
        self.hidden = 1
        sgmllib.SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the stack and returns the new current
        (innermost open) tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Pushes a tag onto the stack, appending it to the current
        tag's contents and making it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes any buffered character data into the tree as a
        containerClass object (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse whitespace-only text to a single space or
            # newline, unless we're inside a whitespace-preserving
            # tag (see PRESERVE_WHITESPACE_TAGS).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level
            # text that the strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in xrange(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in xrange(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in xrange(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent  # NOTE(review): no effect -- 'p' is rebound at the top of each iteration.
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """SGMLParser callback for a start tag: builds a Tag object and
        pushes it onto the stack, honoring self-closing tags, the
        parseOnlyThese strainer, and literal QUOTE_TAGS sections."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """SGMLParser callback for an end tag: closes the matching open
        tag (and everything inside it), or treats the tag as literal
        text while inside a QUOTE_TAGS section."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Buffers character data until the next endData() call."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = sgmllib.SGMLParser.parse_declaration(self, i)
            except sgmllib.SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1486
1487class BeautifulSoup(BeautifulStoneSoup):
1488
1489    """This parser knows the following facts about HTML:
1490
1491    * Some tags have no closing tag and should be interpreted as being
1492      closed as soon as they are encountered.
1493
1494    * The text inside some tags (ie. 'script') may contain tags which
1495      are not really part of the document and which should be parsed
1496      as text, not tags. If you want to parse the text as tags, you can
1497      always fetch it and parse it explicitly.
1498
1499    * Tag nesting rules:
1500
1501      Most tags can't be nested at all. For instance, the occurance of
1502      a <p> tag should implicitly close the previous <p> tag.
1503
1504       <p>Para1<p>Para2
1505        should be transformed into:
1506       <p>Para1</p><p>Para2
1507
1508      Some tags can be nested arbitrarily. For instance, the occurance
1509      of a <blockquote> tag should _not_ implicitly close the previous
1510      <blockquote> tag.
1511
1512       Alice said: <blockquote>Bob said: <blockquote>Blah
1513        should NOT be transformed into:
1514       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1515
1516      Some tags can be nested, but the nesting is reset by the
1517      interposition of other tags. For instance, a <tr> tag should
1518      implicitly close the previous <tr> tag within the same <table>,
1519      but not close a <tr> tag in another table.
1520
1521       <table><tr>Blah<tr>Blah
1522        should be transformed into:
1523       <table><tr>Blah</tr><tr>Blah
1524        but,
1525       <tr>Blah<table><tr>Blah
1526        should NOT be transformed into
1527       <tr>Blah<table></tr><tr>Blah
1528
1529    Differing assumptions about tag nesting rules are a major source
1530    of problems with the BeautifulSoup class. If BeautifulSoup is not
1531    treating as nestable a tag your page author treats as nestable,
1532    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1533    BeautifulStoneSoup before writing your own subclass."""
1534
1535    def __init__(self, *args, **kwargs):
1536        if 'smartQuotesTo' not in kwargs:
1537            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1538        kwargs['isHTML'] = True
1539        BeautifulStoneSoup.__init__(self, *args, **kwargs)
1540
    # Tags treated as self-closing: they take no contents and no end tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    # Whitespace inside these tags is kept verbatim rather than collapsed.
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Contents of these tags are treated as literal text, not markup
    # (see the "parsing text inside tags" notes in the class docstring).
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    # Key: tag name; value: list of ancestor tags that reset the nesting
    # scope (an empty list means the tag nests freely within itself).
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    # Same shape as NESTABLE_LIST_TAGS: value lists the enclosing tags
    # within which a new occurrence of the key implicitly closes the old.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    # Tags that may never contain another tag of the same type.
    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    # Union of all nestable tag maps; default nesting scope is [] (free).
    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
1592
1593    def start_meta(self, attrs):
1594        """Beautiful Soup can detect a charset included in a META tag,
1595        try to convert the document to that charset, and re-parse the
1596        document from the beginning."""
1597        httpEquiv = None
1598        contentType = None
1599        contentTypeIndex = None
1600        tagNeedsEncodingSubstitution = False
1601
1602        for i in xrange(0, len(attrs)):
1603            key, value = attrs[i]
1604            key = key.lower()
1605            if key == 'http-equiv':
1606                httpEquiv = value
1607            elif key == 'content':
1608                contentType = value
1609                contentTypeIndex = i
1610
1611        if httpEquiv and contentType: # It's an interesting meta tag.
1612            match = self.CHARSET_RE.search(contentType)
1613            if match:
1614                if (self.declaredHTMLEncoding is not None or
1615                    self.originalEncoding == self.fromEncoding):
1616                    # An HTML encoding was sniffed while converting
1617                    # the document to Unicode, or an HTML encoding was
1618                    # sniffed during a previous pass through the
1619                    # document, or an encoding was specified
1620                    # explicitly and it worked. Rewrite the meta tag.
1621                    def rewrite(match):
1622                        return match.group(1) + "%SOUP-ENCODING%"
1623                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1624                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1625                                               newAttr)
1626                    tagNeedsEncodingSubstitution = True
1627                else:
1628                    # This is our first pass through the document.
1629                    # Go through it again with the encoding information.
1630                    newCharset = match.group(3)
1631                    if newCharset and newCharset != self.originalEncoding:
1632                        self.declaredHTMLEncoding = newCharset
1633                        self._feed(self.declaredHTMLEncoding)
1634                        raise StopParsing
1635                    pass
1636        tag = self.unknown_starttag("meta", attrs)
1637        if tag and tagNeedsEncodingSubstitution:
1638            tag.containsSubstitutions = True
1639
class StopParsing(Exception):
    """Raised by start_meta to abort the current parse so the document
    can be re-parsed from the beginning with a newly detected encoding."""
1642
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that, for this class, are allowed to nest within
    # themselves.  ('strong' and 'big' were listed twice in the
    # original tuple; the duplicates were redundant since buildTagMap
    # keys entries by tag name.)
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')

    # Block tags that, for this class, are allowed to nest.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend the base class's nestable-tag map with the sets above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1678
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): every other call site in this file passes an
    # explicit default as buildTagMap's first argument (e.g.
    # buildTagMap(None, ...)); here 'noscript' occupies that position,
    # which would make it the default value rather than a tag name --
    # confirm this is intended.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag-nesting assumptions at all (see class docstring).
    NESTABLE_TAGS = {}
1691
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, promote a tag whose only child is a string
        into an attribute (tagname="string") on its parent, unless the
        parent already has an attribute of that name."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            # dict.has_key() was removed in Python 3; the 'in' operator
            # is equivalent and works on both Python 2 and 3.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1722
1723#Enterprise class names! It has come to our attention that some people
1724#think the names of the Beautiful Soup parser classes are too silly
1725#and "unprofessional" for use in enterprise screen-scraping. We feel
1726#your pain! For such-minded folk, the Beautiful Soup Consortium And
1727#All-Night Kosher Bakery recommends renaming this file to
1728#"RobustParser.py" (or, in cases of extreme enterprisiness,
1729#"RobustParserBeanInterface.class") and using the following
1730#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1741
1742######################################################
1743#
1744# Bonus library: Unicode, Dammit
1745#
1746# This class forces XML data into a standard format (usually to UTF-8
1747# or Unicode).  It is heavily based on code from Mark Pilgrim's
1748# Universal Feed Parser. It does not rewrite the XML or HTML to
1749# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1750# (XML) and BeautifulSoup.start_meta (HTML).
1751
1752# Autodetects character encodings.
1753# Download from http://chardet.feedparser.org/
1754try:
1755    import chardet
1756#    import chardet.constants
1757#    chardet.constants._debug = 1
1758except ImportError:
1759    chardet = None
1760
1761# cjkcodecs and iconv_codec make Python know about more character encodings.
1762# Both are available from http://cjkpython.i18n.org/
1763# They're built in if you use Python 2.4.
1764try:
1765    import cjkcodecs.aliases
1766except ImportError:
1767    pass
1768try:
1769    import iconv_codec
1770except ImportError:
1771    pass
1772
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=None,
                 smartQuotesTo='xml', isHTML=False):
        """Detect the encoding of markup and convert it to Unicode.

        :param markup: the document to convert.
        :param overrideEncodings: optional sequence of encodings to try
            before any document-declared or sniffed encoding.  (A
            mutable default list was replaced with None to avoid the
            shared-mutable-default pitfall.)
        :param smartQuotesTo: 'xml' or 'html'; controls what MS smart
            quotes are converted into (see _subMSChar).
        :param isHTML: if true, also look for an encoding declared in
            an HTML META tag.
        """
        if overrideEncodings is None:
            overrideEncodings = []
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, text_type):
            # Nothing to convert: input is empty or already Unicode.
            self.originalEncoding = None
            self.unicode = text_type(markup)
            return

        u = None
        # 1. Encodings explicitly supplied by the caller.
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        # 2. The encoding declared in the document, then the one
        #    sniffed from its leading bytes.
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, text_type):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        # self.unicode is None if every conversion attempt failed.
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        # MS_CHARS values are either an (entity-name, hex-codepoint)
        # tuple or a literal replacement string used as-is.
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to convert self.markup from the proposed encoding.
        Returns the Unicode markup on success, None on failure.  Each
        codec is attempted at most once per document."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda x: self._subMSChar(x.group(1)),
                      markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Wrong codec (or bad data); let the caller try another one.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a BOM overrides whatever
        # encoding the caller proposed.
        # NOTE(review): these literal comparisons assume Python-2-style
        # byte strings -- confirm they still match when data is bytes
        # under Python 3.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = text_type(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding by
        sniffing the leading bytes (BOMs and encoded '<?' prologs) and
        by reading any explicit encoding declaration.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding); the
        data may have been transcoded to UTF-8 along the way."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best-effort; fall through and run the
            # declaration regexes on the data as-is.  (Was a bare
            # except; narrowed to Exception so KeyboardInterrupt and
            # SystemExit propagate.)
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # The declaration names a generic multi-byte family;
                # trust the byte order we actually sniffed instead.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to one Python's codec machinery accepts,
        trying the alias table and hyphen-stripped/underscored variants;
        falls back to returning the charset unchanged."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily built EBCDIC-to-ASCII translation table, cached on the
    # class so it is only constructed once.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC-encoded byte string to ASCII."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            # NOTE(review): string.maketrans and xrange are Python 2
            # only -- this branch needs porting before it can run on
            # Python 3.
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, xrange(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # windows-1252 "smart" punctuation: each byte maps to either an
    # (entity-name, hex-codepoint) pair or a literal replacement string.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
2026
2027#######################################################################
2028
2029
2030#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Default command-line behavior: pretty-print HTML read from stdin.
    import sys
    print(BeautifulSoup(sys.stdin).prettify())
2035