"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.

A well-formed XML/HTML document yields a well-formed data
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
well-formed part of it.

Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:

* chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
  by stock Python.
  http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
   language that kind of looks like XML.

 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
   or invalid. This class has web browser-like heuristics for
   obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html

Here, have some legalese:

Copyright (c) 2004-2010, Leonard Richardson

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following
   disclaimer in the documentation and/or other materials provided
   with the distribution.

 * Neither the name of the Beautiful Soup Consortium and All
   Night Kosher Bakery nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 78""" 79from __future__ import generators 80from __future__ import print_function 81 82__author__ = "Leonard Richardson (leonardr@segfault.org)" 83__version__ = "3.2.1" 84__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" 85__license__ = "New-style BSD" 86 87import codecs 88import types 89import re 90import sys 91 92if sys.version_info >= (3, 0): 93 xrange = range 94 text_type = str 95 binary_type = bytes 96 basestring = str 97else: 98 text_type = unicode 99 binary_type = str 100 101try: 102 from htmlentitydefs import name2codepoint 103except ImportError: 104 name2codepoint = {} 105try: 106 set 107except NameError: 108 from sets import Set as set 109 110try: 111 import sgmllib 112except ImportError: 113 from lib.utils import sgmllib 114 115try: 116 import markupbase 117except ImportError: 118 import _markupbase as markupbase 119 120#These hacks make Beautiful Soup able to parse XML with namespaces 121sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 122markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 123 124DEFAULT_OUTPUT_ENCODING = "utf-8" 125 126def _match_css_class(str): 127 """Build a RE to match the given CSS class.""" 128 return re.compile(r"(^|.*\s)%s($|\s)" % str) 129 130# First, the classes that represent markup elements. 131 132class PageElement(object): 133 """Contains the navigational information for some part of the page 134 (either a tag or a piece of text)""" 135 136 def _invert(h): 137 "Cheap function to invert a hash." 
138 i = {} 139 for k,v in h.items(): 140 i[v] = k 141 return i 142 143 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 144 "quot" : '"', 145 "amp" : "&", 146 "lt" : "<", 147 "gt" : ">" } 148 149 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 150 151 def setup(self, parent=None, previous=None): 152 """Sets up the initial relations between this element and 153 other elements.""" 154 self.parent = parent 155 self.previous = previous 156 self.next = None 157 self.previousSibling = None 158 self.nextSibling = None 159 if self.parent and self.parent.contents: 160 self.previousSibling = self.parent.contents[-1] 161 self.previousSibling.nextSibling = self 162 163 def replaceWith(self, replaceWith): 164 oldParent = self.parent 165 myIndex = self.parent.index(self) 166 if hasattr(replaceWith, "parent")\ 167 and replaceWith.parent is self.parent: 168 # We're replacing this element with one of its siblings. 169 index = replaceWith.parent.index(replaceWith) 170 if index and index < myIndex: 171 # Furthermore, it comes before this element. That 172 # means that when we extract it, the index of this 173 # element will change. 174 myIndex = myIndex - 1 175 self.extract() 176 oldParent.insert(myIndex, replaceWith) 177 178 def replaceWithChildren(self): 179 myParent = self.parent 180 myIndex = self.parent.index(self) 181 self.extract() 182 reversedChildren = list(self.contents) 183 reversedChildren.reverse() 184 for child in reversedChildren: 185 myParent.insert(myIndex, child) 186 187 def extract(self): 188 """Destructively rips this element out of the tree.""" 189 if self.parent: 190 try: 191 del self.parent.contents[self.parent.index(self)] 192 except ValueError: 193 pass 194 195 #Find the two elements that would be next to each other if 196 #this element (and any children) hadn't been parsed. Connect 197 #the two. 
198 lastChild = self._lastRecursiveChild() 199 nextElement = lastChild.next 200 201 if self.previous: 202 self.previous.next = nextElement 203 if nextElement: 204 nextElement.previous = self.previous 205 self.previous = None 206 lastChild.next = None 207 208 self.parent = None 209 if self.previousSibling: 210 self.previousSibling.nextSibling = self.nextSibling 211 if self.nextSibling: 212 self.nextSibling.previousSibling = self.previousSibling 213 self.previousSibling = self.nextSibling = None 214 return self 215 216 def _lastRecursiveChild(self): 217 "Finds the last element beneath this object to be parsed." 218 lastChild = self 219 while hasattr(lastChild, 'contents') and lastChild.contents: 220 lastChild = lastChild.contents[-1] 221 return lastChild 222 223 def insert(self, position, newChild): 224 if isinstance(newChild, basestring) \ 225 and not isinstance(newChild, NavigableString): 226 newChild = NavigableString(newChild) 227 228 position = min(position, len(self.contents)) 229 if hasattr(newChild, 'parent') and newChild.parent is not None: 230 # We're 'inserting' an element that's already one 231 # of this object's children. 232 if newChild.parent is self: 233 index = self.index(newChild) 234 if index > position: 235 # Furthermore we're moving it further down the 236 # list of this object's children. That means that 237 # when we extract this element, our target index 238 # will jump down one. 
239 position = position - 1 240 newChild.extract() 241 242 newChild.parent = self 243 previousChild = None 244 if position == 0: 245 newChild.previousSibling = None 246 newChild.previous = self 247 else: 248 previousChild = self.contents[position-1] 249 newChild.previousSibling = previousChild 250 newChild.previousSibling.nextSibling = newChild 251 newChild.previous = previousChild._lastRecursiveChild() 252 if newChild.previous: 253 newChild.previous.next = newChild 254 255 newChildsLastElement = newChild._lastRecursiveChild() 256 257 if position >= len(self.contents): 258 newChild.nextSibling = None 259 260 parent = self 261 parentsNextSibling = None 262 while not parentsNextSibling: 263 parentsNextSibling = parent.nextSibling 264 parent = parent.parent 265 if not parent: # This is the last element in the document. 266 break 267 if parentsNextSibling: 268 newChildsLastElement.next = parentsNextSibling 269 else: 270 newChildsLastElement.next = None 271 else: 272 nextChild = self.contents[position] 273 newChild.nextSibling = nextChild 274 if newChild.nextSibling: 275 newChild.nextSibling.previousSibling = newChild 276 newChildsLastElement.next = nextChild 277 278 if newChildsLastElement.next: 279 newChildsLastElement.next.previous = newChildsLastElement 280 self.contents.insert(position, newChild) 281 282 def append(self, tag): 283 """Appends the given tag to the contents of this tag.""" 284 self.insert(len(self.contents), tag) 285 286 def findNext(self, name=None, attrs={}, text=None, **kwargs): 287 """Returns the first item that matches the given criteria and 288 appears after this Tag in the document.""" 289 return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 290 291 def findAllNext(self, name=None, attrs={}, text=None, limit=None, 292 **kwargs): 293 """Returns all items that match the given criteria and appear 294 after this Tag in the document.""" 295 return self._findAll(name, attrs, text, limit, self.nextGenerator, 296 **kwargs) 297 298 def 
findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 299 """Returns the closest sibling to this Tag that matches the 300 given criteria and appears after this Tag in the document.""" 301 return self._findOne(self.findNextSiblings, name, attrs, text, 302 **kwargs) 303 304 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 305 **kwargs): 306 """Returns the siblings of this Tag that match the given 307 criteria and appear after this Tag in the document.""" 308 return self._findAll(name, attrs, text, limit, 309 self.nextSiblingGenerator, **kwargs) 310 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 311 312 def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 313 """Returns the first item that matches the given criteria and 314 appears before this Tag in the document.""" 315 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 316 317 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 318 **kwargs): 319 """Returns all items that match the given criteria and appear 320 before this Tag in the document.""" 321 return self._findAll(name, attrs, text, limit, self.previousGenerator, 322 **kwargs) 323 fetchPrevious = findAllPrevious # Compatibility with pre-3.x 324 325 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 326 """Returns the closest sibling to this Tag that matches the 327 given criteria and appears before this Tag in the document.""" 328 return self._findOne(self.findPreviousSiblings, name, attrs, text, 329 **kwargs) 330 331 def findPreviousSiblings(self, name=None, attrs={}, text=None, 332 limit=None, **kwargs): 333 """Returns the siblings of this Tag that match the given 334 criteria and appear before this Tag in the document.""" 335 return self._findAll(name, attrs, text, limit, 336 self.previousSiblingGenerator, **kwargs) 337 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 338 339 def findParent(self, name=None, 
attrs={}, **kwargs): 340 """Returns the closest parent of this Tag that matches the given 341 criteria.""" 342 # NOTE: We can't use _findOne because findParents takes a different 343 # set of arguments. 344 r = None 345 l = self.findParents(name, attrs, 1) 346 if l: 347 r = l[0] 348 return r 349 350 def findParents(self, name=None, attrs={}, limit=None, **kwargs): 351 """Returns the parents of this Tag that match the given 352 criteria.""" 353 354 return self._findAll(name, attrs, None, limit, self.parentGenerator, 355 **kwargs) 356 fetchParents = findParents # Compatibility with pre-3.x 357 358 #These methods do the real heavy lifting. 359 360 def _findOne(self, method, name, attrs, text, **kwargs): 361 r = None 362 l = method(name, attrs, text, 1, **kwargs) 363 if l: 364 r = l[0] 365 return r 366 367 def _findAll(self, name, attrs, text, limit, generator, **kwargs): 368 "Iterates over a generator looking for things that match." 369 370 if isinstance(name, SoupStrainer): 371 strainer = name 372 # (Possibly) special case some findAll*(...) searches 373 elif text is None and not limit and not attrs and not kwargs: 374 # findAll*(True) 375 if name is True: 376 return [element for element in generator() 377 if isinstance(element, Tag)] 378 # findAll*('tag-name') 379 elif isinstance(name, basestring): 380 return [element for element in generator() 381 if isinstance(element, Tag) and 382 element.name == name] 383 else: 384 strainer = SoupStrainer(name, attrs, text, **kwargs) 385 # Build a SoupStrainer 386 else: 387 strainer = SoupStrainer(name, attrs, text, **kwargs) 388 results = ResultSet(strainer) 389 g = generator() 390 while True: 391 try: 392 i = next(g) 393 except StopIteration: 394 break 395 if i: 396 found = strainer.search(i) 397 if found: 398 results.append(found) 399 if limit and len(results) >= limit: 400 break 401 return results 402 403 #These Generators can be used to navigate starting from both 404 #NavigableStrings and Tags. 
405 def nextGenerator(self): 406 i = self 407 while i is not None: 408 i = i.next 409 yield i 410 411 def nextSiblingGenerator(self): 412 i = self 413 while i is not None: 414 i = i.nextSibling 415 yield i 416 417 def previousGenerator(self): 418 i = self 419 while i is not None: 420 i = i.previous 421 yield i 422 423 def previousSiblingGenerator(self): 424 i = self 425 while i is not None: 426 i = i.previousSibling 427 yield i 428 429 def parentGenerator(self): 430 i = self 431 while i is not None: 432 i = i.parent 433 yield i 434 435 # Utility methods 436 def substituteEncoding(self, str, encoding=None): 437 encoding = encoding or "utf-8" 438 return str.replace("%SOUP-ENCODING%", encoding) 439 440 def toEncoding(self, s, encoding=None): 441 """Encodes an object to a string in some encoding, or to Unicode. 442 .""" 443 if isinstance(s, text_type): 444 if encoding: 445 s = s.encode(encoding) 446 elif isinstance(s, binary_type): 447 s = s.encode(encoding or "utf8") 448 else: 449 s = self.toEncoding(str(s), encoding or "utf8") 450 return s 451 452 BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))") 453 454 def _sub_entity(self, x): 455 """Used with a regular expression to substitute the 456 appropriate XML entity for an XML special character.""" 457 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 458 459 460class NavigableString(text_type, PageElement): 461 462 def __new__(cls, value): 463 """Create a new NavigableString. 464 465 When unpickling a NavigableString, this method is called with 466 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 467 passed in to the superclass's __new__ or the superclass won't know 468 how to handle non-ASCII characters. 
469 """ 470 if isinstance(value, text_type): 471 return text_type.__new__(cls, value) 472 return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 473 474 def __getnewargs__(self): 475 return (NavigableString.__str__(self),) 476 477 def __getattr__(self, attr): 478 """text.string gives you text. This is for backwards 479 compatibility for Navigable*String, but for CData* it lets you 480 get the string without the CData wrapper.""" 481 if attr == 'string': 482 return self 483 else: 484 raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) 485 486 def __unicode__(self): 487 return str(self).decode(DEFAULT_OUTPUT_ENCODING) 488 489 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 490 # Substitute outgoing XML entities. 491 data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) 492 if encoding: 493 return data.encode(encoding) 494 else: 495 return data 496 497class CData(NavigableString): 498 499 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 500 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) 501 502class ProcessingInstruction(NavigableString): 503 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 504 output = self 505 if "%SOUP-ENCODING%" in output: 506 output = self.substituteEncoding(output, encoding) 507 return "<?%s?>" % self.toEncoding(output, encoding) 508 509class Comment(NavigableString): 510 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 511 return "<!--%s-->" % NavigableString.__str__(self, encoding) 512 513class Declaration(NavigableString): 514 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 515 return "<!%s>" % NavigableString.__str__(self, encoding) 516 517class Tag(PageElement): 518 519 """Represents a found HTML tag with its attributes and contents.""" 520 521 def _convertEntities(self, match): 522 """Used in a call to re.sub to replace HTML, XML, and numeric 523 entities with the appropriate Unicode characters. 
If HTML 524 entities are being converted, any unrecognized entities are 525 escaped.""" 526 try: 527 x = match.group(1) 528 if self.convertHTMLEntities and x in name2codepoint: 529 return unichr(name2codepoint[x]) 530 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 531 if self.convertXMLEntities: 532 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 533 else: 534 return u'&%s;' % x 535 elif len(x) > 0 and x[0] == '#': 536 # Handle numeric entities 537 if len(x) > 1 and x[1] == 'x': 538 return unichr(int(x[2:], 16)) 539 else: 540 return unichr(int(x[1:])) 541 542 elif self.escapeUnrecognizedEntities: 543 return u'&%s;' % x 544 545 except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000) 546 pass 547 548 return u'&%s;' % x 549 550 def __init__(self, parser, name, attrs=None, parent=None, 551 previous=None): 552 "Basic constructor." 553 554 # We don't actually store the parser object: that lets extracted 555 # chunks be garbage-collected 556 self.parserClass = parser.__class__ 557 self.isSelfClosing = parser.isSelfClosingTag(name) 558 self.name = name 559 if attrs is None: 560 attrs = [] 561 elif isinstance(attrs, dict): 562 attrs = attrs.items() 563 self.attrs = attrs 564 self.contents = [] 565 self.setup(parent, previous) 566 self.hidden = False 567 self.containsSubstitutions = False 568 self.convertHTMLEntities = parser.convertHTMLEntities 569 self.convertXMLEntities = parser.convertXMLEntities 570 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 571 572 # Convert any HTML, XML, or numeric entities in the attribute values. 
573 # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8 574 convert = lambda k_val: (k_val[0], 575 re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);", 576 self._convertEntities, 577 k_val[1])) 578 self.attrs = map(convert, self.attrs) 579 580 def getString(self): 581 if (len(self.contents) == 1 582 and isinstance(self.contents[0], NavigableString)): 583 return self.contents[0] 584 585 def setString(self, string): 586 """Replace the contents of the tag with a string""" 587 self.clear() 588 self.append(string) 589 590 string = property(getString, setString) 591 592 def getText(self, separator=u""): 593 if not len(self.contents): 594 return u"" 595 stopNode = self._lastRecursiveChild().next 596 strings = [] 597 current = self.contents[0] 598 while current is not stopNode: 599 if isinstance(current, NavigableString): 600 strings.append(current.strip()) 601 current = current.next 602 return separator.join(strings) 603 604 text = property(getText) 605 606 def get(self, key, default=None): 607 """Returns the value of the 'key' attribute for the tag, or 608 the value given for 'default' if it doesn't have that 609 attribute.""" 610 return self._getAttrMap().get(key, default) 611 612 def clear(self): 613 """Extract all children.""" 614 for child in self.contents[:]: 615 child.extract() 616 617 def index(self, element): 618 for i, child in enumerate(self.contents): 619 if child is element: 620 return i 621 raise ValueError("Tag.index: element not in tag") 622 623 def has_key(self, key): 624 return self._getAttrMap().has_key(key) 625 626 def __getitem__(self, key): 627 """tag[key] returns the value of the 'key' attribute for the tag, 628 and throws an exception if it's not there.""" 629 return self._getAttrMap()[key] 630 631 def __iter__(self): 632 "Iterating over a tag iterates over its contents." 633 return iter(self.contents) 634 635 def __len__(self): 636 "The length of a tag is the length of its list of contents." 
637 return len(self.contents) 638 639 def __contains__(self, x): 640 return x in self.contents 641 642 def __nonzero__(self): 643 "A tag is non-None even if it has no contents." 644 return True 645 646 def __setitem__(self, key, value): 647 """Setting tag[key] sets the value of the 'key' attribute for the 648 tag.""" 649 self._getAttrMap() 650 self.attrMap[key] = value 651 found = False 652 for i in xrange(0, len(self.attrs)): 653 if self.attrs[i][0] == key: 654 self.attrs[i] = (key, value) 655 found = True 656 if not found: 657 self.attrs.append((key, value)) 658 self._getAttrMap()[key] = value 659 660 def __delitem__(self, key): 661 "Deleting tag[key] deletes all 'key' attributes for the tag." 662 for item in self.attrs: 663 if item[0] == key: 664 self.attrs.remove(item) 665 #We don't break because bad HTML can define the same 666 #attribute multiple times. 667 self._getAttrMap() 668 if self.attrMap.has_key(key): 669 del self.attrMap[key] 670 671 def __call__(self, *args, **kwargs): 672 """Calling a tag like a function is the same as calling its 673 findAll() method. Eg. tag('a') returns a list of all the A tags 674 found within this tag.""" 675 return self.findAll(*args, **kwargs) 676 677 def __getattr__(self, tag): 678 #print "Getattr %s.%s" % (self.__class__, tag) 679 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 680 return self.find(tag[:-3]) 681 elif tag.find('__') != 0: 682 return self.find(tag) 683 raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) 684 685 def __eq__(self, other): 686 """Returns true iff this tag has the same name, the same attributes, 687 and the same contents (recursively) as the given tag. 688 689 NOTE: right now this will return false if two tags have the 690 same attributes in a different order. 
Should this be fixed?""" 691 if other is self: 692 return True 693 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 694 return False 695 for i in xrange(0, len(self.contents)): 696 if self.contents[i] != other.contents[i]: 697 return False 698 return True 699 700 def __ne__(self, other): 701 """Returns true iff this tag is not identical to the other tag, 702 as defined in __eq__.""" 703 return not self == other 704 705 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 706 """Renders this tag as a string.""" 707 return self.__str__(encoding) 708 709 def __unicode__(self): 710 return self.__str__(None) 711 712 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 713 prettyPrint=False, indentLevel=0): 714 """Returns a string or Unicode representation of this tag and 715 its contents. To get Unicode, pass None for encoding. 716 717 NOTE: since Python's HTML parser consumes whitespace, this 718 method is not certain to reproduce the whitespace present in 719 the original string.""" 720 721 encodedName = self.toEncoding(self.name, encoding) 722 723 attrs = [] 724 if self.attrs: 725 for key, val in self.attrs: 726 fmt = '%s="%s"' 727 if isinstance(val, basestring): 728 if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 729 val = self.substituteEncoding(val, encoding) 730 731 # The attribute value either: 732 # 733 # * Contains no embedded double quotes or single quotes. 734 # No problem: we enclose it in double quotes. 735 # * Contains embedded single quotes. No problem: 736 # double quotes work here too. 737 # * Contains embedded double quotes. No problem: 738 # we enclose it in single quotes. 739 # * Embeds both single _and_ double quotes. This 740 # can't happen naturally, but it can happen if 741 # you modify an attribute value after parsing 742 # the document. Now we have a bit of a 743 # problem. 
We solve it by enclosing the 744 # attribute in single quotes, and escaping any 745 # embedded single quotes to XML entities. 746 if '"' in val: 747 fmt = "%s='%s'" 748 if "'" in val: 749 # TODO: replace with apos when 750 # appropriate. 751 val = val.replace("'", "&squot;") 752 753 # Now we're okay w/r/t quotes. But the attribute 754 # value might also contain angle brackets, or 755 # ampersands that aren't part of entities. We need 756 # to escape those to XML entities too. 757 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 758 759 attrs.append(fmt % (self.toEncoding(key, encoding), 760 self.toEncoding(val, encoding))) 761 close = '' 762 closeTag = '' 763 if self.isSelfClosing: 764 close = ' /' 765 else: 766 closeTag = '</%s>' % encodedName 767 768 indentTag, indentContents = 0, 0 769 if prettyPrint: 770 indentTag = indentLevel 771 space = (' ' * (indentTag-1)) 772 indentContents = indentTag + 1 773 contents = self.renderContents(encoding, prettyPrint, indentContents) 774 if self.hidden: 775 s = contents 776 else: 777 s = [] 778 attributeString = '' 779 if attrs: 780 attributeString = ' ' + ' '.join(attrs) 781 if prettyPrint: 782 s.append(space) 783 s.append('<%s%s%s>' % (encodedName, attributeString, close)) 784 if prettyPrint: 785 s.append("\n") 786 s.append(contents) 787 if prettyPrint and contents and contents[-1] != "\n": 788 s.append("\n") 789 if prettyPrint and closeTag: 790 s.append(space) 791 s.append(closeTag) 792 if prettyPrint and closeTag and self.nextSibling: 793 s.append("\n") 794 s = ''.join(s) 795 return s 796 797 def decompose(self): 798 """Recursively destroys the contents of this tree.""" 799 self.extract() 800 if len(self.contents) == 0: 801 return 802 current = self.contents[0] 803 while current is not None: 804 next = current.next 805 if isinstance(current, Tag): 806 del current.contents[:] 807 current.parent = None 808 current.previous = None 809 current.previousSibling = None 810 current.next = None 811 
current.nextSibling = None 812 current = next 813 814 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 815 return self.__str__(encoding, True) 816 817 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 818 prettyPrint=False, indentLevel=0): 819 """Renders the contents of this tag as a string in the given 820 encoding. If encoding is None, returns a Unicode string..""" 821 s=[] 822 for c in self: 823 text = None 824 if isinstance(c, NavigableString): 825 text = c.__str__(encoding) 826 elif isinstance(c, Tag): 827 s.append(c.__str__(encoding, prettyPrint, indentLevel)) 828 if text and prettyPrint: 829 text = text.strip() 830 if text: 831 if prettyPrint: 832 s.append(" " * (indentLevel-1)) 833 s.append(text) 834 if prettyPrint: 835 s.append("\n") 836 837 return ''.join(s) 838 839 #Soup methods 840 841 def find(self, name=None, attrs={}, recursive=True, text=None, 842 **kwargs): 843 """Return only the first child of this Tag matching the given 844 criteria.""" 845 r = None 846 l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 847 if l: 848 r = l[0] 849 return r 850 findChild = find 851 852 def findAll(self, name=None, attrs={}, recursive=True, text=None, 853 limit=None, **kwargs): 854 """Extracts a list of Tag objects that match the given 855 criteria. You can specify the name of the Tag and any 856 attributes you want the Tag to have. 857 858 The value of a key-value pair in the 'attrs' map can be a 859 string, a list of strings, a regular expression object, or a 860 callable that takes a string and returns whether or not the 861 string matches for some custom definition of 'matches'. 
The 862 same is true of the tag name.""" 863 generator = self.recursiveChildGenerator 864 if not recursive: 865 generator = self.childGenerator 866 return self._findAll(name, attrs, text, limit, generator, **kwargs) 867 findChildren = findAll 868 869 # Pre-3.x compatibility methods 870 first = find 871 fetch = findAll 872 873 def fetchText(self, text=None, recursive=True, limit=None): 874 return self.findAll(text=text, recursive=recursive, limit=limit) 875 876 def firstText(self, text=None, recursive=True): 877 return self.find(text=text, recursive=recursive) 878 879 #Private methods 880 881 def _getAttrMap(self): 882 """Initializes a map representation of this tag's attributes, 883 if not already initialized.""" 884 if not getattr(self, 'attrMap'): 885 self.attrMap = {} 886 for (key, value) in self.attrs: 887 self.attrMap[key] = value 888 return self.attrMap 889 890 #Generator methods 891 def childGenerator(self): 892 # Just use the iterator from the contents 893 return iter(self.contents) 894 895 def recursiveChildGenerator(self): 896 if not len(self.contents): 897 return # Note: https://stackoverflow.com/a/30217723 (PEP 479) 898 stopNode = self._lastRecursiveChild().next 899 current = self.contents[0] 900 while current is not stopNode: 901 yield current 902 current = current.next 903 904 905# Next, a couple classes to represent queries and their results. 
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer can match on tag name, attribute values, and/or text
    content.  Each criterion may be a string, a regular expression
    object, a list, True, or a callable."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        # NOTE(review): 'attrs' has a mutable default; it is copied
        # before being updated below, so the shared default is never
        # mutated.
        self.name = name
        if isinstance(attrs, basestring):
            # A bare string for 'attrs' is shorthand for a CSS class match.
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either a Tag object
        (as markupName) or a (name, attrs) pair.  Returns the matched
        object (the Tag, or the name) or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            # A Tag supports .get(), so it can stand in for the attr map.
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            # A list of (key, value) pairs; build the
                            # lookup map once, lazily.
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches this strainer against a piece of markup, dispatching
        on its type (list of elements, Tag, or string).  Returns the
        matched element or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s" \
                  % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Core matching predicate: compares one piece of markup (a Tag,
        string, or None) against one criterion (True, callable, regexp,
        list, dict, or string)."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            # True matches any non-None value.
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = text_type(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): dict criterion tests 'markup' for the key
                # with has_key -- Python 2 only; also assumes markup is a
                # mapping here. Verify against callers.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce the criterion to the same string type as the
                # markup before the final equality check.
                if isinstance(markup, text_type):
                    matchAgainst = text_type(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # NOTE(review): list.__init__([]) initializes a throwaway list,
        # not self; the ResultSet starts empty either way.
        list.__init__([])
        self.source = source

# Now, some helper functions.

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Scalars and list items become keys mapped to 'default'; maps are
    merged in as-is."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k,v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__'): # is a list
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

# Now, the parser classes.

class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Parsing-behavior tables; subclasses override these to teach the
    # parser about tag semantics (see BeautifulSoup below).
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default regex-based input cleanups applied before feeding sgmllib:
    # add a space before "/>", and strip whitespace after "<!".
    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        sgmllib.SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised by start_meta when the document is re-parsed with a
            # newly discovered encoding; the re-parse already finished.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts the stored markup to Unicode (via UnicodeDammit),
        optionally massages it, and runs it through the SGML parser."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, text_type):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        sgmllib.SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return sgmllib.SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initializes both parent classes and the parse state
        (tag stack, quote stack, pending text)."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        sgmllib.SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the stack and returns the new current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Pushes a tag onto the stack, attaching it to the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes any buffered character data into the tree as a
        containerClass node, collapsing all-whitespace runs to a single
        space (or newline) unless inside a PRESERVE_WHITESPACE_TAGS tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                # SoupStrainer rejected this top-level text; drop it.
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in xrange(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in xrange(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in xrange(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): 'p' is reassigned from the stack at the top of
            # each iteration, so this line appears to be vestigial.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback for any start tag: applies smart-popping,
        SoupStrainer filtering, and quote-tag (literal) handling, then
        pushes a new Tag node."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback for any end tag: pops the stack back to the
        matching start tag, honoring the quote-tag (literal) state."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Buffer character data; endData() flushes it into the tree.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = sgmllib.SGMLParser.parse_declaration(self, i)
            except sgmllib.SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML documents default to HTML smart-quote conversion.
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in xrange(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; _feed above re-parsed everything.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

class StopParsing(Exception):
    # Raised by start_meta to abandon the current parse after a
    # re-parse with a sniffed encoding has completed.
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes no *args, so this
    # evaluates to {} and 'noscript' is only the (unused) default --
    # confirm whether buildTagMap(None, 'noscript') was intended.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, promote a single-string child tag into
        # an attribute of its parent (unless the attribute exists).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode).  It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
1753# Download from http://chardet.feedparser.org/ 1754try: 1755 import chardet 1756# import chardet.constants 1757# chardet.constants._debug = 1 1758except ImportError: 1759 chardet = None 1760 1761# cjkcodecs and iconv_codec make Python know about more character encodings. 1762# Both are available from http://cjkpython.i18n.org/ 1763# They're built in if you use Python 2.4. 1764try: 1765 import cjkcodecs.aliases 1766except ImportError: 1767 pass 1768try: 1769 import iconv_codec 1770except ImportError: 1771 pass 1772 1773class UnicodeDammit: 1774 """A class for detecting the encoding of a *ML document and 1775 converting it to a Unicode string. If the source encoding is 1776 windows-1252, can replace MS smart quotes with their HTML or XML 1777 equivalents.""" 1778 1779 # This dictionary maps commonly seen values for "charset" in HTML 1780 # meta tags to the corresponding Python codec names. It only covers 1781 # values that aren't in Python's aliases and can't be determined 1782 # by the heuristics in find_codec. 
1783 CHARSET_ALIASES = { "macintosh" : "mac-roman", 1784 "x-sjis" : "shift-jis" } 1785 1786 def __init__(self, markup, overrideEncodings=[], 1787 smartQuotesTo='xml', isHTML=False): 1788 self.declaredHTMLEncoding = None 1789 self.markup, documentEncoding, sniffedEncoding = \ 1790 self._detectEncoding(markup, isHTML) 1791 self.smartQuotesTo = smartQuotesTo 1792 self.triedEncodings = [] 1793 if markup == '' or isinstance(markup, text_type): 1794 self.originalEncoding = None 1795 self.unicode = text_type(markup) 1796 return 1797 1798 u = None 1799 for proposedEncoding in overrideEncodings: 1800 u = self._convertFrom(proposedEncoding) 1801 if u: break 1802 if not u: 1803 for proposedEncoding in (documentEncoding, sniffedEncoding): 1804 u = self._convertFrom(proposedEncoding) 1805 if u: break 1806 1807 # If no luck and we have auto-detection library, try that: 1808 if not u and chardet and not isinstance(self.markup, text_type): 1809 u = self._convertFrom(chardet.detect(self.markup)['encoding']) 1810 1811 # As a last resort, try utf-8 and windows-1252: 1812 if not u: 1813 for proposed_encoding in ("utf-8", "windows-1252"): 1814 u = self._convertFrom(proposed_encoding) 1815 if u: break 1816 1817 self.unicode = u 1818 if not u: self.originalEncoding = None 1819 1820 def _subMSChar(self, orig): 1821 """Changes a MS smart quote character to an XML or HTML 1822 entity.""" 1823 sub = self.MS_CHARS.get(orig) 1824 if isinstance(sub, tuple): 1825 if self.smartQuotesTo == 'xml': 1826 sub = '&#x%s;' % sub[1] 1827 else: 1828 sub = '&%s;' % sub[0] 1829 return sub 1830 1831 def _convertFrom(self, proposed): 1832 proposed = self.find_codec(proposed) 1833 if not proposed or proposed in self.triedEncodings: 1834 return None 1835 self.triedEncodings.append(proposed) 1836 markup = self.markup 1837 1838 # Convert smart quotes to HTML if coming from an encoding 1839 # that might have them. 
1840 if self.smartQuotesTo and proposed.lower() in("windows-1252", 1841 "iso-8859-1", 1842 "iso-8859-2"): 1843 markup = re.compile("([\x80-\x9f])").sub \ 1844 (lambda x: self._subMSChar(x.group(1)), 1845 markup) 1846 1847 try: 1848 # print "Trying to convert document to %s" % proposed 1849 u = self._toUnicode(markup, proposed) 1850 self.markup = u 1851 self.originalEncoding = proposed 1852 except Exception as e: 1853 # print "That didn't work!" 1854 # print e 1855 return None 1856 #print "Correct encoding: %s" % proposed 1857 return self.markup 1858 1859 def _toUnicode(self, data, encoding): 1860 '''Given a string and its encoding, decodes the string into Unicode. 1861 %encoding is a string recognized by encodings.aliases''' 1862 1863 # strip Byte Order Mark (if present) 1864 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ 1865 and (data[2:4] != '\x00\x00'): 1866 encoding = 'utf-16be' 1867 data = data[2:] 1868 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ 1869 and (data[2:4] != '\x00\x00'): 1870 encoding = 'utf-16le' 1871 data = data[2:] 1872 elif data[:3] == '\xef\xbb\xbf': 1873 encoding = 'utf-8' 1874 data = data[3:] 1875 elif data[:4] == '\x00\x00\xfe\xff': 1876 encoding = 'utf-32be' 1877 data = data[4:] 1878 elif data[:4] == '\xff\xfe\x00\x00': 1879 encoding = 'utf-32le' 1880 data = data[4:] 1881 newdata = text_type(data, encoding) 1882 return newdata 1883 1884 def _detectEncoding(self, xml_data, isHTML=False): 1885 """Given a document, tries to detect its XML encoding.""" 1886 xml_encoding = sniffed_xml_encoding = None 1887 try: 1888 if xml_data[:4] == '\x4c\x6f\xa7\x94': 1889 # EBCDIC 1890 xml_data = self._ebcdic_to_ascii(xml_data) 1891 elif xml_data[:4] == '\x00\x3c\x00\x3f': 1892 # UTF-16BE 1893 sniffed_xml_encoding = 'utf-16be' 1894 xml_data = text_type(xml_data, 'utf-16be').encode('utf-8') 1895 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ 1896 and (xml_data[2:4] != '\x00\x00'): 1897 # UTF-16BE with BOM 1898 sniffed_xml_encoding = 
'utf-16be' 1899 xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8') 1900 elif xml_data[:4] == '\x3c\x00\x3f\x00': 1901 # UTF-16LE 1902 sniffed_xml_encoding = 'utf-16le' 1903 xml_data = text_type(xml_data, 'utf-16le').encode('utf-8') 1904 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ 1905 (xml_data[2:4] != '\x00\x00'): 1906 # UTF-16LE with BOM 1907 sniffed_xml_encoding = 'utf-16le' 1908 xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8') 1909 elif xml_data[:4] == '\x00\x00\x00\x3c': 1910 # UTF-32BE 1911 sniffed_xml_encoding = 'utf-32be' 1912 xml_data = text_type(xml_data, 'utf-32be').encode('utf-8') 1913 elif xml_data[:4] == '\x3c\x00\x00\x00': 1914 # UTF-32LE 1915 sniffed_xml_encoding = 'utf-32le' 1916 xml_data = text_type(xml_data, 'utf-32le').encode('utf-8') 1917 elif xml_data[:4] == '\x00\x00\xfe\xff': 1918 # UTF-32BE with BOM 1919 sniffed_xml_encoding = 'utf-32be' 1920 xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8') 1921 elif xml_data[:4] == '\xff\xfe\x00\x00': 1922 # UTF-32LE with BOM 1923 sniffed_xml_encoding = 'utf-32le' 1924 xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8') 1925 elif xml_data[:3] == '\xef\xbb\xbf': 1926 # UTF-8 with BOM 1927 sniffed_xml_encoding = 'utf-8' 1928 xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8') 1929 else: 1930 sniffed_xml_encoding = 'ascii' 1931 pass 1932 except: 1933 xml_encoding_match = None 1934 xml_encoding_match = re.compile( 1935 r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) 1936 if not xml_encoding_match and isHTML: 1937 regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) 1938 xml_encoding_match = regexp.search(xml_data) 1939 if xml_encoding_match is not None: 1940 xml_encoding = xml_encoding_match.groups()[0].lower() 1941 if isHTML: 1942 self.declaredHTMLEncoding = xml_encoding 1943 if sniffed_xml_encoding and \ 1944 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 1945 'iso-10646-ucs-4', 'ucs-4', 
'csucs4', 1946 'utf-16', 'utf-32', 'utf_16', 'utf_32', 1947 'utf16', 'u16')): 1948 xml_encoding = sniffed_xml_encoding 1949 return xml_data, xml_encoding, sniffed_xml_encoding 1950 1951 1952 def find_codec(self, charset): 1953 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ 1954 or (charset and self._codec(charset.replace("-", ""))) \ 1955 or (charset and self._codec(charset.replace("-", "_"))) \ 1956 or charset 1957 1958 def _codec(self, charset): 1959 if not charset: return charset 1960 codec = None 1961 try: 1962 codecs.lookup(charset) 1963 codec = charset 1964 except (LookupError, ValueError): 1965 pass 1966 return codec 1967 1968 EBCDIC_TO_ASCII_MAP = None 1969 def _ebcdic_to_ascii(self, s): 1970 c = self.__class__ 1971 if not c.EBCDIC_TO_ASCII_MAP: 1972 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 1973 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 1974 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 1975 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 1976 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 1977 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 1978 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 1979 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 1980 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 1981 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 1982 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 1983 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 1984 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 1985 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 1986 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 1987 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 1988 250,251,252,253,254,255) 1989 import string 1990 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ 1991 ''.join(map(chr, xrange(256))), ''.join(map(chr, emap))) 1992 return s.translate(c.EBCDIC_TO_ASCII_MAP) 1993 1994 
    # Windows-1252 bytes 0x80-0x9f mapped to the replacement used by
    # _subMSChar: either an (entity-name, hex-codepoint) pair, or a literal
    # replacement string ('?' / ' ') for bytes with no named equivalent.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}

#######################################################################


#By default, act as an HTML pretty-printer: read an HTML document from
#standard input and write a pretty-printed version to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())