1# Use of this source code is governed by the MIT license. 2__license__ = "MIT" 3 4try: 5 from collections.abc import Callable # Python 3.6 6except ImportError as e: 7 from collections import Callable 8import re 9import sys 10import warnings 11try: 12 import soupsieve 13except ImportError as e: 14 soupsieve = None 15 warnings.warn( 16 'The soupsieve package is not installed. CSS selectors cannot be used.' 17 ) 18 19from bs4.formatter import ( 20 Formatter, 21 HTMLFormatter, 22 XMLFormatter, 23) 24 25DEFAULT_OUTPUT_ENCODING = "utf-8" 26PY3K = (sys.version_info[0] > 2) 27 28nonwhitespace_re = re.compile(r"\S+") 29 30# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on 31# the off chance someone imported it for their own use. 32whitespace_re = re.compile(r"\s+") 33 34def _alias(attr): 35 """Alias one attribute name to another for backward compatibility""" 36 @property 37 def alias(self): 38 return getattr(self, attr) 39 40 @alias.setter 41 def alias(self): 42 return setattr(self, attr) 43 return alias 44 45 46# These encodings are recognized by Python (so PageElement.encode 47# could theoretically support them) but XML and HTML don't recognize 48# them (so they should not show up in an XML or HTML document as that 49# document's encoding). 50# 51# If an XML document is encoded in one of these encodings, no encoding 52# will be mentioned in the XML declaration. If an HTML document is 53# encoded in one of these encodings, and the HTML document has a 54# <meta> tag that mentions an encoding, the encoding will be given as 55# the empty string. 56# 57# Source: 58# https://docs.python.org/3/library/codecs.html#python-specific-encodings 59PYTHON_SPECIFIC_ENCODINGS = set([ 60 "idna", 61 "mbcs", 62 "oem", 63 "palmos", 64 "punycode", 65 "raw_unicode_escape", 66 "undefined", 67 "unicode_escape", 68 "raw-unicode-escape", 69 "unicode-escape", 70 "string-escape", 71 "string_escape", 72]) 73 74 75class NamespacedAttribute(str): 76 """A namespaced string (e.g. 
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    'charset' attribute's value is represented by one of these objects.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from."""
        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def encode(self, encoding):
        """Return the charset name to use when the document is encoded.

        :param encoding: Name of the encoding the document is being
            converted to.
        :return: `encoding` itself, or the empty string when `encoding`
            is listed in PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
137 return str.__new__(str, original_value) 138 139 obj = str.__new__(cls, original_value) 140 obj.original_value = original_value 141 return obj 142 143 def encode(self, encoding): 144 if encoding in PYTHON_SPECIFIC_ENCODINGS: 145 return '' 146 def rewrite(match): 147 return match.group(1) + encoding 148 return self.CHARSET_RE.sub(rewrite, self.original_value) 149 150 151class PageElement(object): 152 """Contains the navigational information for some part of the page: 153 that is, its current location in the parse tree. 154 155 NavigableString, Tag, etc. are all subclasses of PageElement. 156 """ 157 158 def setup(self, parent=None, previous_element=None, next_element=None, 159 previous_sibling=None, next_sibling=None): 160 """Sets up the initial relations between this element and 161 other elements. 162 163 :param parent: The parent of this element. 164 165 :param previous_element: The element parsed immediately before 166 this one. 167 168 :param next_element: The element parsed immediately before 169 this one. 170 171 :param previous_sibling: The most recently encountered element 172 on the same level of the parse tree as this one. 173 174 :param previous_sibling: The next element to be encountered 175 on the same level of the parse tree as this one. 
176 """ 177 self.parent = parent 178 179 self.previous_element = previous_element 180 if previous_element is not None: 181 self.previous_element.next_element = self 182 183 self.next_element = next_element 184 if self.next_element is not None: 185 self.next_element.previous_element = self 186 187 self.next_sibling = next_sibling 188 if self.next_sibling is not None: 189 self.next_sibling.previous_sibling = self 190 191 if (previous_sibling is None 192 and self.parent is not None and self.parent.contents): 193 previous_sibling = self.parent.contents[-1] 194 195 self.previous_sibling = previous_sibling 196 if previous_sibling is not None: 197 self.previous_sibling.next_sibling = self 198 199 def format_string(self, s, formatter): 200 """Format the given string using the given formatter. 201 202 :param s: A string. 203 :param formatter: A Formatter object, or a string naming one of the standard formatters. 204 """ 205 if formatter is None: 206 return s 207 if not isinstance(formatter, Formatter): 208 formatter = self.formatter_for_name(formatter) 209 output = formatter.substitute(s) 210 return output 211 212 def formatter_for_name(self, formatter): 213 """Look up or create a Formatter for the given identifier, 214 if necessary. 215 216 :param formatter: Can be a Formatter object (used as-is), a 217 function (used as the entity substitution hook for an 218 XMLFormatter or HTMLFormatter), or a string (used to look 219 up an XMLFormatter or HTMLFormatter in the appropriate 220 registry. 221 """ 222 if isinstance(formatter, Formatter): 223 return formatter 224 if self._is_xml: 225 c = XMLFormatter 226 else: 227 c = HTMLFormatter 228 if isinstance(formatter, Callable): 229 return c(entity_substitution=formatter) 230 return c.REGISTRY[formatter] 231 232 @property 233 def _is_xml(self): 234 """Is this element part of an XML tree or an HTML tree? 235 236 This is used in formatter_for_name, when deciding whether an 237 XMLFormatter or HTMLFormatter is more appropriate. 
@property
def stripped_strings(self):
    """Iterate over the strings produced by `_all_strings` with
    stripping switched on.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping the
    rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`. Unless the call was a no-op (the element was
        replaced with itself), `self` is no longer part of the tree.
    :raise ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op, but callers are
        # still promised `self` as the return value.
        return self
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")
    old_parent = self.parent
    my_index = old_parent.index(self)
    self.extract(_self_index=my_index)
    # Put the replacements where this element used to be, in order.
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self
replaceWith = replace_with  # BS3

def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Iterate over a reversed copy: every insert moves a child out of
    # self.contents, and inserting at a fixed index in reverse keeps the
    # children in their original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
353 354 :param _self_index: The location of this element in its parent's 355 .contents, if known. Passing this in allows for a performance 356 optimization. 357 358 :return: `self`, no longer part of the tree. 359 """ 360 if self.parent is not None: 361 if _self_index is None: 362 _self_index = self.parent.index(self) 363 del self.parent.contents[_self_index] 364 365 #Find the two elements that would be next to each other if 366 #this element (and any children) hadn't been parsed. Connect 367 #the two. 368 last_child = self._last_descendant() 369 next_element = last_child.next_element 370 371 if (self.previous_element is not None and 372 self.previous_element is not next_element): 373 self.previous_element.next_element = next_element 374 if next_element is not None and next_element is not self.previous_element: 375 next_element.previous_element = self.previous_element 376 self.previous_element = None 377 last_child.next_element = None 378 379 self.parent = None 380 if (self.previous_sibling is not None 381 and self.previous_sibling is not self.next_sibling): 382 self.previous_sibling.next_sibling = self.next_sibling 383 if (self.next_sibling is not None 384 and self.next_sibling is not self.previous_sibling): 385 self.next_sibling.previous_sibling = self.previous_sibling 386 self.previous_sibling = self.next_sibling = None 387 return self 388 389 def _last_descendant(self, is_initialized=True, accept_self=True): 390 """Finds the last element beneath this object to be parsed. 391 392 :param is_initialized: Has `setup` been called on this PageElement 393 yet? 394 :param accept_self: Is `self` an acceptable answer to the question? 
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.contents` by the new PageElement. A position past the
       end is clamped to `len(self.contents)`.
    :param new_child: A PageElement. A plain `str` is wrapped in a
       NavigableString first; a BeautifulSoup object is not nested but
       has its children inserted one at a time.
    :raise ValueError: If `new_child` is None or is this element itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the children one at a time.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        # Detach the element from wherever it currently lives before
        # relinking it here.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element parsed just
        # before it is this container itself.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        # Otherwise it follows the child currently at position - 1, and
        # in document order it follows that child's last descendant.
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    # The forward document-order link has to leave from the last element
    # inside new_child, not from new_child itself.
    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up the ancestors until one of them has a next sibling;
        # that sibling is what comes next in the document.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child: that child becomes both
        # the next sibling and the next element in document order.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Fix the back-reference of whatever now follows the inserted subtree.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
def insert_before(self, *args):
    """Place the given element(s) directly in front of this one.

    Every inserted element ends up under this element's parent,
    immediately preceding this element.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if this element
        is itself among `args`.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for new_sibling in args:
        # Detach first: if the new element is already a sibling,
        # removing it may shift this element's index.
        if isinstance(new_sibling, PageElement):
            new_sibling.extract()
        container.insert(container.index(self), new_sibling)
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first PageElement after this one in the document
    that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Single-result lookup built on top of the multi-result search.
    search = self.find_all_next
    return self._find_one(search, name, attrs, text, **kwargs)
findNext = find_next  # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Search the siblings that follow this PageElement for everything
    matching the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    later_siblings = self.next_siblings
    return self._find_all(
        name, attrs, text, limit, later_siblings, **kwargs)
findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Walk backwards through the document from this PageElement and
    collect every PageElement matching the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    earlier_elements = self.previous_elements
    return self._find_all(
        name, attrs, text, limit, earlier_elements, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Search the siblings that precede this PageElement for everything
    matching the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    earlier_siblings = self.previous_siblings
    return self._find_all(
        name, attrs, text, limit, earlier_siblings, **kwargs)
findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a ready-made SoupStrainer
        (in which case it is used as-is and the other filters are not
        used to build a new one).
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
        When this is None, a 'string' keyword argument is used in
        its place.
    :param limit: Stop looking after finding this many results.
    :param generator: The iterable of PageElements to search.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of the matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The isinstance() check must guard BOTH name comparisons,
            # so the alternatives are grouped explicitly: 'and' binds
            # tighter than 'or'.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    # General case: run every element through the strainer.
    results = ResultSet(strainer)
    for candidate in generator:
        if candidate:
            found = strainer.search(candidate)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
@property
def decomposed(self):
    """Report whether this PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    if flag:
        return flag
    return False
def output_ready(self, formatter="minimal"):
    """Format this string and wrap it in the class's PREFIX and SUFFIX.

    :param formatter: A Formatter object, or a string naming one of the
        standard formatters.
    :return: PREFIX, the formatted string and SUFFIX, concatenated.
    """
    formatted = self.format_string(self, formatter)
    return self.PREFIX + formatted + self.SUFFIX
992 993 This makes it easy for NavigableString to implement methods 994 like get_text() as conveniences, creating a consistent 995 text-extraction API across all PageElements. 996 997 :param strip: If True, all strings will be stripped before being 998 yielded. 999 1000 :param types: A tuple of NavigableString subclasses. If this 1001 NavigableString isn't one of those subclasses, the 1002 sequence will be empty. By default, the subclasses 1003 considered are NavigableString and CData objects. That 1004 means no comments, processing instructions, etc. 1005 1006 :yield: A sequence that either contains this string, or is empty. 1007 1008 """ 1009 if types is self.default: 1010 # This is kept in Tag because it's full of subclasses of 1011 # this class, which aren't defined until later in the file. 1012 types = Tag.DEFAULT_INTERESTING_STRING_TYPES 1013 1014 # Do nothing if the caller is looking for specific types of 1015 # string, and we're of a different type. 1016 my_type = type(self) 1017 if types is not None: 1018 if isinstance(types, type): 1019 # Looking for a single type. 1020 if my_type is not types: 1021 return 1022 elif my_type not in types: 1023 # Looking for one of a list of types. 1024 return 1025 1026 value = self 1027 if strip: 1028 value = value.strip() 1029 if len(value) > 0: 1030 yield value 1031 strings = property(_all_strings) 1032 1033class PreformattedString(NavigableString): 1034 """A NavigableString not subject to the normal formatting rules. 1035 1036 This is an abstract class used for special kinds of strings such 1037 as comments (the Comment class) and CDATA blocks (the CData 1038 class). 1039 """ 1040 1041 PREFIX = '' 1042 SUFFIX = '' 1043 1044 def output_ready(self, formatter=None): 1045 """Make this string ready for output by adding any subclass-specific 1046 prefix or suffix. 1047 1048 :param formatter: A Formatter object, or a string naming one 1049 of the standard formatters. 
class Doctype(PreformattedString):
    """A document type declaration."""

    # Text placed around the declaration when it is rendered.
    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class the method was called on
            (Doctype, or a subclass of it).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # With a public ID, the system ID follows it directly,
            # without its own keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Instantiate `cls` rather than Doctype so that calling this
        # on a subclass returns an instance of that subclass.
        return cls(value)
1167 :param parent: The PageElement to use as this Tag's parent. 1168 :param previous: The PageElement that was parsed immediately before 1169 this tag. 1170 :param is_xml: If True, this is an XML tag. Otherwise, this is an 1171 HTML tag. 1172 :param sourceline: The line number where this tag was found in its 1173 source document. 1174 :param sourcepos: The character position within `sourceline` where this 1175 tag was found. 1176 :param can_be_empty_element: If True, this tag should be 1177 represented as <tag/>. If False, this tag should be represented 1178 as <tag></tag>. 1179 :param cdata_list_attributes: A list of attributes whose values should 1180 be treated as CDATA if they ever show up on this tag. 1181 :param preserve_whitespace_tags: A list of tag names whose contents 1182 should have their whitespace preserved. 1183 :param interesting_string_types: This is a NavigableString 1184 subclass or a tuple of them. When iterating over this 1185 Tag's strings in methods like Tag.strings or Tag.get_text, 1186 these are the types of strings that are interesting enough 1187 to be considered. The default is to consider 1188 NavigableString and CData the only interesting string 1189 subtypes. 1190 """ 1191 if parser is None: 1192 self.parser_class = None 1193 else: 1194 # We don't actually store the parser object: that lets extracted 1195 # chunks be garbage-collected. 
1196 self.parser_class = parser.__class__ 1197 if name is None: 1198 raise ValueError("No value provided for new tag's name.") 1199 self.name = name 1200 self.namespace = namespace 1201 self.prefix = prefix 1202 if ((not builder or builder.store_line_numbers) 1203 and (sourceline is not None or sourcepos is not None)): 1204 self.sourceline = sourceline 1205 self.sourcepos = sourcepos 1206 if attrs is None: 1207 attrs = {} 1208 elif attrs: 1209 if builder is not None and builder.cdata_list_attributes: 1210 attrs = builder._replace_cdata_list_attribute_values( 1211 self.name, attrs) 1212 else: 1213 attrs = dict(attrs) 1214 else: 1215 attrs = dict(attrs) 1216 1217 # If possible, determine ahead of time whether this tag is an 1218 # XML tag. 1219 if builder: 1220 self.known_xml = builder.is_xml 1221 else: 1222 self.known_xml = is_xml 1223 self.attrs = attrs 1224 self.contents = [] 1225 self.setup(parent, previous) 1226 self.hidden = False 1227 1228 if builder is None: 1229 # In the absence of a TreeBuilder, use whatever values were 1230 # passed in here. They're probably None, unless this is a copy of some 1231 # other tag. 1232 self.can_be_empty_element = can_be_empty_element 1233 self.cdata_list_attributes = cdata_list_attributes 1234 self.preserve_whitespace_tags = preserve_whitespace_tags 1235 self.interesting_string_types = interesting_string_types 1236 else: 1237 # Set up any substitutions for this tag, such as the charset in a META tag. 1238 builder.set_up_substitutions(self) 1239 1240 # Ask the TreeBuilder whether this tag might be an empty-element tag. 1241 self.can_be_empty_element = builder.can_be_empty_element(name) 1242 1243 # Keep track of the list of attributes of this tag that 1244 # might need to be treated as a list. 1245 # 1246 # For performance reasons, we store the whole data structure 1247 # rather than asking the question of every tag. 
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class, carrying the same name,
        namespace, prefix, attributes and rendering settings, whose
        children are copies of this Tag's children.
    """
    # NOTE(review): `builder` is not assigned in Tag.__init__, so on a
    # plain Tag this lookup presumably falls through to __getattr__
    # and yields whatever find("builder") returns - confirm.
    clone = type(self)(
        None, self.builder, self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=self.sourceline, sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        # Without this, a builder-less clone ends up with
        # interesting_string_types=None, and _all_strings() then
        # accepts every kind of string instead of the types the
        # original Tag considers interesting.
        interesting_string_types=self.interesting_string_types,
    )
    # Carry over state that the constructor may have set differently.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    :return: If this element has exactly one child and that child is
        a NavigableString, the child itself. If the single child is
        not a NavigableString, that child's own `.string`
        (recursively). If this element has no children, or more than
        one child, None.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, NavigableString):
            return only_child
        return only_child.string
    return None

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    All current children are removed with clear(), then a new object
    built from `string` by its own class is appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
def clear(self, decompose=False):
    """Remove every child of this PageElement.

    :param decompose: If this is True, children that are Tags are
        removed with decompose() (a more destructive method); all
        other children, and every child when this is False, are
        removed with extract().
    """
    # Iterate over a snapshot, because removing a child is expected
    # to modify self.contents.
    for child in list(self.contents):
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself when get() returns a list; otherwise a
        new one-element list wrapping whatever get() returned
        (including the default).
    """
    found = self.get(key, default)
    if isinstance(found, list):
        return found
    return [found]
tag('a') returns a list of all the A tags 1515 found within this tag.""" 1516 return self.find_all(*args, **kwargs) 1517 1518 def __getattr__(self, tag): 1519 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" 1520 #print("Getattr %s.%s" % (self.__class__, tag)) 1521 if len(tag) > 3 and tag.endswith('Tag'): 1522 # BS3: soup.aTag -> "soup.find("a") 1523 tag_name = tag[:-3] 1524 warnings.warn( 1525 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( 1526 name=tag_name 1527 ) 1528 ) 1529 return self.find(tag_name) 1530 # We special case contents to avoid recursion. 1531 elif not tag.startswith("__") and not tag == "contents": 1532 return self.find(tag) 1533 raise AttributeError( 1534 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1535 1536 def __eq__(self, other): 1537 """Returns true iff this Tag has the same name, the same attributes, 1538 and the same contents (recursively) as `other`.""" 1539 if self is other: 1540 return True 1541 if (not hasattr(other, 'name') or 1542 not hasattr(other, 'attrs') or 1543 not hasattr(other, 'contents') or 1544 self.name != other.name or 1545 self.attrs != other.attrs or 1546 len(self) != len(other)): 1547 return False 1548 for i, my_child in enumerate(self.contents): 1549 if my_child != other.contents[i]: 1550 return False 1551 return True 1552 1553 def __ne__(self, other): 1554 """Returns true iff this Tag is not identical to `other`, 1555 as defined in __eq__.""" 1556 return not self == other 1557 1558 def __repr__(self, encoding="unicode-escape"): 1559 """Renders this PageElement as a string. 1560 1561 :param encoding: The encoding to use (Python 2 only). 1562 :return: Under Python 2, a bytestring; under Python 3, 1563 a Unicode string. 1564 """ 1565 if PY3K: 1566 # "The return value must be a string object", i.e. 
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    The element is first rendered to Unicode with decode(), and that
    text is then encoded.

    :param encoding: The destination encoding. It is also passed to
        decode() as the eventual encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        str.encode() and should be one of the constants defined by
        Python.
    :return: A bytestring.
    """
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
1625 :param eventual_encoding: The tag is destined to be 1626 encoded into this encoding. This method is _not_ 1627 responsible for performing that encoding. This information 1628 is passed in so that it can be substituted in if the 1629 document contains a <META> tag that mentions the document's 1630 encoding. 1631 :param formatter: A Formatter object, or a string naming one of 1632 the standard formatters. 1633 """ 1634 1635 # First off, turn a non-Formatter `formatter` into a Formatter 1636 # object. This will stop the lookup from happening over and 1637 # over again. 1638 if not isinstance(formatter, Formatter): 1639 formatter = self.formatter_for_name(formatter) 1640 attributes = formatter.attributes(self) 1641 attrs = [] 1642 for key, val in attributes: 1643 if val is None: 1644 decoded = key 1645 else: 1646 if isinstance(val, list) or isinstance(val, tuple): 1647 val = ' '.join(val) 1648 elif not isinstance(val, str): 1649 val = str(val) 1650 elif ( 1651 isinstance(val, AttributeValueWithCharsetSubstitution) 1652 and eventual_encoding is not None 1653 ): 1654 val = val.encode(eventual_encoding) 1655 1656 text = formatter.attribute_value(val) 1657 decoded = ( 1658 str(key) + '=' 1659 + formatter.quoted_attribute_value(text)) 1660 attrs.append(decoded) 1661 close = '' 1662 closeTag = '' 1663 1664 prefix = '' 1665 if self.prefix: 1666 prefix = self.prefix + ":" 1667 1668 if self.is_empty_element: 1669 close = formatter.void_element_close_prefix or '' 1670 else: 1671 closeTag = '</%s%s>' % (prefix, self.name) 1672 1673 pretty_print = self._should_pretty_print(indent_level) 1674 space = '' 1675 indent_space = '' 1676 if indent_level is not None: 1677 indent_space = (' ' * (indent_level - 1)) 1678 if pretty_print: 1679 space = indent_space 1680 indent_contents = indent_level + 1 1681 else: 1682 indent_contents = None 1683 contents = self.decode_contents( 1684 indent_contents, eventual_encoding, formatter 1685 ) 1686 1687 if self.hidden: 1688 # This is the 
'document root' object. 1689 s = contents 1690 else: 1691 s = [] 1692 attribute_string = '' 1693 if attrs: 1694 attribute_string = ' ' + ' '.join(attrs) 1695 if indent_level is not None: 1696 # Even if this particular tag is not pretty-printed, 1697 # we should indent up to the start of the tag. 1698 s.append(indent_space) 1699 s.append('<%s%s%s%s>' % ( 1700 prefix, self.name, attribute_string, close)) 1701 if pretty_print: 1702 s.append("\n") 1703 s.append(contents) 1704 if pretty_print and contents and contents[-1] != "\n": 1705 s.append("\n") 1706 if pretty_print and closeTag: 1707 s.append(space) 1708 s.append(closeTag) 1709 if indent_level is not None and closeTag and self.next_sibling: 1710 # Even if this particular tag is not pretty-printed, 1711 # we're now done with the tag, and we should add a 1712 # newline if appropriate. 1713 s.append("\n") 1714 s = ''.join(s) 1715 return s 1716 1717 def _should_pretty_print(self, indent_level): 1718 """Should this tag be pretty-printed? 1719 1720 Most of them should, but some (such as <pre> in HTML 1721 documents) should not. 1722 """ 1723 return ( 1724 indent_level is not None 1725 and ( 1726 not self.preserve_whitespace_tags 1727 or self.name not in self.preserve_whitespace_tags 1728 ) 1729 ) 1730 1731 def prettify(self, encoding=None, formatter="minimal"): 1732 """Pretty-print this PageElement as a string. 1733 1734 :param encoding: The eventual encoding of the string. If this is None, 1735 a Unicode string will be returned. 1736 :param formatter: A Formatter object, or a string naming one of 1737 the standard formatters. 1738 :return: A Unicode string (if encoding==None) or a bytestring 1739 (otherwise). 
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The concatenated renderings of all children.
    """
    # First off, turn a string formatter into a Formatter object. This
    # will stop the lookup from happening over and over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    # Whether this tag preserves whitespace depends only on the tag
    # itself, so work it out once instead of once per child.
    preserve_whitespace = (
        self.preserve_whitespace_tags
        and self.name in self.preserve_whitespace_tags
    )
    reformat = pretty_print and not preserve_whitespace

    pieces = []
    for child in self:
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
        elif isinstance(child, Tag):
            # Child tags take care of their own indentation.
            pieces.append(
                child.decode(indent_level, eventual_encoding, formatter))
            continue
        else:
            continue

        # Note the truthiness test: an indent_level of 0 turns on
        # pretty-printing but does not strip the text.
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        if reformat:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if reformat:
            pieces.append("\n")
    return ''.join(pieces)
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    This is find_all() with a limit of 1. All find_* methods take a
    common set of arguments. See the online documentation for
    detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values. The
        default is a single shared dict, so it must never be mutated.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content; passed through to
        find_all() unchanged.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None if find_all() returns nothing.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    The walk starts at the first child and follows the
    `.next_element` chain until it passes the last descendant, so
    elements are produced in the order of that chain. If there are
    no children, nothing is yielded.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just past the last descendant marks where to stop.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. When this is
        None, `self._namespaces` is used instead.

    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0 (presumably "no limit" -
        see the SoupSieve documentation).

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package could not
        be imported.
    """
    prefix_map = self._namespaces if namespaces is None else namespaces
    max_results = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matches = soupsieve.select(
        selector, self, prefix_map, max_results, **kwargs)

    # Wrap the plain results in a ResultSet, for consistency and
    # because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, matches)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            keyword `class_` is translated to the 'class' attribute.
        """
        # The shared `attrs={}` default is safe here: it is never
        # mutated, only copied or replaced.
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy so the caller's dictionary is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search filter into the form the matching code expects.

        :param value: A filter passed in by the caller.
        :return: `value` itself if it is a Unicode string, a callable,
            a regular expression (anything with a `match` attribute),
            a boolean, or None; a decoded string for a bytestring; a
            list for any other iterable; otherwise `str(value)`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, Callable, bool))
                or hasattr(value, 'match') or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer."""
        if self.text:
            # The text filter may be a regular expression, a callable,
            # a list or True. __str__ must return an actual string, so
            # coerce it instead of returning the filter object itself.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: A dictionary of attributes as found in
            some markup, or an iterable of (key, value) pairs.

        :return: The matching Tag (or the tag name, when only a name
            was given) if the prospective tag matches this
            SoupStrainer; None if it does not. The quick name-mismatch
            check returns False.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is given the raw name and attributes
        # only when no real Tag object exists yet.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a lookup table from (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is of a type this method does
            not know how to match against.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether one piece of markup satisfies one filter.

        :param markup: A Tag, a string, a list or tuple of strings (a
            multi-valued attribute), or None.
        :param match_against: A normalized filter: True, a callable,
            a string, an object with a `search` method (such as a
            compiled regular expression), or an iterable of filters.
        :param already_tried: Filters already examined while walking
            an iterable filter; used to avoid infinite recursion.
        :return: A truthy value if the markup matches. The regular
            expression branch returns the result of `search()` rather
            than a strict boolean.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match


class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super(ResultSet, self).__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix."""
        # __getattr__ only runs when normal lookup fails, so real list
        # methods and the `source` attribute are unaffected.
        raise AttributeError(
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )