1try: 2 from collections.abc import Callable # Python 3.6 3except ImportError as e: 4 from collections import Callable 5import sys 6from collections import OrderedDict 7 8from pdb import set_trace 9import re 10import warnings 11from sigil_bs4.dammit import EntitySubstitution 12 13DEFAULT_OUTPUT_ENCODING = "utf-8" 14 15whitespace_re = re.compile("\s+") 16 17NON_BREAKING_INLINE_TAGS = ("a","abbr","acronym","b","bdo","big","br", 18 "button","cite","code","del","dfn","em","font","i","image","img", 19 "input","ins","kbd","label","map","mark", "nobr","object","q", 20 "ruby","rt","s","samp","select","small","span","strike","strong", 21 "sub","sup","textarea","tt","u","var","wbr","mbp:nu") 22 23PRESERVE_WHITESPACE_TAGS = ("code","pre","textarea","script","style") 24 25VOID_TAGS = ("area","base","basefont","bgsound","br","col","command", 26 "embed","event-source","frame","hr","img","input","keygen", 27 "link","meta","param","source","spacer","track","wbr", 28 "mbp:pagebreak") 29 30NO_ENTITY_SUB_TAGS = ("script", "style") 31 32SPECIAL_HANDLING_TAGS = ("html", "body") 33 34STRUCTURAL_TAGS = ("article","aside","blockquote","body","canvas", 35 "colgroup","div","dl","figure","footer","head","header","hr","html", 36 "ol","section","table","tbody","tfoot","thead","td","th","tr","ul") 37 38OTHER_TEXTHOLDING_TAGS = ("address","caption","dd","div","dt","h1","h2", 39 "h3","h4","h5","h6","legend","li","option","p","td","th","title") 40 41EBOOK_XML_PARENT_TAGS = ("package","metadata","manifest","spine","guide","ncx", 42 "head","doctitle","docauthor","navmap", "navpoint", 43 "navlabel", "pagelist", "pagetarget") 44 45def _alias(attr): 46 """Alias one attribute name to another for backward compatibility""" 47 @property 48 def alias(self): 49 return getattr(self, attr) 50 51 @alias.setter 52 def alias(self): 53 return setattr(self, attr) 54 return alias 55 56 57class NamespacedAttribute(str): 58 59 def __new__(cls, prefix, name, namespace=None): 60 if name is None: 61 obj = str.__new__(cls, prefix) 62 63 elif prefix is None: 64 # Not really namespaced. 65 obj = str.__new__(cls, name) 66 else: 67 obj = str.__new__(cls, prefix + ":" + name) 68 obj.prefix = prefix 69 obj.name = name 70 obj.namespace = namespace 71 return obj 72 73class AttributeValueWithCharsetSubstitution(str): 74 """A stand-in object for a character encoding specified in HTML.""" 75 76class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 77 """A generic stand-in for the value of a meta tag's 'charset' attribute. 78 79 When Beautiful Soup parses the markup '<meta charset="utf8">', the 80 value of the 'charset' attribute will be one of these objects. 81 """ 82 83 def __new__(cls, original_value): 84 obj = str.__new__(cls, original_value) 85 obj.original_value = original_value 86 return obj 87 88 def encode(self, encoding): 89 return encoding 90 91 92class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): 93 """A generic stand-in for the value of a meta tag's 'content' attribute. 94 95 When Beautiful Soup parses the markup: 96 <meta http-equiv="content-type" content="text/html; charset=utf8"> 97 98 The value of the 'content' attribute will be one of these objects. 99 """ 100 101 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 102 103 def __new__(cls, original_value): 104 match = cls.CHARSET_RE.search(original_value) 105 if match is None: 106 # No substitution necessary. 107 return str.__new__(str, original_value) 108 109 obj = str.__new__(cls, original_value) 110 obj.original_value = original_value 111 return obj 112 113 def encode(self, encoding): 114 def rewrite(match): 115 return match.group(1) + encoding 116 return self.CHARSET_RE.sub(rewrite, self.original_value) 117 118class HTMLAwareEntitySubstitution(EntitySubstitution): 119 120 """Entity substitution rules that are aware of some HTML quirks. 121 122 Specifically, the contents of <script> and <style> tags should not 123 undergo entity substitution. 124 125 Incoming NavigableString objects are checked to see if they're the 126 direct children of a <script> or <style> tag. 127 """ 128 129 cdata_containing_tags = set(["script", "style"]) 130 131 preformatted_tags = set(["pre"]) 132 133 @classmethod 134 def _substitute_if_appropriate(cls, ns, f): 135 if (isinstance(ns, NavigableString) 136 and ns.parent is not None 137 and ns.parent.name in cls.cdata_containing_tags): 138 # Do nothing. 139 return ns 140 # Substitute. 141 return f(ns) 142 143 @classmethod 144 def substitute_html(cls, ns): 145 return cls._substitute_if_appropriate( 146 ns, EntitySubstitution.substitute_html) 147 148 @classmethod 149 def substitute_xml(cls, ns): 150 return cls._substitute_if_appropriate( 151 ns, EntitySubstitution.substitute_xml_containing_entities) 152 153class PageElement(object): 154 """Contains the navigational information for some part of the page 155 (either a tag or a piece of text)""" 156 157 # There are five possible values for the "formatter" argument passed in 158 # to methods like encode() and prettify(): 159 # 160 # "html" - All Unicode characters with corresponding HTML entities 161 # are converted to those entities on output. 162 # "minimal" - Bare ampersands and angle brackets are converted to 163 # XML entities: & < > 164 # None - The null formatter. Unicode characters are never 165 # converted to entities. This is not recommended, but it's 166 # faster than "minimal". 167 # A function - This function will be called on every string that 168 # needs to undergo entity substitution. 169 # 170 171 # In an HTML document, the default "html" and "minimal" functions 172 # will leave the contents of <script> and <style> tags alone. For 173 # an XML document, all tags will be given the same treatment. 174 175 HTML_FORMATTERS = { 176 "html" : HTMLAwareEntitySubstitution.substitute_html, 177 "minimal" : HTMLAwareEntitySubstitution.substitute_xml, 178 None : None 179 } 180 181 XML_FORMATTERS = { 182 "html" : EntitySubstitution.substitute_html, 183 "minimal" : EntitySubstitution.substitute_xml_containing_entities, 184 None : None 185 } 186 187 def format_string(self, s, formatter='minimal'): 188 """Format the given string using the given formatter.""" 189 if not isinstance(formatter, Callable): 190 formatter = self._formatter_for_name(formatter) 191 if formatter is None: 192 output = s 193 else: 194 output = formatter(s) 195 return output 196 197 @property 198 def _is_xml(self): 199 """Is this element part of an XML tree or an HTML tree? 200 201 This is used when mapping a formatter name ("minimal") to an 202 appropriate function (one that performs entity-substitution on 203 the contents of <script> and <style> tags, or not). It's 204 inefficient, but it should be called very rarely. 205 """ 206 if self.parent is None: 207 # This is the top-level object. It should have .is_xml set 208 # from tree creation. If not, take a guess--BS is usually 209 # used on HTML markup. 210 return getattr(self, 'is_xml', False) 211 return self.parent._is_xml 212 213 def _formatter_for_name(self, name): 214 "Look up a formatter function based on its name and the tree." 215 if self._is_xml: 216 return self.XML_FORMATTERS.get( 217 name, EntitySubstitution.substitute_xml) 218 else: 219 return self.HTML_FORMATTERS.get( 220 name, HTMLAwareEntitySubstitution.substitute_xml) 221 222 def setup(self, parent=None, previous_element=None, next_element=None, 223 previous_sibling=None, next_sibling=None): 224 """Sets up the initial relations between this element and 225 other elements.""" 226 self.parent = parent 227 228 self.previous_element = previous_element 229 if previous_element is not None: 230 self.previous_element.next_element = self 231 232 self.next_element = next_element 233 if self.next_element: 234 self.next_element.previous_element = self 235 236 self.next_sibling = next_sibling 237 if self.next_sibling: 238 self.next_sibling.previous_sibling = self 239 240 if (not previous_sibling 241 and self.parent is not None and self.parent.contents): 242 previous_sibling = self.parent.contents[-1] 243 244 self.previous_sibling = previous_sibling 245 if previous_sibling: 246 self.previous_sibling.next_sibling = self 247 248 nextSibling = _alias("next_sibling") # BS3 249 previousSibling = _alias("previous_sibling") # BS3 250 251 def replace_with(self, replace_with): 252 if not self.parent: 253 raise ValueError( 254 "Cannot replace one element with another when the" 255 "element to be replaced is not part of a tree.") 256 if replace_with is self: 257 return 258 if replace_with is self.parent: 259 raise ValueError("Cannot replace a Tag with its parent.") 260 old_parent = self.parent 261 my_index = self.parent.index(self) 262 self.extract() 263 old_parent.insert(my_index, replace_with) 264 return self 265 replaceWith = replace_with # BS3 266 267 def unwrap(self): 268 my_parent = self.parent 269 if not self.parent: 270 raise ValueError( 271 "Cannot replace an element with its contents when that" 272 "element is not part of a tree.") 273 my_index = self.parent.index(self) 274 self.extract() 275 for child in reversed(self.contents[:]): 276 my_parent.insert(my_index, child) 277 return self 278 replace_with_children = unwrap 279 replaceWithChildren = unwrap # BS3 280 281 def wrap(self, wrap_inside): 282 me = self.replace_with(wrap_inside) 283 wrap_inside.append(me) 284 return wrap_inside 285 286 def extract(self): 287 """Destructively rips this element out of the tree.""" 288 if self.parent is not None: 289 del self.parent.contents[self.parent.index(self)] 290 291 #Find the two elements that would be next to each other if 292 #this element (and any children) hadn't been parsed. Connect 293 #the two. 294 last_child = self._last_descendant() 295 next_element = last_child.next_element 296 297 if (self.previous_element is not None and 298 self.previous_element != next_element): 299 self.previous_element.next_element = next_element 300 if next_element is not None and next_element != self.previous_element: 301 next_element.previous_element = self.previous_element 302 self.previous_element = None 303 last_child.next_element = None 304 305 self.parent = None 306 if (self.previous_sibling is not None 307 and self.previous_sibling != self.next_sibling): 308 self.previous_sibling.next_sibling = self.next_sibling 309 if (self.next_sibling is not None 310 and self.next_sibling != self.previous_sibling): 311 self.next_sibling.previous_sibling = self.previous_sibling 312 self.previous_sibling = self.next_sibling = None 313 return self 314 315 def _last_descendant(self, is_initialized=True, accept_self=True): 316 "Finds the last element beneath this object to be parsed." 317 if is_initialized and self.next_sibling: 318 last_child = self.next_sibling.previous_element 319 else: 320 last_child = self 321 while isinstance(last_child, Tag) and last_child.contents: 322 last_child = last_child.contents[-1] 323 if not accept_self and last_child == self: 324 last_child = None 325 return last_child 326 # BS3: Not part of the API! 327 _lastRecursiveChild = _last_descendant 328 329 def insert(self, position, new_child): 330 if new_child is self: 331 raise ValueError("Cannot insert a tag into itself.") 332 if (isinstance(new_child, str) 333 and not isinstance(new_child, NavigableString)): 334 new_child = NavigableString(new_child) 335 336 position = min(position, len(self.contents)) 337 if hasattr(new_child, 'parent') and new_child.parent is not None: 338 # We're 'inserting' an element that's already one 339 # of this object's children. 340 if new_child.parent is self: 341 current_index = self.index(new_child) 342 if current_index < position: 343 # We're moving this element further down the list 344 # of this object's children. That means that when 345 # we extract this element, our target index will 346 # jump down one. 347 position -= 1 348 new_child.extract() 349 350 new_child.parent = self 351 previous_child = None 352 if position == 0: 353 new_child.previous_sibling = None 354 new_child.previous_element = self 355 else: 356 previous_child = self.contents[position - 1] 357 new_child.previous_sibling = previous_child 358 new_child.previous_sibling.next_sibling = new_child 359 new_child.previous_element = previous_child._last_descendant(False) 360 if new_child.previous_element is not None: 361 new_child.previous_element.next_element = new_child 362 363 new_childs_last_element = new_child._last_descendant(False) 364 365 if position >= len(self.contents): 366 new_child.next_sibling = None 367 368 parent = self 369 parents_next_sibling = None 370 while parents_next_sibling is None and parent is not None: 371 parents_next_sibling = parent.next_sibling 372 parent = parent.parent 373 if parents_next_sibling is not None: 374 # We found the element that comes next in the document. 375 break 376 if parents_next_sibling is not None: 377 new_childs_last_element.next_element = parents_next_sibling 378 else: 379 # The last element of this tag is the last element in 380 # the document. 381 new_childs_last_element.next_element = None 382 else: 383 next_child = self.contents[position] 384 new_child.next_sibling = next_child 385 if new_child.next_sibling is not None: 386 new_child.next_sibling.previous_sibling = new_child 387 new_childs_last_element.next_element = next_child 388 389 if new_childs_last_element.next_element is not None: 390 new_childs_last_element.next_element.previous_element = new_childs_last_element 391 self.contents.insert(position, new_child) 392 393 def append(self, tag): 394 """Appends the given tag to the contents of this tag.""" 395 self.insert(len(self.contents), tag) 396 397 def insert_before(self, predecessor): 398 """Makes the given element the immediate predecessor of this one. 399 400 The two elements will have the same parent, and the given element 401 will be immediately before this one. 402 """ 403 if self is predecessor: 404 raise ValueError("Can't insert an element before itself.") 405 parent = self.parent 406 if parent is None: 407 raise ValueError( 408 "Element has no parent, so 'before' has no meaning.") 409 # Extract first so that the index won't be screwed up if they 410 # are siblings. 411 if isinstance(predecessor, PageElement): 412 predecessor.extract() 413 index = parent.index(self) 414 parent.insert(index, predecessor) 415 416 def insert_after(self, successor): 417 """Makes the given element the immediate successor of this one. 418 419 The two elements will have the same parent, and the given element 420 will be immediately after this one. 421 """ 422 if self is successor: 423 raise ValueError("Can't insert an element after itself.") 424 parent = self.parent 425 if parent is None: 426 raise ValueError( 427 "Element has no parent, so 'after' has no meaning.") 428 # Extract first so that the index won't be screwed up if they 429 # are siblings. 430 if isinstance(successor, PageElement): 431 successor.extract() 432 index = parent.index(self) 433 parent.insert(index+1, successor) 434 435 def find_next(self, name=None, attrs=OrderedDict(), text=None, **kwargs): 436 """Returns the first item that matches the given criteria and 437 appears after this Tag in the document.""" 438 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 439 findNext = find_next # BS3 440 441 def find_all_next(self, name=None, attrs=OrderedDict(), text=None, limit=None, 442 **kwargs): 443 """Returns all items that match the given criteria and appear 444 after this Tag in the document.""" 445 return self._find_all(name, attrs, text, limit, self.next_elements, 446 **kwargs) 447 findAllNext = find_all_next # BS3 448 449 def find_next_sibling(self, name=None, attrs=OrderedDict(), text=None, **kwargs): 450 """Returns the closest sibling to this Tag that matches the 451 given criteria and appears after this Tag in the document.""" 452 return self._find_one(self.find_next_siblings, name, attrs, text, 453 **kwargs) 454 findNextSibling = find_next_sibling # BS3 455 456 def find_next_siblings(self, name=None, attrs=OrderedDict(), text=None, limit=None, 457 **kwargs): 458 """Returns the siblings of this Tag that match the given 459 criteria and appear after this Tag in the document.""" 460 return self._find_all(name, attrs, text, limit, 461 self.next_siblings, **kwargs) 462 findNextSiblings = find_next_siblings # BS3 463 fetchNextSiblings = find_next_siblings # BS2 464 465 def find_previous(self, name=None, attrs=OrderedDict(), text=None, **kwargs): 466 """Returns the first item that matches the given criteria and 467 appears before this Tag in the document.""" 468 return self._find_one( 469 self.find_all_previous, name, attrs, text, **kwargs) 470 findPrevious = find_previous # BS3 471 472 def find_all_previous(self, name=None, attrs=OrderedDict(), text=None, limit=None, 473 **kwargs): 474 """Returns all items that match the given criteria and appear 475 before this Tag in the document.""" 476 return self._find_all(name, attrs, text, limit, self.previous_elements, 477 **kwargs) 478 findAllPrevious = find_all_previous # BS3 479 fetchPrevious = find_all_previous # BS2 480 481 def find_previous_sibling(self, name=None, attrs=OrderedDict(), text=None, **kwargs): 482 """Returns the closest sibling to this Tag that matches the 483 given criteria and appears before this Tag in the document.""" 484 return self._find_one(self.find_previous_siblings, name, attrs, text, 485 **kwargs) 486 findPreviousSibling = find_previous_sibling # BS3 487 488 def find_previous_siblings(self, name=None, attrs=OrderedDict(), text=None, 489 limit=None, **kwargs): 490 """Returns the siblings of this Tag that match the given 491 criteria and appear before this Tag in the document.""" 492 return self._find_all(name, attrs, text, limit, 493 self.previous_siblings, **kwargs) 494 findPreviousSiblings = find_previous_siblings # BS3 495 fetchPreviousSiblings = find_previous_siblings # BS2 496 497 def find_parent(self, name=None, attrs=OrderedDict(), **kwargs): 498 """Returns the closest parent of this Tag that matches the given 499 criteria.""" 500 # NOTE: We can't use _find_one because findParents takes a different 501 # set of arguments. 502 r = None 503 l = self.find_parents(name, attrs, 1, **kwargs) 504 if l: 505 r = l[0] 506 return r 507 findParent = find_parent # BS3 508 509 def find_parents(self, name=None, attrs=OrderedDict(), limit=None, **kwargs): 510 """Returns the parents of this Tag that match the given 511 criteria.""" 512 513 return self._find_all(name, attrs, None, limit, self.parents, 514 **kwargs) 515 findParents = find_parents # BS3 516 fetchParents = find_parents # BS2 517 518 @property 519 def next(self): 520 return self.next_element 521 522 @property 523 def previous(self): 524 return self.previous_element 525 526 #These methods do the real heavy lifting. 527 528 def _find_one(self, method, name, attrs, text, **kwargs): 529 r = None 530 l = method(name, attrs, text, 1, **kwargs) 531 if l: 532 r = l[0] 533 return r 534 535 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 536 "Iterates over a generator looking for things that match." 537 538 if text is None and 'string' in kwargs: 539 text = kwargs['string'] 540 del kwargs['string'] 541 542 if isinstance(name, SoupStrainer): 543 strainer = name 544 else: 545 strainer = SoupStrainer(name, attrs, text, **kwargs) 546 547 if text is None and not limit and not attrs and not kwargs: 548 if name is True or name is None: 549 # Optimization to find all tags. 550 result = (element for element in generator 551 if isinstance(element, Tag)) 552 return ResultSet(strainer, result) 553 elif isinstance(name, str): 554 # Optimization to find all tags with a given name. 555 result = (element for element in generator 556 if isinstance(element, Tag) 557 and element.name == name) 558 return ResultSet(strainer, result) 559 results = ResultSet(strainer) 560 while True: 561 try: 562 i = next(generator) 563 except StopIteration: 564 break 565 if i: 566 found = strainer.search(i) 567 if found: 568 results.append(found) 569 if limit and len(results) >= limit: 570 break 571 return results 572 573 #These generators can be used to navigate starting from both 574 #NavigableStrings and Tags. 575 @property 576 def next_elements(self): 577 i = self.next_element 578 while i is not None: 579 yield i 580 i = i.next_element 581 582 @property 583 def next_siblings(self): 584 i = self.next_sibling 585 while i is not None: 586 yield i 587 i = i.next_sibling 588 589 @property 590 def previous_elements(self): 591 i = self.previous_element 592 while i is not None: 593 yield i 594 i = i.previous_element 595 596 @property 597 def previous_siblings(self): 598 i = self.previous_sibling 599 while i is not None: 600 yield i 601 i = i.previous_sibling 602 603 @property 604 def parents(self): 605 i = self.parent 606 while i is not None: 607 yield i 608 i = i.parent 609 610 # Methods for supporting CSS selectors. 611 612 tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') 613 614 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 615 # \---------------------------/ \---/\-------------/ \-------/ 616 # | | | | 617 # | | | The value 618 # | | ~,|,^,$,* or = 619 # | Attribute 620 # Tag 621 attribselect_re = re.compile( 622 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + 623 r'=?"?(?P<value>[^\]"]*)"?\]$' 624 ) 625 626 def _attr_value_as_string(self, value, default=None): 627 """Force an attribute value into a string representation. 628 629 A multi-valued attribute will be converted into a 630 space-separated stirng. 631 """ 632 value = self.get(value, default) 633 if isinstance(value, list) or isinstance(value, tuple): 634 value =" ".join(value) 635 return value 636 637 def _tag_name_matches_and(self, function, tag_name): 638 if not tag_name: 639 return function 640 else: 641 def _match(tag): 642 return tag.name == tag_name and function(tag) 643 return _match 644 645 def _attribute_checker(self, operator, attribute, value=''): 646 """Create a function that performs a CSS selector operation. 647 648 Takes an operator, attribute and optional value. Returns a 649 function that will return True for elements that match that 650 combination. 651 """ 652 if operator == '=': 653 # string representation of `attribute` is equal to `value` 654 return lambda el: el._attr_value_as_string(attribute) == value 655 elif operator == '~': 656 # space-separated list representation of `attribute` 657 # contains `value` 658 def _includes_value(element): 659 attribute_value = element.get(attribute, []) 660 if not isinstance(attribute_value, list): 661 attribute_value = attribute_value.split() 662 return value in attribute_value 663 return _includes_value 664 elif operator == '^': 665 # string representation of `attribute` starts with `value` 666 return lambda el: el._attr_value_as_string( 667 attribute, '').startswith(value) 668 elif operator == '$': 669 # string represenation of `attribute` ends with `value` 670 return lambda el: el._attr_value_as_string( 671 attribute, '').endswith(value) 672 elif operator == '*': 673 # string representation of `attribute` contains `value` 674 return lambda el: value in el._attr_value_as_string(attribute, '') 675 elif operator == '|': 676 # string representation of `attribute` is either exactly 677 # `value` or starts with `value` and then a dash. 678 def _is_or_starts_with_dash(element): 679 attribute_value = element._attr_value_as_string(attribute, '') 680 return (attribute_value == value or attribute_value.startswith( 681 value + '-')) 682 return _is_or_starts_with_dash 683 else: 684 return lambda el: el.has_attr(attribute) 685 686 # Old non-property versions of the generators, for backwards 687 # compatibility with BS3. 688 def nextGenerator(self): 689 return self.next_elements 690 691 def nextSiblingGenerator(self): 692 return self.next_siblings 693 694 def previousGenerator(self): 695 return self.previous_elements 696 697 def previousSiblingGenerator(self): 698 return self.previous_siblings 699 700 def parentGenerator(self): 701 return self.parents 702 703 704class NavigableString(str, PageElement): 705 706 PREFIX = '' 707 SUFFIX = '' 708 709 def __new__(cls, value): 710 """Create a new NavigableString. 711 712 When unpickling a NavigableString, this method is called with 713 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 714 passed in to the superclass's __new__ or the superclass won't know 715 how to handle non-ASCII characters. 716 """ 717 if isinstance(value, str): 718 u = str.__new__(cls, value) 719 else: 720 u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 721 u.setup() 722 return u 723 724 def __copy__(self): 725 """A copy of a NavigableString has the same contents and class 726 as the original, but it is not connected to the parse tree. 727 """ 728 return type(self)(self) 729 730 def __getnewargs__(self): 731 return (str(self),) 732 733 def __getattr__(self, attr): 734 """text.string gives you text. This is for backwards 735 compatibility for Navigable*String, but for CData* it lets you 736 get the string without the CData wrapper.""" 737 if attr == 'string': 738 return self 739 else: 740 raise AttributeError( 741 "'%s' object has no attribute '%s'" % ( 742 self.__class__.__name__, attr)) 743 744 def output_ready(self, formatter="minimal"): 745 output = self.format_string(self, formatter) 746 return self.PREFIX + output + self.SUFFIX 747 748 @property 749 def name(self): 750 return None 751 752 @name.setter 753 def name(self, name): 754 raise AttributeError("A NavigableString cannot be given a name.") 755 756class PreformattedString(NavigableString): 757 """A NavigableString not subject to the normal formatting rules. 758 759 The string will be passed into the formatter (to trigger side effects), 760 but the return value will be ignored. 761 """ 762 763 def output_ready(self, formatter="minimal"): 764 """CData strings are passed into the formatter. 765 But the return value is ignored.""" 766 self.format_string(self, formatter) 767 return self.PREFIX + self + self.SUFFIX 768 769class CData(PreformattedString): 770 771 PREFIX = '<![CDATA[' 772 SUFFIX = ']]>' 773 774class ProcessingInstruction(PreformattedString): 775 776 PREFIX = '<?' 777 SUFFIX = '>' 778 779class Comment(PreformattedString): 780 781 PREFIX = '<!--' 782 SUFFIX = '-->' 783 784 785class Declaration(PreformattedString): 786 PREFIX = '<!' 787 SUFFIX = '!>' 788 789 790class Doctype(PreformattedString): 791 792 @classmethod 793 def for_name_and_ids(cls, name, pub_id, system_id): 794 value = name or '' 795 if pub_id is not None: 796 value += ' PUBLIC "%s"' % pub_id 797 if system_id is not None: 798 value += '\n "%s"' % system_id 799 elif system_id is not None: 800 value += ' SYSTEM "%s"' % system_id 801 802 return Doctype(value) 803 804 PREFIX = '<!DOCTYPE ' 805 SUFFIX = '>\n' 806 807 808class Tag(PageElement): 809 810 """Represents a found HTML tag with its attributes and contents.""" 811 812 def __init__(self, parser=None, builder=None, name=None, namespace=None, 813 prefix=None, attrs=None, parent=None, previous=None): 814 "Basic constructor." 815 816 if parser is None: 817 self.parser_class = None 818 else: 819 # We don't actually store the parser object: that lets extracted 820 # chunks be garbage-collected. 821 self.parser_class = parser.__class__ 822 if name is None: 823 raise ValueError("No value provided for new tag's name.") 824 self.name = name 825 self.namespace = namespace 826 self.prefix = prefix 827 if attrs is None: 828 attrs = OrderedDict() 829 elif attrs: 830 if builder is not None and builder.cdata_list_attributes: 831 attrs = builder._replace_cdata_list_attribute_values( 832 self.name, attrs) 833 else: 834 attrs = OrderedDict(attrs) 835 else: 836 attrs = OrderedDict(attrs) 837 self.attrs = attrs 838 self.contents = [] 839 self.setup(parent, previous) 840 self.hidden = False 841 842 # Set up any substitutions, such as the charset in a META tag. 843 if builder is not None: 844 builder.set_up_substitutions(self) 845 self.can_be_empty_element = builder.can_be_empty_element(name) 846 else: 847 self.can_be_empty_element = False 848 849 parserClass = _alias("parser_class") # BS3 850 851 def __copy__(self): 852 """A copy of a Tag is a new Tag, unconnected to the parse tree. 853 Its contents are a copy of the old Tag's contents. 854 """ 855 clone = type(self)(None, self.builder, self.name, self.namespace, 856 self.nsprefix, self.attrs) 857 for attr in ('can_be_empty_element', 'hidden'): 858 setattr(clone, attr, getattr(self, attr)) 859 for child in self.contents: 860 clone.append(child.__copy__()) 861 return clone 862 863 @property 864 def is_empty_element(self): 865 """Is this tag an empty-element tag? (aka a self-closing tag) 866 867 A tag that has contents is never an empty-element tag. 868 869 A tag that has no contents may or may not be an empty-element 870 tag. It depends on the builder used to create the tag. If the 871 builder has a designated list of empty-element tags, then only 872 a tag whose name shows up in that list is considered an 873 empty-element tag. 874 875 If the builder has no designated list of empty-element tags, 876 then any tag with no contents is an empty-element tag. 877 """ 878 return len(self.contents) == 0 and self.can_be_empty_element 879 isSelfClosing = is_empty_element # BS3 880 881 @property 882 def is_non_breaking_inline_tag(self): 883 # used only for pretty printing of html to prevent returns after tags 884 # from introducing spaces where none are desired 885 return self.name in NON_BREAKING_INLINE_TAGS and not self._is_xml 886 887 @property 888 def string(self): 889 """Convenience property to get the single string within this tag. 890 891 :Return: If this tag has a single string child, return value 892 is that string. If this tag has no children, or more than one 893 child, return value is None. If this tag has one child tag, 894 return value is the 'string' attribute of the child tag, 895 recursively. 896 """ 897 if len(self.contents) != 1: 898 return None 899 child = self.contents[0] 900 if isinstance(child, NavigableString): 901 return child 902 return child.string 903 904 @string.setter 905 def string(self, string): 906 self.clear() 907 self.append(string.__class__(string)) 908 909 def _all_strings(self, strip=False, types=(NavigableString, CData)): 910 """Yield all strings of certain classes, possibly stripping them. 911 912 By default, yields only NavigableString and CData objects. So 913 no comments, processing instructions, etc. 914 """ 915 for descendant in self.descendants: 916 if ( 917 (types is None and not isinstance(descendant, NavigableString)) 918 or 919 (types is not None and type(descendant) not in types)): 920 continue 921 if strip: 922 descendant = descendant.strip() 923 if len(descendant) == 0: 924 continue 925 yield descendant 926 927 strings = property(_all_strings) 928 929 @property 930 def stripped_strings(self): 931 for string in self._all_strings(True): 932 yield string 933 934 def get_text(self, separator="", strip=False, 935 types=(NavigableString, CData)): 936 """ 937 Get all child strings, concatenated using the given separator. 938 """ 939 return separator.join([s for s in self._all_strings( 940 strip, types=types)]) 941 getText = get_text 942 text = property(get_text) 943 944 def decompose(self): 945 """Recursively destroys the contents of this tree.""" 946 self.extract() 947 i = self 948 while i is not None: 949 next = i.next_element 950 i.__dict__.clear() 951 i.contents = [] 952 i = next 953 954 def clear(self, decompose=False): 955 """ 956 Extract all children. If decompose is True, decompose instead. 957 """ 958 if decompose: 959 for element in self.contents[:]: 960 if isinstance(element, Tag): 961 element.decompose() 962 else: 963 element.extract() 964 else: 965 for element in self.contents[:]: 966 element.extract() 967 968 def index(self, element): 969 """ 970 Find the index of a child by identity, not value. Avoids issues with 971 tag.contents.index(element) getting the index of equal elements. 972 """ 973 for i, child in enumerate(self.contents): 974 if child is element: 975 return i 976 raise ValueError("Tag.index: element not in tag") 977 978 def get(self, key, default=None): 979 """Returns the value of the 'key' attribute for the tag, or 980 the value given for 'default' if it doesn't have that 981 attribute.""" 982 return self.attrs.get(key, default) 983 984 def has_attr(self, key): 985 return key in self.attrs 986 987 def __hash__(self): 988 return str(self).__hash__() 989 990 def __getitem__(self, key): 991 """tag[key] returns the value of the 'key' attribute for the tag, 992 and throws an exception if it's not there.""" 993 return self.attrs[key] 994 995 def __iter__(self): 996 "Iterating over a tag iterates over its contents." 997 return iter(self.contents) 998 999 def __len__(self): 1000 "The length of a tag is the length of its list of contents." 1001 return len(self.contents) 1002 1003 def __contains__(self, x): 1004 return x in self.contents 1005 1006 def __bool__(self): 1007 "A tag is non-None even if it has no contents." 1008 return True 1009 1010 def __nonzero__(self): 1011 "A tag is non-None even if it has no contents." 1012 return True 1013 1014 def __setitem__(self, key, value): 1015 """Setting tag[key] sets the value of the 'key' attribute for the 1016 tag.""" 1017 self.attrs[key] = value 1018 1019 def __delitem__(self, key): 1020 "Deleting tag[key] deletes all 'key' attributes for the tag." 1021 self.attrs.pop(key, None) 1022 1023 def __call__(self, *args, **kwargs): 1024 """Calling a tag like a function is the same as calling its 1025 find_all() method. Eg. tag('a') returns a list of all the A tags 1026 found within this tag.""" 1027 return self.find_all(*args, **kwargs) 1028 1029 def __getattr__(self, tag): 1030 #print "Getattr %s.%s" % (self.__class__, tag) 1031 if len(tag) > 3 and tag.endswith('Tag'): 1032 # BS3: soup.aTag -> "soup.find("a") 1033 tag_name = tag[:-3] 1034 warnings.warn( 1035 '.%sTag is deprecated, use .find("%s") instead.' % ( 1036 tag_name, tag_name)) 1037 return self.find(tag_name) 1038 # We special case contents to avoid recursion. 1039 elif not tag.startswith("__") and not tag=="contents": 1040 return self.find(tag) 1041 raise AttributeError( 1042 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1043 1044 def __eq__(self, other): 1045 """Returns true iff this tag has the same name, the same attributes, 1046 and the same contents (recursively) as the given tag.""" 1047 if self is other: 1048 return True 1049 if (not hasattr(other, 'name') or 1050 not hasattr(other, 'attrs') or 1051 not hasattr(other, 'contents') or 1052 self.name != other.name or 1053 self.attrs != other.attrs or 1054 len(self) != len(other)): 1055 return False 1056 for i, my_child in enumerate(self.contents): 1057 if my_child != other.contents[i]: 1058 return False 1059 return True 1060 1061 def __ne__(self, other): 1062 """Returns true iff this tag is not identical to the other tag, 1063 as defined in __eq__.""" 1064 return not self == other 1065 1066 def __repr__(self, encoding="unicode-escape"): 1067 """Renders this tag as a string.""" 1068 # "The return value must be a string object", i.e. Unicode 1069 return self.decode() 1070 1071 def __str__(self): 1072 return self.decode() 1073 1074 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1075 indent_level=None, formatter="minimal", 1076 errors="xmlcharrefreplace", indent_chars=" "): 1077 # Turn the data structure into Unicode, then encode the 1078 # Unicode. 1079 u = self.decode(indent_level, encoding, formatter, indent_chars) 1080 return u.encode(encoding, errors) 1081 1082 def _should_pretty_print(self, indent_level): 1083 """Should this tag be pretty-printed?""" 1084 return ( 1085 indent_level is not None and 1086 ((self.name not in HTMLAwareEntitySubstitution.preformatted_tags 1087 and self.name not in NON_BREAKING_INLINE_TAGS) 1088 or self._is_xml)) 1089 1090 def decode(self, indent_level=None, 1091 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1092 formatter="minimal", indent_chars=" "): 1093 """Returns a Unicode representation of this tag and its contents. 1094 1095 :param eventual_encoding: The tag is destined to be 1096 encoded into this encoding. This method is _not_ 1097 responsible for performing that encoding. This information 1098 is passed in so that it can be substituted in if the 1099 document contains a <META> tag that mentions the document's 1100 encoding. 1101 """ 1102 1103 # First off, turn a string formatter into a function. This 1104 # will stop the lookup from happening over and over again. 1105 if not isinstance(formatter, Callable): 1106 formatter = self._formatter_for_name(formatter) 1107 1108 attrs = [] 1109 if self.attrs: 1110 for key, val in sorted(self.attrs.items()): 1111 if val is None: 1112 decoded = key 1113 else: 1114 if isinstance(val, list) or isinstance(val, tuple): 1115 val = ' '.join(val) 1116 elif not isinstance(val, str): 1117 val = str(val) 1118 elif ( 1119 isinstance(val, AttributeValueWithCharsetSubstitution) 1120 and eventual_encoding is not None): 1121 val = val.encode(eventual_encoding) 1122 1123 text = self.format_string(val, formatter) 1124 decoded = ( 1125 str(key) + '=' 1126 + EntitySubstitution.quoted_attribute_value(text)) 1127 attrs.append(decoded) 1128 close = '' 1129 closeTag = '' 1130 1131 prefix = '' 1132 if self.prefix: 1133 prefix = self.prefix + ":" 1134 1135 if self.is_empty_element: 1136 close = '/' 1137 else: 1138 closeTag = '</%s%s>' % (prefix, self.name) 1139 1140 pretty_print = self._should_pretty_print(indent_level) 1141 space = '' 1142 indent_space = '' 1143 if indent_level is not None: 1144 indent_space = (indent_chars * (indent_level - 1)) 1145 if pretty_print: 1146 space = indent_space 1147 indent_contents = indent_level + 1 1148 else: 1149 indent_contents = None 1150 contents = self.decode_contents( 1151 indent_contents, eventual_encoding, formatter, indent_chars) 1152 1153 if self.hidden: 1154 # This is the 'document root' object. 1155 s = contents 1156 else: 1157 s = [] 1158 attribute_string = '' 1159 if attrs: 1160 attribute_string = ' ' + ' '.join(attrs) 1161 if indent_level is not None: 1162 # Even if this particular tag is not pretty-printed, 1163 # we should indent up to the start of the tag. 1164 s.append(indent_space) 1165 s.append('<%s%s%s%s>' % ( 1166 prefix, self.name, attribute_string, close)) 1167 if pretty_print: 1168 s.append("\n") 1169 s.append(contents) 1170 if pretty_print and contents and contents[-1] != "\n": 1171 s.append("\n") 1172 if pretty_print and closeTag: 1173 s.append(space) 1174 s.append(closeTag) 1175 if indent_level is not None and closeTag and self.next_sibling: 1176 # Even if this particular tag is not pretty-printed, 1177 # we're now done with the tag, and we should add a 1178 # newline if appropriate. 1179 s.append("\n") 1180 s = ''.join(s) 1181 return s 1182 1183 def prettify(self, encoding=None, formatter="minimal", indent_chars=" "): 1184 if encoding is None: 1185 return self.decode(True, formatter=formatter, indent_chars=indent_chars) 1186 else: 1187 return self.encode(encoding, True, formatter=formatter, indent_chars=indent_chars) 1188 1189 def decode_contents(self, indent_level=None, 1190 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1191 formatter="minimal", indent_chars=" "): 1192 """Renders the contents of this tag as a Unicode string. 1193 1194 :param indent_level: Each line of the rendering will be 1195 indented this many spaces. 1196 1197 :param eventual_encoding: The tag is destined to be 1198 encoded into this encoding. This method is _not_ 1199 responsible for performing that encoding. This information 1200 is passed in so that it can be substituted in if the 1201 document contains a <META> tag that mentions the document's 1202 encoding. 1203 1204 :param formatter: The output formatter responsible for converting 1205 entities to Unicode characters. 1206 """ 1207 # First off, turn a string formatter into a function. This 1208 # will stop the lookup from happening over and over again. 1209 if not isinstance(formatter, Callable): 1210 formatter = self._formatter_for_name(formatter) 1211 1212 pretty_print = (indent_level is not None) 1213 s = [] 1214 for c in self: 1215 text = None 1216 if isinstance(c, NavigableString): 1217 text = c.output_ready(formatter) 1218 elif isinstance(c, Tag): 1219 s.append(c.decode(indent_level, eventual_encoding, formatter, indent_chars)) 1220 if text and indent_level and not self.name == 'pre': 1221 text = text.strip() 1222 if text: 1223 if pretty_print and not self.name == 'pre': 1224 s.append(indent_chars * (indent_level - 1)) 1225 s.append(text) 1226 if pretty_print and not self.name == 'pre': 1227 s.append("\n") 1228 return ''.join(s) 1229 1230 def decodexml(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1231 formatter="minimal", indent_chars=" "): 1232 1233 # First off, turn a string formatter into a function. This 1234 # will stop the lookup from happening over and over again. 1235 if not isinstance(formatter, Callable): 1236 formatter = self._formatter_for_name(formatter) 1237 1238 is_xmlparent = self.name.lower() in EBOOK_XML_PARENT_TAGS 1239 attrs = [] 1240 if self.attrs: 1241 for key, val in sorted(self.attrs.items()): 1242 if val is None: 1243 decoded = key 1244 else: 1245 if isinstance(val, list) or isinstance(val, tuple): 1246 val = ' '.join(val) 1247 elif not isinstance(val, str): 1248 val = str(val) 1249 elif ( 1250 isinstance(val, AttributeValueWithCharsetSubstitution) 1251 and eventual_encoding is not None): 1252 val = val.encode(eventual_encoding) 1253 1254 text = self.format_string(val, formatter) 1255 decoded = ( 1256 str(key) + '=' 1257 + EntitySubstitution.quoted_attribute_value(text)) 1258 attrs.append(decoded) 1259 1260 prefix = '' 1261 if self.prefix: 1262 prefix = self.prefix + ":" 1263 1264 # for pure xml, a self closing tag with only whitespace 1265 # "contents" should be treated as empty 1266 if self.can_be_empty_element: 1267 tagcontents = self.string 1268 if tagcontents is not None and len(tagcontents.strip()) == 0: 1269 self.contents = [] 1270 1271 close = '' 1272 closeTag = '' 1273 if self.is_empty_element: 1274 close = '/' 1275 else: 1276 closeTag = '</%s%s>' % (prefix, self.name) 1277 1278 indent_space = (indent_chars * (indent_level - 1)) 1279 indent_contents = indent_level 1280 if is_xmlparent or self.hidden: 1281 indent_contents = indent_level + 1 1282 1283 contents = self.decodexml_contents(indent_contents, eventual_encoding, formatter, indent_chars) 1284 if self.hidden: 1285 # This is the 'document root' object. 1286 s = contents 1287 else: 1288 s = [] 1289 attribute_string = '' 1290 if attrs: 1291 attribute_string = ' ' + ' '.join(attrs) 1292 s.append(indent_space) 1293 s.append('<%s%s%s%s>' % (prefix, self.name, attribute_string, close)) 1294 if is_xmlparent: 1295 s.append("\n") 1296 s.append(contents) 1297 if contents and contents[-1] != "\n" and is_xmlparent or self.is_empty_element: 1298 s.append("\n") 1299 if closeTag and is_xmlparent: 1300 s.append(indent_space) 1301 s.append(closeTag) 1302 if closeTag and self.next_sibling: 1303 s.append("\n") 1304 s = ''.join(s) 1305 return s 1306 1307 def decodexml_contents(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1308 formatter="minimal", indent_chars=" "): 1309 """Renders the contents of this tag as a Unicode string. 1310 """ 1311 # First off, turn a string formatter into a function. This 1312 # will stop the lookup from happening over and over again. 1313 if not isinstance(formatter, Callable): 1314 formatter = self._formatter_for_name(formatter) 1315 1316 is_xmlparent = self.name.lower() in EBOOK_XML_PARENT_TAGS 1317 s = [] 1318 for c in self: 1319 text = None 1320 if isinstance(c, NavigableString): 1321 text = c.output_ready(formatter) 1322 elif isinstance(c, Tag): 1323 val = c.decodexml(indent_level, eventual_encoding, formatter, indent_chars) 1324 s.append(val) 1325 if text: 1326 text = text.strip() 1327 if text: 1328 if is_xmlparent and len(s) == 0: 1329 s.append(indent_chars * (indent_level - 1)) 1330 s.append(text) 1331 return ''.join(s) 1332 1333 def serialize_xhtml(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): 1334 # First off, turn a string formatter into a function. This 1335 # will stop the lookup from happening over and over again. 1336 if not isinstance(formatter, Callable): 1337 formatter = self._formatter_for_name(formatter) 1338 1339 prefix = '' 1340 close = '' 1341 closeTag = '' 1342 attrs = [] 1343 if self.attrs: 1344 for key, val in sorted(self.attrs.items()): 1345 if val is None: 1346 ntext = key 1347 else: 1348 if isinstance(val, list) or isinstance(val, tuple): 1349 val = ' '.join(val) 1350 elif not isinstance(val, str): 1351 val = str(val) 1352 elif (isinstance(val, AttributeValueWithCharsetSubstitution) and 1353 eventual_encoding is not None): 1354 val = val.encode(eventual_encoding) 1355 text = self.format_string(val, formatter) 1356 ntext = (str(key) + '=' + EntitySubstitution.quoted_attribute_value(text)) 1357 attrs.append(ntext) 1358 1359 contents = self.serialize_xhtml_contents(eventual_encoding, formatter) 1360 1361 in_xml_ns = self.namespace != 'http://www.w3.org/1999/xhtml' 1362 testcontents = contents.strip() 1363 1364 if self.prefix: 1365 prefix = self.prefix + ":" 1366 1367 if self.name in VOID_TAGS or (in_xml_ns and testcontents==""): 1368 close = '/' 1369 else: 1370 closeTag = '</%s%s>' % (prefix, self.name) 1371 1372 # strip extraneous whitespace before the primary closing tag 1373 if self.name in SPECIAL_HANDLING_TAGS: 1374 contents = contents.strip() 1375 contents += "\n" 1376 1377 if self.hidden: 1378 # This is the 'document root' object. 1379 s = contents 1380 else: 1381 s = [] 1382 attribute_string = '' 1383 if attrs: 1384 attribute_string = ' ' + ' '.join(attrs) 1385 s.append('<%s%s%s%s>' % (prefix, self.name, attribute_string, close)) 1386 if self.name in SPECIAL_HANDLING_TAGS: 1387 s.append("\n") 1388 s.append(contents) 1389 s.append(closeTag) 1390 if self.name in SPECIAL_HANDLING_TAGS: 1391 s.append("\n") 1392 s = ''.join(s) 1393 return s 1394 1395 def serialize_xhtml_contents(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): 1396 1397 # First off, turn a string formatter into a function. This 1398 # will stop the lookup from happening over and over again. 1399 if not isinstance(formatter, Callable): 1400 formatter = self._formatter_for_name(formatter) 1401 1402 s = [] 1403 for c in self: 1404 text = None 1405 if isinstance(c, Comment): 1406 text = Comment(c).output_ready(formatter) 1407 s.append(text) 1408 elif isinstance(c, CData): 1409 text = CData(c).output_ready(formatter) 1410 s.append(text) 1411 elif isinstance(c, NavigableString): 1412 text = c.output_ready(formatter) 1413 s.append(text) 1414 elif isinstance(c, Tag): 1415 s.append(c.serialize_xhtml(eventual_encoding, formatter)) 1416 return ''.join(s) 1417 1418 def prettyprint_xhtml(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1419 formatter="minimal", indent_chars=" "): 1420 1421 # First off, turn a string formatter into a function. This 1422 # will stop the lookup from happening over and over again. 1423 if not isinstance(formatter, Callable): 1424 formatter = self._formatter_for_name(formatter) 1425 1426 is_structural = self.name in STRUCTURAL_TAGS 1427 is_inline = self.name in NON_BREAKING_INLINE_TAGS 1428 1429 # build attribute string 1430 attribs = [] 1431 atts = "" 1432 if self.attrs: 1433 for key, val in sorted(self.attrs.items()): 1434 if val is None: 1435 decoded = key 1436 else: 1437 if isinstance(val, list) or isinstance(val, tuple): 1438 val = ' '.join(val) 1439 elif not isinstance(val, str): 1440 val = str(val) 1441 elif ( 1442 isinstance(val, AttributeValueWithCharsetSubstitution) 1443 and eventual_encoding is not None): 1444 val = val.encode(eventual_encoding) 1445 1446 text = self.format_string(val, formatter) 1447 decoded = ( 1448 str(key) + '=' 1449 + EntitySubstitution.quoted_attribute_value(text)) 1450 attribs.append(decoded) 1451 atts = " " + " ".join(attribs) 1452 1453 1454 # get tag content 1455 contents="" 1456 is_void_tag = self.name in VOID_TAGS 1457 if not is_void_tag: 1458 if is_structural: 1459 contents = self.prettyprint_xhtml_contents(indent_level+1, eventual_encoding, formatter, indent_chars) 1460 else: 1461 contents = self.prettyprint_xhtml_contents(indent_level, eventual_encoding, formatter, indent_chars) 1462 1463 if self.hidden: 1464 # This is the 'document root' object. 1465 return contents 1466 1467 in_xml_ns = self.namespace != 'http://www.w3.org/1999/xhtml' 1468 testcontents = contents.strip() 1469 single = self.name in VOID_TAGS or (in_xml_ns and testcontents == "") 1470 1471 prefix = '' 1472 if self.prefix: 1473 prefix = self.prefix + ":" 1474 1475 is_keepwhitespace = self.name in PRESERVE_WHITESPACE_TAGS 1476 if not is_keepwhitespace and not is_inline: 1477 contents = contents.rstrip() 1478 1479 indent_space = (indent_chars * (indent_level - 1)) 1480 1481 # handle self-closed tags with no content first 1482 if single: 1483 selfclosetag = '<%s%s%s/>' % (prefix, self.name, atts) 1484 if is_inline: 1485 # always add newline after br tags when they are children of structural tags 1486 if (self.name == "br") and self.parent.name in STRUCTURAL_TAGS: 1487 selfclosetag += "\n" 1488 return selfclosetag 1489 return indent_space + selfclosetag + "\n" 1490 1491 # handle the general case 1492 starttag = '<%s%s%s>' % (prefix, self.name, atts) 1493 closetag = '</%s%s>' % (prefix, self.name) 1494 results = "" 1495 if is_structural: 1496 results = indent_space + starttag 1497 if contents != "": 1498 results += "\n" + contents + "\n" + indent_space 1499 results += closetag + "\n" 1500 elif is_inline: 1501 results = starttag 1502 results += contents 1503 results += closetag 1504 else: 1505 results = indent_space + starttag 1506 if not is_keepwhitespace: 1507 contents = contents.lstrip() 1508 results += contents 1509 results += closetag + "\n" 1510 return results 1511 1512 def prettyprint_xhtml_contents(self, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1513 formatter="minimal", indent_chars=" "): 1514 """Renders the contents of this tag as a Unicode string. 1515 """ 1516 # First off, turn a string formatter into a function. This 1517 # will stop the lookup from happening over and over again. 1518 if not isinstance(formatter, Callable): 1519 formatter = self._formatter_for_name(formatter) 1520 1521 is_structural = self.name in STRUCTURAL_TAGS 1522 is_inline = self.name in NON_BREAKING_INLINE_TAGS 1523 is_keepwhitespace = self.name in PRESERVE_WHITESPACE_TAGS 1524 indent_space = (indent_chars * (indent_level - 1)) 1525 last_char = "x" 1526 contains_block_tags = False 1527 1528 if is_structural or self.hidden: 1529 last_char = "\n" 1530 1531 s = [] 1532 1533 for c in self: 1534 text = None 1535 if isinstance(c, Comment): 1536 text = Comment(c).output_ready(formatter) 1537 s.append(text) 1538 elif isinstance(c, CData): 1539 text = CData(c).output_ready(formatter) 1540 s.append(text) 1541 elif isinstance(c, NavigableString): 1542 text = c.output_ready(formatter) 1543 tval = text 1544 is_whitespace = (tval.strip() == "") 1545 1546 # handle pure whitespace differently 1547 if is_whitespace: 1548 if is_keepwhitespace: 1549 s.append(text) 1550 elif is_inline or self.name in OTHER_TEXTHOLDING_TAGS: 1551 if last_char not in " \t\v\f\r\n": 1552 s.append(" ") 1553 else: 1554 s.append("") 1555 else: 1556 # ignore this whitespace 1557 s.append("") 1558 1559 # handle all other text 1560 else: 1561 if is_structural and last_char == "\n": 1562 s.append(indent_space) 1563 text = text.lstrip() 1564 s.append(text) 1565 1566 # handle tags 1567 elif isinstance(c, Tag): 1568 val = c.prettyprint_xhtml(indent_level, eventual_encoding, formatter, indent_chars) 1569 # track if contains block tags and append newline and prepend newline if needed 1570 if not c.name in NON_BREAKING_INLINE_TAGS: 1571 contains_block_tags = True 1572 if last_char != "\n": 1573 s.append("\n") 1574 last_char = "\n" 1575 # if child of a structual tag is inline and follows a newline, indent it properly 1576 if is_structural and c.name in NON_BREAKING_INLINE_TAGS and last_char == '\n': 1577 s.append(indent_space) 1578 val = val.lstrip() 1579 s.append(val) 1580 1581 else: 1582 s.append("") 1583 1584 # update last_char 1585 last_element = s[-1] 1586 if last_element != "": 1587 last_char = last_element[-1:] 1588 1589 # after processing all children, handle inline tags that contain block level tags 1590 if is_inline and contains_block_tags: 1591 if last_char != "\n": 1592 s.append("\n") 1593 s.append(indent_space) 1594 1595 return ''.join(s) 1596 1597 def encode_contents( 1598 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1599 formatter="minimal", indent_chars=" "): 1600 """Renders the contents of this tag as a bytestring. 1601 1602 :param indent_level: Each line of the rendering will be 1603 indented this many spaces. 1604 1605 :param eventual_encoding: The bytestring will be in this encoding. 1606 1607 :param formatter: The output formatter responsible for converting 1608 entities to Unicode characters. 1609 """ 1610 1611 contents = self.decode_contents(indent_level, encoding, formatter, indent_chars) 1612 return contents.encode(encoding) 1613 1614 # Old method for BS3 compatibility 1615 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1616 prettyPrint=False, indentLevel=0): 1617 if not prettyPrint: 1618 indentLevel = None 1619 return self.encode_contents( 1620 indent_level=indentLevel, encoding=encoding) 1621 1622 #Soup methods 1623 1624 def find(self, name=None, attrs=OrderedDict(), recursive=True, text=None, 1625 **kwargs): 1626 """Return only the first child of this Tag matching the given 1627 criteria.""" 1628 r = None 1629 l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 1630 if l: 1631 r = l[0] 1632 return r 1633 findChild = find 1634 1635 def find_all(self, name=None, attrs=OrderedDict(), recursive=True, text=None, 1636 limit=None, **kwargs): 1637 """Extracts a list of Tag objects that match the given 1638 criteria. You can specify the name of the Tag and any 1639 attributes you want the Tag to have. 1640 1641 The value of a key-value pair in the 'attrs' map can be a 1642 string, a list of strings, a regular expression object, or a 1643 callable that takes a string and returns whether or not the 1644 string matches for some custom definition of 'matches'. The 1645 same is true of the tag name.""" 1646 1647 generator = self.descendants 1648 if not recursive: 1649 generator = self.children 1650 return self._find_all(name, attrs, text, limit, generator, **kwargs) 1651 findAll = find_all # BS3 1652 findChildren = find_all # BS2 1653 1654 #Generator methods 1655 @property 1656 def children(self): 1657 # return iter() to make the purpose of the method clear 1658 return iter(self.contents) # XXX This seems to be untested. 1659 1660 @property 1661 def descendants(self): 1662 if not len(self.contents): 1663 return 1664 stopNode = self._last_descendant().next_element 1665 current = self.contents[0] 1666 while current is not stopNode: 1667 yield current 1668 current = current.next_element 1669 1670 # CSS selector code 1671 1672 _selector_combinators = ['>', '+', '~'] 1673 _select_debug = False 1674 def select_one(self, selector): 1675 """Perform a CSS selection operation on the current element.""" 1676 value = self.select(selector, limit=1) 1677 if value: 1678 return value[0] 1679 return None 1680 1681 def select(self, selector, _candidate_generator=None, limit=None): 1682 """Perform a CSS selection operation on the current element.""" 1683 1684 # Remove whitespace directly after the grouping operator ',' 1685 # then split into tokens. 1686 tokens = re.sub(',[\s]*',',', selector).split() 1687 current_context = [self] 1688 1689 if tokens[-1] in self._selector_combinators: 1690 raise ValueError( 1691 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1692 1693 if self._select_debug: 1694 print('Running CSS selector "%s"' % selector) 1695 1696 for index, token_group in enumerate(tokens): 1697 new_context = [] 1698 new_context_ids = set([]) 1699 1700 # Grouping selectors, ie: p,a 1701 grouped_tokens = token_group.split(',') 1702 if '' in grouped_tokens: 1703 raise ValueError('Invalid group selection syntax: %s' % token_group) 1704 1705 if tokens[index-1] in self._selector_combinators: 1706 # This token was consumed by the previous combinator. Skip it. 1707 if self._select_debug: 1708 print(' Token was consumed by the previous combinator.') 1709 continue 1710 1711 for token in grouped_tokens: 1712 if self._select_debug: 1713 print(' Considering token "%s"' % token) 1714 recursive_candidate_generator = None 1715 tag_name = None 1716 1717 # Each operation corresponds to a checker function, a rule 1718 # for determining whether a candidate matches the 1719 # selector. Candidates are generated by the active 1720 # iterator. 1721 checker = None 1722 1723 m = self.attribselect_re.match(token) 1724 if m is not None: 1725 # Attribute selector 1726 tag_name, attribute, operator, value = m.groups() 1727 checker = self._attribute_checker(operator, attribute, value) 1728 1729 elif '#' in token: 1730 # ID selector 1731 tag_name, tag_id = token.split('#', 1) 1732 def id_matches(tag): 1733 return tag.get('id', None) == tag_id 1734 checker = id_matches 1735 1736 elif '.' in token: 1737 # Class selector 1738 tag_name, klass = token.split('.', 1) 1739 classes = set(klass.split('.')) 1740 def classes_match(candidate): 1741 return classes.issubset(candidate.get('class', [])) 1742 checker = classes_match 1743 1744 elif ':' in token: 1745 # Pseudo-class 1746 tag_name, pseudo = token.split(':', 1) 1747 if tag_name == '': 1748 raise ValueError( 1749 "A pseudo-class must be prefixed with a tag name.") 1750 pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1751 found = [] 1752 if pseudo_attributes is None: 1753 pseudo_type = pseudo 1754 pseudo_value = None 1755 else: 1756 pseudo_type, pseudo_value = pseudo_attributes.groups() 1757 if pseudo_type == 'nth-of-type': 1758 try: 1759 pseudo_value = int(pseudo_value) 1760 except: 1761 raise NotImplementedError( 1762 'Only numeric values are currently supported for the nth-of-type pseudo-class.') 1763 if pseudo_value < 1: 1764 raise ValueError( 1765 'nth-of-type pseudo-class value must be at least 1.') 1766 class Counter(object): 1767 def __init__(self, destination): 1768 self.count = 0 1769 self.destination = destination 1770 1771 def nth_child_of_type(self, tag): 1772 self.count += 1 1773 if self.count == self.destination: 1774 return True 1775 if self.count > self.destination: 1776 # Stop the generator that's sending us 1777 # these things. 1778 raise StopIteration() 1779 return False 1780 checker = Counter(pseudo_value).nth_child_of_type 1781 else: 1782 raise NotImplementedError( 1783 'Only the following pseudo-classes are implemented: nth-of-type.') 1784 1785 elif token == '*': 1786 # Star selector -- matches everything 1787 pass 1788 elif token == '>': 1789 # Run the next token as a CSS selector against the 1790 # direct children of each tag in the current context. 1791 recursive_candidate_generator = lambda tag: tag.children 1792 elif token == '~': 1793 # Run the next token as a CSS selector against the 1794 # siblings of each tag in the current context. 1795 recursive_candidate_generator = lambda tag: tag.next_siblings 1796 elif token == '+': 1797 # For each tag in the current context, run the next 1798 # token as a CSS selector against the tag's next 1799 # sibling that's a tag. 1800 def next_tag_sibling(tag): 1801 yield tag.find_next_sibling(True) 1802 recursive_candidate_generator = next_tag_sibling 1803 1804 elif self.tag_name_re.match(token): 1805 # Just a tag name. 1806 tag_name = token 1807 else: 1808 raise ValueError( 1809 'Unsupported or invalid CSS selector: "%s"' % token) 1810 if recursive_candidate_generator: 1811 # This happens when the selector looks like "> foo". 1812 # 1813 # The generator calls select() recursively on every 1814 # member of the current context, passing in a different 1815 # candidate generator and a different selector. 1816 # 1817 # In the case of "> foo", the candidate generator is 1818 # one that yields a tag's direct children (">"), and 1819 # the selector is "foo". 1820 next_token = tokens[index+1] 1821 def recursive_select(tag): 1822 if self._select_debug: 1823 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) 1824 print('-' * 40) 1825 for i in tag.select(next_token, recursive_candidate_generator): 1826 if self._select_debug: 1827 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) 1828 yield i 1829 if self._select_debug: 1830 print('-' * 40) 1831 _use_candidate_generator = recursive_select 1832 elif _candidate_generator is None: 1833 # By default, a tag's candidates are all of its 1834 # children. If tag_name is defined, only yield tags 1835 # with that name. 1836 if self._select_debug: 1837 if tag_name: 1838 check = "[any]" 1839 else: 1840 check = tag_name 1841 print(' Default candidate generator, tag name="%s"' % check) 1842 if self._select_debug: 1843 # This is redundant with later code, but it stops 1844 # a bunch of bogus tags from cluttering up the 1845 # debug log. 1846 def default_candidate_generator(tag): 1847 for child in tag.descendants: 1848 if not isinstance(child, Tag): 1849 continue 1850 if tag_name and not child.name == tag_name: 1851 continue 1852 yield child 1853 _use_candidate_generator = default_candidate_generator 1854 else: 1855 _use_candidate_generator = lambda tag: tag.descendants 1856 else: 1857 _use_candidate_generator = _candidate_generator 1858 1859 count = 0 1860 for tag in current_context: 1861 if self._select_debug: 1862 print(" Running candidate generator on %s %s" % ( 1863 tag.name, repr(tag.attrs))) 1864 for candidate in _use_candidate_generator(tag): 1865 if not isinstance(candidate, Tag): 1866 continue 1867 if tag_name and candidate.name != tag_name: 1868 continue 1869 if checker is not None: 1870 try: 1871 result = checker(candidate) 1872 except StopIteration: 1873 # The checker has decided we should no longer 1874 # run the generator. 1875 break 1876 if checker is None or result: 1877 if self._select_debug: 1878 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) 1879 if id(candidate) not in new_context_ids: 1880 # If a tag matches a selector more than once, 1881 # don't include it in the context more than once. 1882 new_context.append(candidate) 1883 new_context_ids.add(id(candidate)) 1884 if limit and len(new_context) >= limit: 1885 break 1886 elif self._select_debug: 1887 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) 1888 1889 1890 current_context = new_context 1891 1892 if self._select_debug: 1893 print("Final verdict:") 1894 for i in current_context: 1895 print(" %s %s" % (i.name, i.attrs)) 1896 return current_context 1897 1898 # Old names for backwards compatibility 1899 def childGenerator(self): 1900 return self.children 1901 1902 def recursiveChildGenerator(self): 1903 return self.descendants 1904 1905 def has_key(self, key): 1906 """This was kind of misleading because has_key() (attributes) 1907 was different from __in__ (contents). has_key() is gone in 1908 Python 3, anyway.""" 1909 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 1910 key)) 1911 return self.has_attr(key) 1912 1913# Next, a couple classes to represent queries and their results. 1914class SoupStrainer(object): 1915 """Encapsulates a number of ways of matching a markup element (tag or 1916 text).""" 1917 1918 def __init__(self, name=None, attrs=OrderedDict(), text=None, **kwargs): 1919 self.name = self._normalize_search_value(name) 1920 if not isinstance(attrs, dict): 1921 # Treat a non-dict value for attrs as a search for the 'class' 1922 # attribute. 1923 kwargs['class'] = attrs 1924 attrs = None 1925 1926 if 'class_' in kwargs: 1927 # Treat class_="foo" as a search for the 'class' 1928 # attribute, overriding any non-dict value for attrs. 1929 kwargs['class'] = kwargs['class_'] 1930 del kwargs['class_'] 1931 1932 if kwargs: 1933 if attrs: 1934 attrs = attrs.copy() 1935 attrs.update(kwargs) 1936 else: 1937 attrs = kwargs 1938 normalized_attrs = OrderedDict() 1939 for key, value in list(attrs.items()): 1940 normalized_attrs[key] = self._normalize_search_value(value) 1941 1942 self.attrs = normalized_attrs 1943 self.text = self._normalize_search_value(text) 1944 1945 def _normalize_search_value(self, value): 1946 # Leave it alone if it's a Unicode string, a callable, a 1947 # regular expression, a boolean, or None. 1948 if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') 1949 or isinstance(value, bool) or value is None): 1950 return value 1951 1952 # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 1953 if isinstance(value, bytes): 1954 return value.decode("utf8") 1955 1956 # If it's listlike, convert it into a list of strings. 1957 if hasattr(value, '__iter__'): 1958 new_value = [] 1959 for v in value: 1960 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1961 and not isinstance(v, str)): 1962 # This is almost certainly the user's mistake. In the 1963 # interests of avoiding infinite loops, we'll let 1964 # it through as-is rather than doing a recursive call. 1965 new_value.append(v) 1966 else: 1967 new_value.append(self._normalize_search_value(v)) 1968 return new_value 1969 1970 # Otherwise, convert it into a Unicode string. 1971 return str(value) 1972 1973 def __str__(self): 1974 if self.text: 1975 return self.text 1976 else: 1977 return "%s|%s" % (self.name, self.attrs) 1978 1979 def search_tag(self, markup_name=None, markup_attrs=OrderedDict()): 1980 found = None 1981 markup = None 1982 if isinstance(markup_name, Tag): 1983 markup = markup_name 1984 markup_attrs = markup 1985 call_function_with_tag_data = ( 1986 isinstance(self.name, Callable) 1987 and not isinstance(markup_name, Tag)) 1988 1989 if ((not self.name) 1990 or call_function_with_tag_data 1991 or (markup and self._matches(markup, self.name)) 1992 or (not markup and self._matches(markup_name, self.name))): 1993 if call_function_with_tag_data: 1994 match = self.name(markup_name, markup_attrs) 1995 else: 1996 match = True 1997 markup_attr_map = None 1998 for attr, match_against in list(self.attrs.items()): 1999 if not markup_attr_map: 2000 if hasattr(markup_attrs, 'get'): 2001 markup_attr_map = markup_attrs 2002 else: 2003 markup_attr_map = OrderedDict() 2004 for k, v in markup_attrs: 2005 markup_attr_map[k] = v 2006 attr_value = markup_attr_map.get(attr) 2007 if not self._matches(attr_value, match_against): 2008 match = False 2009 break 2010 if match: 2011 if markup: 2012 found = markup 2013 else: 2014 found = markup_name 2015 if found and self.text and not self._matches(found.string, self.text): 2016 found = None 2017 return found 2018 searchTag = search_tag 2019 2020 def search(self, markup): 2021 # print 'looking for %s in %s' % (self, markup) 2022 found = None 2023 # If given a list of items, scan it for a text element that 2024 # matches. 2025 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): 2026 for element in markup: 2027 if isinstance(element, NavigableString) \ 2028 and self.search(element): 2029 found = element 2030 break 2031 # If it's a Tag, make sure its name or attributes match. 2032 # Don't bother with Tags if we're searching for text. 2033 elif isinstance(markup, Tag): 2034 if not self.text or self.name or self.attrs: 2035 found = self.search_tag(markup) 2036 # If it's text, make sure the text matches. 2037 elif isinstance(markup, NavigableString) or \ 2038 isinstance(markup, str): 2039 if not self.name and not self.attrs and self._matches(markup, self.text): 2040 found = markup 2041 else: 2042 raise Exception( 2043 "I don't know how to match against a %s" % markup.__class__) 2044 return found 2045 2046 def _matches(self, markup, match_against): 2047 # print u"Matching %s against %s" % (markup, match_against) 2048 result = False 2049 if isinstance(markup, list) or isinstance(markup, tuple): 2050 # This should only happen when searching a multi-valued attribute 2051 # like 'class'. 2052 if (isinstance(match_against, str) 2053 and ' ' in match_against): 2054 # A bit of a special case. If they try to match "foo 2055 # bar" on a multivalue attribute's value, only accept 2056 # the literal value "foo bar" 2057 # 2058 # XXX This is going to be pretty slow because we keep 2059 # splitting match_against. But it shouldn't come up 2060 # too often. 2061 return (whitespace_re.split(match_against) == markup) 2062 else: 2063 for item in markup: 2064 if self._matches(item, match_against): 2065 return True 2066 return False 2067 2068 if match_against is True: 2069 # True matches any non-None value. 2070 return markup is not None 2071 2072 if isinstance(match_against, Callable): 2073 return match_against(markup) 2074 2075 # Custom callables take the tag as an argument, but all 2076 # other ways of matching match the tag name as a string. 2077 if isinstance(markup, Tag): 2078 markup = markup.name 2079 2080 # Ensure that `markup` is either a Unicode string, or None. 2081 markup = self._normalize_search_value(markup) 2082 2083 if markup is None: 2084 # None matches None, False, an empty string, an empty list, and so on. 2085 return not match_against 2086 2087 if isinstance(match_against, str): 2088 # Exact string match 2089 return markup == match_against 2090 2091 if hasattr(match_against, 'match'): 2092 # Regexp match 2093 return match_against.search(markup) 2094 2095 if hasattr(match_against, '__iter__'): 2096 # The markup must be an exact match against something 2097 # in the iterable. 2098 return markup in match_against 2099 2100 2101class ResultSet(list): 2102 """A ResultSet is just a list that keeps track of the SoupStrainer 2103 that created it.""" 2104 def __init__(self, source, result=()): 2105 super(ResultSet, self).__init__(result) 2106 self.source = source 2107