1# -*- coding: utf-8 -*- 2"""Tests for Beautiful Soup's tree traversal methods. 3 4The tree traversal methods are the main advantage of using Beautiful 5Soup over just using a parser. 6 7Different parsers will build different Beautiful Soup trees given the 8same markup, but all Beautiful Soup trees can be traversed with the 9methods tested here. 10""" 11 12from pdb import set_trace 13import copy 14import pickle 15import re 16import warnings 17from bs4 import BeautifulSoup 18from bs4.builder import ( 19 builder_registry, 20 HTMLParserTreeBuilder, 21) 22from bs4.element import ( 23 PY3K, 24 CData, 25 Comment, 26 Declaration, 27 Doctype, 28 Formatter, 29 NavigableString, 30 SoupStrainer, 31 Tag, 32) 33from bs4.testing import ( 34 SoupTest, 35 skipIf, 36) 37 38XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) 39LXML_PRESENT = (builder_registry.lookup("lxml") is not None) 40 41class TreeTest(SoupTest): 42 43 def assertSelects(self, tags, should_match): 44 """Make sure that the given tags have the correct text. 45 46 This is used in tests that define a bunch of tags, each 47 containing a single string, and then select certain strings by 48 some mechanism. 49 """ 50 self.assertEqual([tag.string for tag in tags], should_match) 51 52 def assertSelectsIDs(self, tags, should_match): 53 """Make sure that the given tags have the correct IDs. 54 55 This is used in tests that define a bunch of tags, each 56 containing a single string, and then select certain strings by 57 some mechanism. 58 """ 59 self.assertEqual([tag['id'] for tag in tags], should_match) 60 61 62class TestFind(TreeTest): 63 """Basic tests of the find() method. 64 65 find() just calls find_all() with limit=1, so it's not tested all 66 that thouroughly here. 67 """ 68 69 def test_find_tag(self): 70 soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>") 71 self.assertEqual(soup.find("b").string, "2") 72 73 def test_unicode_text_find(self): 74 soup = self.soup('<h1>Räksmörgås</h1>') 75 self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') 76 77 def test_unicode_attribute_find(self): 78 soup = self.soup('<h1 id="Räksmörgås">here it is</h1>') 79 str(soup) 80 self.assertEqual("here it is", soup.find(id='Räksmörgås').text) 81 82 83 def test_find_everything(self): 84 """Test an optimization that finds all tags.""" 85 soup = self.soup("<a>foo</a><b>bar</b>") 86 self.assertEqual(2, len(soup.find_all())) 87 88 def test_find_everything_with_name(self): 89 """Test an optimization that finds all tags with a given name.""" 90 soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>") 91 self.assertEqual(2, len(soup.find_all('a'))) 92 93class TestFindAll(TreeTest): 94 """Basic tests of the find_all() method.""" 95 96 def test_find_all_text_nodes(self): 97 """You can search the tree for text nodes.""" 98 soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") 99 # Exact match. 100 self.assertEqual(soup.find_all(string="bar"), ["bar"]) 101 self.assertEqual(soup.find_all(text="bar"), ["bar"]) 102 # Match any of a number of strings. 103 self.assertEqual( 104 soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) 105 # Match a regular expression. 106 self.assertEqual(soup.find_all(text=re.compile('.*')), 107 ["Foo", "bar", '\xbb']) 108 # Match anything. 109 self.assertEqual(soup.find_all(text=True), 110 ["Foo", "bar", '\xbb']) 111 112 def test_find_all_limit(self): 113 """You can limit the number of items returned by find_all.""" 114 soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>") 115 self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) 116 self.assertSelects(soup.find_all('a', limit=1), ["1"]) 117 self.assertSelects( 118 soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) 119 120 # A limit of 0 means no limit. 121 self.assertSelects( 122 soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) 123 124 def test_calling_a_tag_is_calling_findall(self): 125 soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>") 126 self.assertSelects(soup('a', limit=1), ["1"]) 127 self.assertSelects(soup.b(id="foo"), ["3"]) 128 129 def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): 130 soup = self.soup("<a></a>") 131 # Create a self-referential list. 132 l = [] 133 l.append(l) 134 135 # Without special code in _normalize_search_value, this would cause infinite 136 # recursion. 137 self.assertEqual([], soup.find_all(l)) 138 139 def test_find_all_resultset(self): 140 """All find_all calls return a ResultSet""" 141 soup = self.soup("<a></a>") 142 result = soup.find_all("a") 143 self.assertTrue(hasattr(result, "source")) 144 145 result = soup.find_all(True) 146 self.assertTrue(hasattr(result, "source")) 147 148 result = soup.find_all(text="foo") 149 self.assertTrue(hasattr(result, "source")) 150 151 152class TestFindAllBasicNamespaces(TreeTest): 153 154 def test_find_by_namespaced_name(self): 155 soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') 156 self.assertEqual("4", soup.find("mathml:msqrt").string) 157 self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) 158 159 160class TestFindAllByName(TreeTest): 161 """Test ways of finding tags by tag name.""" 162 163 def setUp(self): 164 super(TreeTest, self).setUp() 165 self.tree = self.soup("""<a>First tag.</a> 166 <b>Second tag.</b> 167 <c>Third <a>Nested tag.</a> tag.</c>""") 168 169 def test_find_all_by_tag_name(self): 170 # Find all the <a> tags. 171 self.assertSelects( 172 self.tree.find_all('a'), ['First tag.', 'Nested tag.']) 173 174 def test_find_all_by_name_and_text(self): 175 self.assertSelects( 176 self.tree.find_all('a', text='First tag.'), ['First tag.']) 177 178 self.assertSelects( 179 self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) 180 181 self.assertSelects( 182 self.tree.find_all('a', text=re.compile("tag")), 183 ['First tag.', 'Nested tag.']) 184 185 186 def test_find_all_on_non_root_element(self): 187 # You can call find_all on any node, not just the root. 188 self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) 189 190 def test_calling_element_invokes_find_all(self): 191 self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) 192 193 def test_find_all_by_tag_strainer(self): 194 self.assertSelects( 195 self.tree.find_all(SoupStrainer('a')), 196 ['First tag.', 'Nested tag.']) 197 198 def test_find_all_by_tag_names(self): 199 self.assertSelects( 200 self.tree.find_all(['a', 'b']), 201 ['First tag.', 'Second tag.', 'Nested tag.']) 202 203 def test_find_all_by_tag_dict(self): 204 self.assertSelects( 205 self.tree.find_all({'a' : True, 'b' : True}), 206 ['First tag.', 'Second tag.', 'Nested tag.']) 207 208 def test_find_all_by_tag_re(self): 209 self.assertSelects( 210 self.tree.find_all(re.compile('^[ab]$')), 211 ['First tag.', 'Second tag.', 'Nested tag.']) 212 213 def test_find_all_with_tags_matching_method(self): 214 # You can define an oracle method that determines whether 215 # a tag matches the search. 216 def id_matches_name(tag): 217 return tag.name == tag.get('id') 218 219 tree = self.soup("""<a id="a">Match 1.</a> 220 <a id="1">Does not match.</a> 221 <b id="b">Match 2.</a>""") 222 223 self.assertSelects( 224 tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) 225 226 def test_find_with_multi_valued_attribute(self): 227 soup = self.soup( 228 "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>" 229 ) 230 r1 = soup.find('div', 'a d'); 231 r2 = soup.find('div', re.compile(r'a d')); 232 r3, r4 = soup.find_all('div', ['a b', 'a d']); 233 self.assertEqual('3', r1.string) 234 self.assertEqual('3', r2.string) 235 self.assertEqual('1', r3.string) 236 self.assertEqual('3', r4.string) 237 238 239class TestFindAllByAttribute(TreeTest): 240 241 def test_find_all_by_attribute_name(self): 242 # You can pass in keyword arguments to find_all to search by 243 # attribute. 244 tree = self.soup(""" 245 <a id="first">Matching a.</a> 246 <a id="second"> 247 Non-matching <b id="first">Matching b.</b>a. 248 </a>""") 249 self.assertSelects(tree.find_all(id='first'), 250 ["Matching a.", "Matching b."]) 251 252 def test_find_all_by_utf8_attribute_value(self): 253 peace = "םולש".encode("utf8") 254 data = '<a title="םולש"></a>'.encode("utf8") 255 soup = self.soup(data) 256 self.assertEqual([soup.a], soup.find_all(title=peace)) 257 self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) 258 self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) 259 260 def test_find_all_by_attribute_dict(self): 261 # You can pass in a dictionary as the argument 'attrs'. This 262 # lets you search for attributes like 'name' (a fixed argument 263 # to find_all) and 'class' (a reserved word in Python.) 264 tree = self.soup(""" 265 <a name="name1" class="class1">Name match.</a> 266 <a name="name2" class="class2">Class match.</a> 267 <a name="name3" class="class3">Non-match.</a> 268 <name1>A tag called 'name1'.</name1> 269 """) 270 271 # This doesn't do what you want. 272 self.assertSelects(tree.find_all(name='name1'), 273 ["A tag called 'name1'."]) 274 # This does what you want. 275 self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), 276 ["Name match."]) 277 278 self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), 279 ["Class match."]) 280 281 def test_find_all_by_class(self): 282 tree = self.soup(""" 283 <a class="1">Class 1.</a> 284 <a class="2">Class 2.</a> 285 <b class="1">Class 1.</b> 286 <c class="3 4">Class 3 and 4.</c> 287 """) 288 289 # Passing in the class_ keyword argument will search against 290 # the 'class' attribute. 291 self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) 292 self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) 293 self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) 294 295 # Passing in a string to 'attrs' will also search the CSS class. 296 self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) 297 self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) 298 self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) 299 self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) 300 301 def test_find_by_class_when_multiple_classes_present(self): 302 tree = self.soup("<gar class='foo bar'>Found it</gar>") 303 304 f = tree.find_all("gar", class_=re.compile("o")) 305 self.assertSelects(f, ["Found it"]) 306 307 f = tree.find_all("gar", class_=re.compile("a")) 308 self.assertSelects(f, ["Found it"]) 309 310 # If the search fails to match the individual strings "foo" and "bar", 311 # it will be tried against the combined string "foo bar". 312 f = tree.find_all("gar", class_=re.compile("o b")) 313 self.assertSelects(f, ["Found it"]) 314 315 def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): 316 soup = self.soup("<a class='bar'>Found it</a>") 317 318 self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) 319 320 def big_attribute_value(value): 321 return len(value) > 3 322 323 self.assertSelects(soup.find_all("a", big_attribute_value), []) 324 325 def small_attribute_value(value): 326 return len(value) <= 3 327 328 self.assertSelects( 329 soup.find_all("a", small_attribute_value), ["Found it"]) 330 331 def test_find_all_with_string_for_attrs_finds_multiple_classes(self): 332 soup = self.soup('<a class="foo bar"></a><a class="foo"></a>') 333 a, a2 = soup.find_all("a") 334 self.assertEqual([a, a2], soup.find_all("a", "foo")) 335 self.assertEqual([a], soup.find_all("a", "bar")) 336 337 # If you specify the class as a string that contains a 338 # space, only that specific value will be found. 339 self.assertEqual([a], soup.find_all("a", class_="foo bar")) 340 self.assertEqual([a], soup.find_all("a", "foo bar")) 341 self.assertEqual([], soup.find_all("a", "bar foo")) 342 343 def test_find_all_by_attribute_soupstrainer(self): 344 tree = self.soup(""" 345 <a id="first">Match.</a> 346 <a id="second">Non-match.</a>""") 347 348 strainer = SoupStrainer(attrs={'id' : 'first'}) 349 self.assertSelects(tree.find_all(strainer), ['Match.']) 350 351 def test_find_all_with_missing_attribute(self): 352 # You can pass in None as the value of an attribute to find_all. 353 # This will match tags that do not have that attribute set. 354 tree = self.soup("""<a id="1">ID present.</a> 355 <a>No ID present.</a> 356 <a id="">ID is empty.</a>""") 357 self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) 358 359 def test_find_all_with_defined_attribute(self): 360 # You can pass in None as the value of an attribute to find_all. 361 # This will match tags that have that attribute set to any value. 362 tree = self.soup("""<a id="1">ID present.</a> 363 <a>No ID present.</a> 364 <a id="">ID is empty.</a>""") 365 self.assertSelects( 366 tree.find_all(id=True), ["ID present.", "ID is empty."]) 367 368 def test_find_all_with_numeric_attribute(self): 369 # If you search for a number, it's treated as a string. 370 tree = self.soup("""<a id=1>Unquoted attribute.</a> 371 <a id="1">Quoted attribute.</a>""") 372 373 expected = ["Unquoted attribute.", "Quoted attribute."] 374 self.assertSelects(tree.find_all(id=1), expected) 375 self.assertSelects(tree.find_all(id="1"), expected) 376 377 def test_find_all_with_list_attribute_values(self): 378 # You can pass a list of attribute values instead of just one, 379 # and you'll get tags that match any of the values. 380 tree = self.soup("""<a id="1">1</a> 381 <a id="2">2</a> 382 <a id="3">3</a> 383 <a>No ID.</a>""") 384 self.assertSelects(tree.find_all(id=["1", "3", "4"]), 385 ["1", "3"]) 386 387 def test_find_all_with_regular_expression_attribute_value(self): 388 # You can pass a regular expression as an attribute value, and 389 # you'll get tags whose values for that attribute match the 390 # regular expression. 391 tree = self.soup("""<a id="a">One a.</a> 392 <a id="aa">Two as.</a> 393 <a id="ab">Mixed as and bs.</a> 394 <a id="b">One b.</a> 395 <a>No ID.</a>""") 396 397 self.assertSelects(tree.find_all(id=re.compile("^a+$")), 398 ["One a.", "Two as."]) 399 400 def test_find_by_name_and_containing_string(self): 401 soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>") 402 a = soup.a 403 404 self.assertEqual([a], soup.find_all("a", text="foo")) 405 self.assertEqual([], soup.find_all("a", text="bar")) 406 self.assertEqual([], soup.find_all("a", text="bar")) 407 408 def test_find_by_name_and_containing_string_when_string_is_buried(self): 409 soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>") 410 self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) 411 412 def test_find_by_attribute_and_containing_string(self): 413 soup = self.soup('<b id="1">foo</b><a id="2">foo</a>') 414 a = soup.a 415 416 self.assertEqual([a], soup.find_all(id=2, text="foo")) 417 self.assertEqual([], soup.find_all(id=1, text="bar")) 418 419 420class TestSmooth(TreeTest): 421 """Test Tag.smooth.""" 422 423 def test_smooth(self): 424 soup = self.soup("<div>a</div>") 425 div = soup.div 426 div.append("b") 427 div.append("c") 428 div.append(Comment("Comment 1")) 429 div.append(Comment("Comment 2")) 430 div.append("d") 431 builder = self.default_builder() 432 span = Tag(soup, builder, 'span') 433 span.append('1') 434 span.append('2') 435 div.append(span) 436 437 # At this point the tree has a bunch of adjacent 438 # NavigableStrings. This is normal, but it has no meaning in 439 # terms of HTML, so we may want to smooth things out for 440 # output. 441 442 # Since the <span> tag has two children, its .string is None. 443 self.assertEqual(None, div.span.string) 444 445 self.assertEqual(7, len(div.contents)) 446 div.smooth() 447 self.assertEqual(5, len(div.contents)) 448 449 # The three strings at the beginning of div.contents have been 450 # merged into on string. 451 # 452 self.assertEqual('abc', div.contents[0]) 453 454 # The call is recursive -- the <span> tag was also smoothed. 455 self.assertEqual('12', div.span.string) 456 457 # The two comments have _not_ been merged, even though 458 # comments are strings. Merging comments would change the 459 # meaning of the HTML. 460 self.assertEqual('Comment 1', div.contents[1]) 461 self.assertEqual('Comment 2', div.contents[2]) 462 463 464class TestIndex(TreeTest): 465 """Test Tag.index""" 466 def test_index(self): 467 tree = self.soup("""<div> 468 <a>Identical</a> 469 <b>Not identical</b> 470 <a>Identical</a> 471 472 <c><d>Identical with child</d></c> 473 <b>Also not identical</b> 474 <c><d>Identical with child</d></c> 475 </div>""") 476 div = tree.div 477 for i, element in enumerate(div.contents): 478 self.assertEqual(i, div.index(element)) 479 self.assertRaises(ValueError, tree.index, 1) 480 481 482class TestParentOperations(TreeTest): 483 """Test navigation and searching through an element's parents.""" 484 485 def setUp(self): 486 super(TestParentOperations, self).setUp() 487 self.tree = self.soup('''<ul id="empty"></ul> 488 <ul id="top"> 489 <ul id="middle"> 490 <ul id="bottom"> 491 <b>Start here</b> 492 </ul> 493 </ul>''') 494 self.start = self.tree.b 495 496 497 def test_parent(self): 498 self.assertEqual(self.start.parent['id'], 'bottom') 499 self.assertEqual(self.start.parent.parent['id'], 'middle') 500 self.assertEqual(self.start.parent.parent.parent['id'], 'top') 501 502 def test_parent_of_top_tag_is_soup_object(self): 503 top_tag = self.tree.contents[0] 504 self.assertEqual(top_tag.parent, self.tree) 505 506 def test_soup_object_has_no_parent(self): 507 self.assertEqual(None, self.tree.parent) 508 509 def test_find_parents(self): 510 self.assertSelectsIDs( 511 self.start.find_parents('ul'), ['bottom', 'middle', 'top']) 512 self.assertSelectsIDs( 513 self.start.find_parents('ul', id="middle"), ['middle']) 514 515 def test_find_parent(self): 516 self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') 517 self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') 518 519 def test_parent_of_text_element(self): 520 text = self.tree.find(text="Start here") 521 self.assertEqual(text.parent.name, 'b') 522 523 def test_text_element_find_parent(self): 524 text = self.tree.find(text="Start here") 525 self.assertEqual(text.find_parent('ul')['id'], 'bottom') 526 527 def test_parent_generator(self): 528 parents = [parent['id'] for parent in self.start.parents 529 if parent is not None and 'id' in parent.attrs] 530 self.assertEqual(parents, ['bottom', 'middle', 'top']) 531 532 533class ProximityTest(TreeTest): 534 535 def setUp(self): 536 super(TreeTest, self).setUp() 537 self.tree = self.soup( 538 '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>') 539 540 541class TestNextOperations(ProximityTest): 542 543 def setUp(self): 544 super(TestNextOperations, self).setUp() 545 self.start = self.tree.b 546 547 def test_next(self): 548 self.assertEqual(self.start.next_element, "One") 549 self.assertEqual(self.start.next_element.next_element['id'], "2") 550 551 def test_next_of_last_item_is_none(self): 552 last = self.tree.find(text="Three") 553 self.assertEqual(last.next_element, None) 554 555 def test_next_of_root_is_none(self): 556 # The document root is outside the next/previous chain. 557 self.assertEqual(self.tree.next_element, None) 558 559 def test_find_all_next(self): 560 self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) 561 self.start.find_all_next(id=3) 562 self.assertSelects(self.start.find_all_next(id=3), ["Three"]) 563 564 def test_find_next(self): 565 self.assertEqual(self.start.find_next('b')['id'], '2') 566 self.assertEqual(self.start.find_next(text="Three"), "Three") 567 568 def test_find_next_for_text_element(self): 569 text = self.tree.find(text="One") 570 self.assertEqual(text.find_next("b").string, "Two") 571 self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) 572 573 def test_next_generator(self): 574 start = self.tree.find(text="Two") 575 successors = [node for node in start.next_elements] 576 # There are two successors: the final <b> tag and its text contents. 577 tag, contents = successors 578 self.assertEqual(tag['id'], '3') 579 self.assertEqual(contents, "Three") 580 581class TestPreviousOperations(ProximityTest): 582 583 def setUp(self): 584 super(TestPreviousOperations, self).setUp() 585 self.end = self.tree.find(text="Three") 586 587 def test_previous(self): 588 self.assertEqual(self.end.previous_element['id'], "3") 589 self.assertEqual(self.end.previous_element.previous_element, "Two") 590 591 def test_previous_of_first_item_is_none(self): 592 first = self.tree.find('html') 593 self.assertEqual(first.previous_element, None) 594 595 def test_previous_of_root_is_none(self): 596 # The document root is outside the next/previous chain. 597 # XXX This is broken! 598 #self.assertEqual(self.tree.previous_element, None) 599 pass 600 601 def test_find_all_previous(self): 602 # The <b> tag containing the "Three" node is the predecessor 603 # of the "Three" node itself, which is why "Three" shows up 604 # here. 605 self.assertSelects( 606 self.end.find_all_previous('b'), ["Three", "Two", "One"]) 607 self.assertSelects(self.end.find_all_previous(id=1), ["One"]) 608 609 def test_find_previous(self): 610 self.assertEqual(self.end.find_previous('b')['id'], '3') 611 self.assertEqual(self.end.find_previous(text="One"), "One") 612 613 def test_find_previous_for_text_element(self): 614 text = self.tree.find(text="Three") 615 self.assertEqual(text.find_previous("b").string, "Three") 616 self.assertSelects( 617 text.find_all_previous("b"), ["Three", "Two", "One"]) 618 619 def test_previous_generator(self): 620 start = self.tree.find(text="One") 621 predecessors = [node for node in start.previous_elements] 622 623 # There are four predecessors: the <b> tag containing "One" 624 # the <body> tag, the <head> tag, and the <html> tag. 625 b, body, head, html = predecessors 626 self.assertEqual(b['id'], '1') 627 self.assertEqual(body.name, "body") 628 self.assertEqual(head.name, "head") 629 self.assertEqual(html.name, "html") 630 631 632class SiblingTest(TreeTest): 633 634 def setUp(self): 635 super(SiblingTest, self).setUp() 636 markup = '''<html> 637 <span id="1"> 638 <span id="1.1"></span> 639 </span> 640 <span id="2"> 641 <span id="2.1"></span> 642 </span> 643 <span id="3"> 644 <span id="3.1"></span> 645 </span> 646 <span id="4"></span> 647 </html>''' 648 # All that whitespace looks good but makes the tests more 649 # difficult. Get rid of it. 650 markup = re.compile(r"\n\s*").sub("", markup) 651 self.tree = self.soup(markup) 652 653 654class TestNextSibling(SiblingTest): 655 656 def setUp(self): 657 super(TestNextSibling, self).setUp() 658 self.start = self.tree.find(id="1") 659 660 def test_next_sibling_of_root_is_none(self): 661 self.assertEqual(self.tree.next_sibling, None) 662 663 def test_next_sibling(self): 664 self.assertEqual(self.start.next_sibling['id'], '2') 665 self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') 666 667 # Note the difference between next_sibling and next_element. 668 self.assertEqual(self.start.next_element['id'], '1.1') 669 670 def test_next_sibling_may_not_exist(self): 671 self.assertEqual(self.tree.html.next_sibling, None) 672 673 nested_span = self.tree.find(id="1.1") 674 self.assertEqual(nested_span.next_sibling, None) 675 676 last_span = self.tree.find(id="4") 677 self.assertEqual(last_span.next_sibling, None) 678 679 def test_find_next_sibling(self): 680 self.assertEqual(self.start.find_next_sibling('span')['id'], '2') 681 682 def test_next_siblings(self): 683 self.assertSelectsIDs(self.start.find_next_siblings("span"), 684 ['2', '3', '4']) 685 686 self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) 687 688 def test_next_sibling_for_text_element(self): 689 soup = self.soup("Foo<b>bar</b>baz") 690 start = soup.find(text="Foo") 691 self.assertEqual(start.next_sibling.name, 'b') 692 self.assertEqual(start.next_sibling.next_sibling, 'baz') 693 694 self.assertSelects(start.find_next_siblings('b'), ['bar']) 695 self.assertEqual(start.find_next_sibling(text="baz"), "baz") 696 self.assertEqual(start.find_next_sibling(text="nonesuch"), None) 697 698 699class TestPreviousSibling(SiblingTest): 700 701 def setUp(self): 702 super(TestPreviousSibling, self).setUp() 703 self.end = self.tree.find(id="4") 704 705 def test_previous_sibling_of_root_is_none(self): 706 self.assertEqual(self.tree.previous_sibling, None) 707 708 def test_previous_sibling(self): 709 self.assertEqual(self.end.previous_sibling['id'], '3') 710 self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') 711 712 # Note the difference between previous_sibling and previous_element. 713 self.assertEqual(self.end.previous_element['id'], '3.1') 714 715 def test_previous_sibling_may_not_exist(self): 716 self.assertEqual(self.tree.html.previous_sibling, None) 717 718 nested_span = self.tree.find(id="1.1") 719 self.assertEqual(nested_span.previous_sibling, None) 720 721 first_span = self.tree.find(id="1") 722 self.assertEqual(first_span.previous_sibling, None) 723 724 def test_find_previous_sibling(self): 725 self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') 726 727 def test_previous_siblings(self): 728 self.assertSelectsIDs(self.end.find_previous_siblings("span"), 729 ['3', '2', '1']) 730 731 self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) 732 733 def test_previous_sibling_for_text_element(self): 734 soup = self.soup("Foo<b>bar</b>baz") 735 start = soup.find(text="baz") 736 self.assertEqual(start.previous_sibling.name, 'b') 737 self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') 738 739 self.assertSelects(start.find_previous_siblings('b'), ['bar']) 740 self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") 741 self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) 742 743 744class TestTagCreation(SoupTest): 745 """Test the ability to create new tags.""" 746 def test_new_tag(self): 747 soup = self.soup("") 748 new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"}) 749 self.assertTrue(isinstance(new_tag, Tag)) 750 self.assertEqual("foo", new_tag.name) 751 self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs) 752 self.assertEqual(None, new_tag.parent) 753 754 def test_tag_inherits_self_closing_rules_from_builder(self): 755 if XML_BUILDER_PRESENT: 756 xml_soup = BeautifulSoup("", "lxml-xml") 757 xml_br = xml_soup.new_tag("br") 758 xml_p = xml_soup.new_tag("p") 759 760 # Both the <br> and <p> tag are empty-element, just because 761 # they have no contents. 762 self.assertEqual(b"<br/>", xml_br.encode()) 763 self.assertEqual(b"<p/>", xml_p.encode()) 764 765 html_soup = BeautifulSoup("", "html.parser") 766 html_br = html_soup.new_tag("br") 767 html_p = html_soup.new_tag("p") 768 769 # The HTML builder users HTML's rules about which tags are 770 # empty-element tags, and the new tags reflect these rules. 771 self.assertEqual(b"<br/>", html_br.encode()) 772 self.assertEqual(b"<p></p>", html_p.encode()) 773 774 def test_new_string_creates_navigablestring(self): 775 soup = self.soup("") 776 s = soup.new_string("foo") 777 self.assertEqual("foo", s) 778 self.assertTrue(isinstance(s, NavigableString)) 779 780 def test_new_string_can_create_navigablestring_subclass(self): 781 soup = self.soup("") 782 s = soup.new_string("foo", Comment) 783 self.assertEqual("foo", s) 784 self.assertTrue(isinstance(s, Comment)) 785 786class TestTreeModification(SoupTest): 787 788 def test_attribute_modification(self): 789 soup = self.soup('<a id="1"></a>') 790 soup.a['id'] = 2 791 self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>')) 792 del(soup.a['id']) 793 self.assertEqual(soup.decode(), self.document_for('<a></a>')) 794 soup.a['id2'] = 'foo' 795 self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) 796 797 def test_new_tag_creation(self): 798 builder = builder_registry.lookup('html')() 799 soup = self.soup("<body></body>", builder=builder) 800 a = Tag(soup, builder, 'a') 801 ol = Tag(soup, builder, 'ol') 802 a['href'] = 'http://foo.com/' 803 soup.body.insert(0, a) 804 soup.body.insert(1, ol) 805 self.assertEqual( 806 soup.body.encode(), 807 b'<body><a href="http://foo.com/"></a><ol></ol></body>') 808 809 def test_append_to_contents_moves_tag(self): 810 doc = """<p id="1">Don't leave me <b>here</b>.</p> 811 <p id="2">Don\'t leave!</p>""" 812 soup = self.soup(doc) 813 second_para = soup.find(id='2') 814 bold = soup.b 815 816 # Move the <b> tag to the end of the second paragraph. 817 soup.find(id='2').append(soup.b) 818 819 # The <b> tag is now a child of the second paragraph. 820 self.assertEqual(bold.parent, second_para) 821 822 self.assertEqual( 823 soup.decode(), self.document_for( 824 '<p id="1">Don\'t leave me .</p>\n' 825 '<p id="2">Don\'t leave!<b>here</b></p>')) 826 827 def test_replace_with_returns_thing_that_was_replaced(self): 828 text = "<a></a><b><c></c></b>" 829 soup = self.soup(text) 830 a = soup.a 831 new_a = a.replace_with(soup.c) 832 self.assertEqual(a, new_a) 833 834 def test_unwrap_returns_thing_that_was_replaced(self): 835 text = "<a><b></b><c></c></a>" 836 soup = self.soup(text) 837 a = soup.a 838 new_a = a.unwrap() 839 self.assertEqual(a, new_a) 840 841 def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self): 842 soup = self.soup("<a><b>Foo</b></a><c>Bar</c>") 843 a = soup.a 844 a.extract() 845 self.assertEqual(None, a.parent) 846 self.assertRaises(ValueError, a.unwrap) 847 self.assertRaises(ValueError, a.replace_with, soup.c) 848 849 def test_replace_tag_with_itself(self): 850 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" 851 soup = self.soup(text) 852 c = soup.c 853 soup.c.replace_with(c) 854 self.assertEqual(soup.decode(), self.document_for(text)) 855 856 def test_replace_tag_with_its_parent_raises_exception(self): 857 text = "<a><b></b></a>" 858 soup = self.soup(text) 859 self.assertRaises(ValueError, soup.b.replace_with, soup.a) 860 861 def test_insert_tag_into_itself_raises_exception(self): 862 text = "<a><b></b></a>" 863 soup = self.soup(text) 864 self.assertRaises(ValueError, soup.a.insert, 0, soup.a) 865 866 def test_insert_beautifulsoup_object_inserts_children(self): 867 """Inserting one BeautifulSoup object into another actually inserts all 868 of its children -- you'll never combine BeautifulSoup objects. 869 """ 870 soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>") 871 872 text = "<p>p2</p><p>p3</p>" 873 to_insert = self.soup(text) 874 soup.insert(1, to_insert) 875 876 for i in soup.descendants: 877 assert not isinstance(i, BeautifulSoup) 878 879 p1, p2, p3, p4 = list(soup.children) 880 self.assertEqual("And now, a word:", p1.string) 881 self.assertEqual("p2", p2.string) 882 self.assertEqual("p3", p3.string) 883 self.assertEqual("And we're back.", p4.string) 884 885 886 def test_replace_with_maintains_next_element_throughout(self): 887 soup = self.soup('<p><a>one</a><b>three</b></p>') 888 a = soup.a 889 b = a.contents[0] 890 # Make it so the <a> tag has two text children. 891 a.insert(1, "two") 892 893 # Now replace each one with the empty string. 894 left, right = a.contents 895 left.replaceWith('') 896 right.replaceWith('') 897 898 # The <b> tag is still connected to the tree. 899 self.assertEqual("three", soup.b.string) 900 901 def test_replace_final_node(self): 902 soup = self.soup("<b>Argh!</b>") 903 soup.find(text="Argh!").replace_with("Hooray!") 904 new_text = soup.find(text="Hooray!") 905 b = soup.b 906 self.assertEqual(new_text.previous_element, b) 907 self.assertEqual(new_text.parent, b) 908 self.assertEqual(new_text.previous_element.next_element, new_text) 909 self.assertEqual(new_text.next_element, None) 910 911 def test_consecutive_text_nodes(self): 912 # A builder should never create two consecutive text nodes, 913 # but if you insert one next to another, Beautiful Soup will 914 # handle it correctly. 915 soup = self.soup("<a><b>Argh!</b><c></c></a>") 916 soup.b.insert(1, "Hooray!") 917 918 self.assertEqual( 919 soup.decode(), self.document_for( 920 "<a><b>Argh!Hooray!</b><c></c></a>")) 921 922 new_text = soup.find(text="Hooray!") 923 self.assertEqual(new_text.previous_element, "Argh!") 924 self.assertEqual(new_text.previous_element.next_element, new_text) 925 926 self.assertEqual(new_text.previous_sibling, "Argh!") 927 self.assertEqual(new_text.previous_sibling.next_sibling, new_text) 928 929 self.assertEqual(new_text.next_sibling, None) 930 self.assertEqual(new_text.next_element, soup.c) 931 932 def test_insert_string(self): 933 soup = self.soup("<a></a>") 934 soup.a.insert(0, "bar") 935 soup.a.insert(0, "foo") 936 # The string were added to the tag. 937 self.assertEqual(["foo", "bar"], soup.a.contents) 938 # And they were converted to NavigableStrings. 939 self.assertEqual(soup.a.contents[0].next_element, "bar") 940 941 def test_insert_tag(self): 942 builder = self.default_builder() 943 soup = self.soup( 944 "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) 945 magic_tag = Tag(soup, builder, 'magictag') 946 magic_tag.insert(0, "the") 947 soup.a.insert(1, magic_tag) 948 949 self.assertEqual( 950 soup.decode(), self.document_for( 951 "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>")) 952 953 # Make sure all the relationships are hooked up correctly. 954 b_tag = soup.b 955 self.assertEqual(b_tag.next_sibling, magic_tag) 956 self.assertEqual(magic_tag.previous_sibling, b_tag) 957 958 find = b_tag.find(text="Find") 959 self.assertEqual(find.next_element, magic_tag) 960 self.assertEqual(magic_tag.previous_element, find) 961 962 c_tag = soup.c 963 self.assertEqual(magic_tag.next_sibling, c_tag) 964 self.assertEqual(c_tag.previous_sibling, magic_tag) 965 966 the = magic_tag.find(text="the") 967 self.assertEqual(the.parent, magic_tag) 968 self.assertEqual(the.next_element, c_tag) 969 self.assertEqual(c_tag.previous_element, the) 970 971 def test_append_child_thats_already_at_the_end(self): 972 data = "<a><b></b></a>" 973 soup = self.soup(data) 974 soup.a.append(soup.b) 975 self.assertEqual(data, soup.decode()) 976 977 def test_extend(self): 978 data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>" 979 soup = self.soup(data) 980 l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b] 981 soup.a.extend(l) 982 self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode()) 983 984 def test_move_tag_to_beginning_of_parent(self): 985 data = "<a><b></b><c></c><d></d></a>" 986 soup = self.soup(data) 987 soup.a.insert(0, soup.d) 988 self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode()) 989 990 def test_insert_works_on_empty_element_tag(self): 991 # This is a little strange, since most HTML parsers don't allow 992 # markup like this to come through. But in general, we don't 993 # know what the parser would or wouldn't have allowed, so 994 # I'm letting this succeed for now. 995 soup = self.soup("<br/>") 996 soup.br.insert(1, "Contents") 997 self.assertEqual(str(soup.br), "<br>Contents</br>") 998 999 def test_insert_before(self): 1000 soup = self.soup("<a>foo</a><b>bar</b>") 1001 soup.b.insert_before("BAZ") 1002 soup.a.insert_before("QUUX") 1003 self.assertEqual( 1004 soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>")) 1005 1006 soup.a.insert_before(soup.b) 1007 self.assertEqual( 1008 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) 1009 1010 # Can't insert an element before itself. 1011 b = soup.b 1012 self.assertRaises(ValueError, b.insert_before, b) 1013 1014 # Can't insert before if an element has no parent. 1015 b.extract() 1016 self.assertRaises(ValueError, b.insert_before, "nope") 1017 1018 # Can insert an identical element 1019 soup = self.soup("<a>") 1020 soup.a.insert_before(soup.new_tag("a")) 1021 1022 def test_insert_multiple_before(self): 1023 soup = self.soup("<a>foo</a><b>bar</b>") 1024 soup.b.insert_before("BAZ", " ", "QUUX") 1025 soup.a.insert_before("QUUX", " ", "BAZ") 1026 self.assertEqual( 1027 soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>")) 1028 1029 soup.a.insert_before(soup.b, "FOO") 1030 self.assertEqual( 1031 soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX")) 1032 1033 def test_insert_after(self): 1034 soup = self.soup("<a>foo</a><b>bar</b>") 1035 soup.b.insert_after("BAZ") 1036 soup.a.insert_after("QUUX") 1037 self.assertEqual( 1038 soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ")) 1039 soup.b.insert_after(soup.a) 1040 self.assertEqual( 1041 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) 1042 1043 # Can't insert an element after itself. 1044 b = soup.b 1045 self.assertRaises(ValueError, b.insert_after, b) 1046 1047 # Can't insert after if an element has no parent. 1048 b.extract() 1049 self.assertRaises(ValueError, b.insert_after, "nope") 1050 1051 # Can insert an identical element 1052 soup = self.soup("<a>") 1053 soup.a.insert_before(soup.new_tag("a")) 1054 1055 def test_insert_multiple_after(self): 1056 soup = self.soup("<a>foo</a><b>bar</b>") 1057 soup.b.insert_after("BAZ", " ", "QUUX") 1058 soup.a.insert_after("QUUX", " ", "BAZ") 1059 self.assertEqual( 1060 soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX")) 1061 soup.b.insert_after(soup.a, "FOO ") 1062 self.assertEqual( 1063 soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX")) 1064 1065 def test_insert_after_raises_exception_if_after_has_no_meaning(self): 1066 soup = self.soup("") 1067 tag = soup.new_tag("a") 1068 string = soup.new_string("") 1069 self.assertRaises(ValueError, string.insert_after, tag) 1070 self.assertRaises(NotImplementedError, soup.insert_after, tag) 1071 self.assertRaises(ValueError, tag.insert_after, tag) 1072 1073 def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): 1074 soup = self.soup("") 1075 tag = soup.new_tag("a") 1076 string = soup.new_string("") 1077 self.assertRaises(ValueError, string.insert_before, tag) 1078 self.assertRaises(NotImplementedError, soup.insert_before, tag) 1079 self.assertRaises(ValueError, tag.insert_before, tag) 1080 1081 def test_replace_with(self): 1082 soup = self.soup( 1083 "<p>There's <b>no</b> business like <b>show</b> business</p>") 1084 no, show = soup.find_all('b') 1085 show.replace_with(no) 1086 self.assertEqual( 1087 soup.decode(), 1088 self.document_for( 1089 "<p>There's business like <b>no</b> business</p>")) 1090 1091 self.assertEqual(show.parent, None) 1092 self.assertEqual(no.parent, soup.p) 1093 self.assertEqual(no.next_element, "no") 1094 self.assertEqual(no.next_sibling, " business") 1095 1096 def test_replace_first_child(self): 1097 data = "<a><b></b><c></c></a>" 1098 soup = self.soup(data) 1099 soup.b.replace_with(soup.c) 1100 self.assertEqual("<a><c></c></a>", soup.decode()) 1101 1102 def test_replace_last_child(self): 1103 data = "<a><b></b><c></c></a>" 1104 soup = self.soup(data) 1105 soup.c.replace_with(soup.b) 1106 self.assertEqual("<a><b></b></a>", soup.decode()) 1107 1108 def test_nested_tag_replace_with(self): 1109 soup = self.soup( 1110 """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") 1111 1112 # Replace the entire <b> tag and its contents ("reserve the 1113 # right") with the <f> tag ("refuse"). 1114 remove_tag = soup.b 1115 move_tag = soup.f 1116 remove_tag.replace_with(move_tag) 1117 1118 self.assertEqual( 1119 soup.decode(), self.document_for( 1120 "<a>We<f>refuse</f></a><e>to<g>service</g></e>")) 1121 1122 # The <b> tag is now an orphan. 1123 self.assertEqual(remove_tag.parent, None) 1124 self.assertEqual(remove_tag.find(text="right").next_element, None) 1125 self.assertEqual(remove_tag.previous_element, None) 1126 self.assertEqual(remove_tag.next_sibling, None) 1127 self.assertEqual(remove_tag.previous_sibling, None) 1128 1129 # The <f> tag is now connected to the <a> tag. 1130 self.assertEqual(move_tag.parent, soup.a) 1131 self.assertEqual(move_tag.previous_element, "We") 1132 self.assertEqual(move_tag.next_element.next_element, soup.e) 1133 self.assertEqual(move_tag.next_sibling, None) 1134 1135 # The gap where the <f> tag used to be has been mended, and 1136 # the word "to" is now connected to the <g> tag. 1137 to_text = soup.find(text="to") 1138 g_tag = soup.g 1139 self.assertEqual(to_text.next_element, g_tag) 1140 self.assertEqual(to_text.next_sibling, g_tag) 1141 self.assertEqual(g_tag.previous_element, to_text) 1142 self.assertEqual(g_tag.previous_sibling, to_text) 1143 1144 def test_unwrap(self): 1145 tree = self.soup(""" 1146 <p>Unneeded <em>formatting</em> is unneeded</p> 1147 """) 1148 tree.em.unwrap() 1149 self.assertEqual(tree.em, None) 1150 self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") 1151 1152 def test_wrap(self): 1153 soup = self.soup("I wish I was bold.") 1154 value = soup.string.wrap(soup.new_tag("b")) 1155 self.assertEqual(value.decode(), "<b>I wish I was bold.</b>") 1156 self.assertEqual( 1157 soup.decode(), self.document_for("<b>I wish I was bold.</b>")) 1158 1159 def test_wrap_extracts_tag_from_elsewhere(self): 1160 soup = self.soup("<b></b>I wish I was bold.") 1161 soup.b.next_sibling.wrap(soup.b) 1162 self.assertEqual( 1163 soup.decode(), self.document_for("<b>I wish I was bold.</b>")) 1164 1165 def test_wrap_puts_new_contents_at_the_end(self): 1166 soup = self.soup("<b>I like being bold.</b>I wish I was bold.") 1167 soup.b.next_sibling.wrap(soup.b) 1168 self.assertEqual(2, len(soup.b.contents)) 1169 self.assertEqual( 1170 soup.decode(), self.document_for( 1171 "<b>I like being bold.I wish I was bold.</b>")) 1172 1173 def test_extract(self): 1174 soup = self.soup( 1175 '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>') 1176 1177 self.assertEqual(len(soup.body.contents), 3) 1178 extracted = soup.find(id="nav").extract() 1179 1180 self.assertEqual( 1181 soup.decode(), "<html><body>Some content. More content.</body></html>") 1182 self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>') 1183 1184 # The extracted tag is now an orphan. 1185 self.assertEqual(len(soup.body.contents), 2) 1186 self.assertEqual(extracted.parent, None) 1187 self.assertEqual(extracted.previous_element, None) 1188 self.assertEqual(extracted.next_element.next_element, None) 1189 1190 # The gap where the extracted tag used to be has been mended. 1191 content_1 = soup.find(text="Some content. ") 1192 content_2 = soup.find(text=" More content.") 1193 self.assertEqual(content_1.next_element, content_2) 1194 self.assertEqual(content_1.next_sibling, content_2) 1195 self.assertEqual(content_2.previous_element, content_1) 1196 self.assertEqual(content_2.previous_sibling, content_1) 1197 1198 def test_extract_distinguishes_between_identical_strings(self): 1199 soup = self.soup("<a>foo</a><b>bar</b>") 1200 foo_1 = soup.a.string 1201 bar_1 = soup.b.string 1202 foo_2 = soup.new_string("foo") 1203 bar_2 = soup.new_string("bar") 1204 soup.a.append(foo_2) 1205 soup.b.append(bar_2) 1206 1207 # Now there are two identical strings in the <a> tag, and two 1208 # in the <b> tag. Let's remove the first "foo" and the second 1209 # "bar". 1210 foo_1.extract() 1211 bar_2.extract() 1212 self.assertEqual(foo_2, soup.a.string) 1213 self.assertEqual(bar_2, soup.b.string) 1214 1215 def test_extract_multiples_of_same_tag(self): 1216 soup = self.soup(""" 1217<html> 1218<head> 1219<script>foo</script> 1220</head> 1221<body> 1222 <script>bar</script> 1223 <a></a> 1224</body> 1225<script>baz</script> 1226</html>""") 1227 [soup.script.extract() for i in soup.find_all("script")] 1228 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body)) 1229 1230 1231 def test_extract_works_when_element_is_surrounded_by_identical_strings(self): 1232 soup = self.soup( 1233 '<html>\n' 1234 '<body>hi</body>\n' 1235 '</html>') 1236 soup.find('body').extract() 1237 self.assertEqual(None, soup.find('body')) 1238 1239 1240 def test_clear(self): 1241 """Tag.clear()""" 1242 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") 1243 # clear using extract() 1244 a = soup.a 1245 soup.p.clear() 1246 self.assertEqual(len(soup.p.contents), 0) 1247 self.assertTrue(hasattr(a, "contents")) 1248 1249 # clear using decompose() 1250 em = a.em 1251 a.clear(decompose=True) 1252 self.assertEqual(0, len(em.contents)) 1253 1254 def test_string_set(self): 1255 """Tag.string = 'string'""" 1256 soup = self.soup("<a></a> <b><c></c></b>") 1257 soup.a.string = "foo" 1258 self.assertEqual(soup.a.contents, ["foo"]) 1259 soup.b.string = "bar" 1260 self.assertEqual(soup.b.contents, ["bar"]) 1261 1262 def test_string_set_does_not_affect_original_string(self): 1263 soup = self.soup("<a><b>foo</b><c>bar</c>") 1264 soup.b.string = soup.c.string 1265 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>") 1266 1267 def test_set_string_preserves_class_of_string(self): 1268 soup = self.soup("<a></a>") 1269 cdata = CData("foo") 1270 soup.a.string = cdata 1271 self.assertTrue(isinstance(soup.a.string, CData)) 1272 1273class TestElementObjects(SoupTest): 1274 """Test various features of element objects.""" 1275 1276 def test_len(self): 1277 """The length of an element is its number of children.""" 1278 soup = self.soup("<top>1<b>2</b>3</top>") 1279 1280 # The BeautifulSoup object itself contains one element: the 1281 # <top> tag. 1282 self.assertEqual(len(soup.contents), 1) 1283 self.assertEqual(len(soup), 1) 1284 1285 # The <top> tag contains three elements: the text node "1", the 1286 # <b> tag, and the text node "3". 1287 self.assertEqual(len(soup.top), 3) 1288 self.assertEqual(len(soup.top.contents), 3) 1289 1290 def test_member_access_invokes_find(self): 1291 """Accessing a Python member .foo invokes find('foo')""" 1292 soup = self.soup('<b><i></i></b>') 1293 self.assertEqual(soup.b, soup.find('b')) 1294 self.assertEqual(soup.b.i, soup.find('b').find('i')) 1295 self.assertEqual(soup.a, None) 1296 1297 def test_deprecated_member_access(self): 1298 soup = self.soup('<b><i></i></b>') 1299 with warnings.catch_warnings(record=True) as w: 1300 tag = soup.bTag 1301 self.assertEqual(soup.b, tag) 1302 self.assertEqual( 1303 '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', 1304 str(w[0].message)) 1305 1306 def test_has_attr(self): 1307 """has_attr() checks for the presence of an attribute. 1308 1309 Please note note: has_attr() is different from 1310 __in__. has_attr() checks the tag's attributes and __in__ 1311 checks the tag's chidlren. 1312 """ 1313 soup = self.soup("<foo attr='bar'>") 1314 self.assertTrue(soup.foo.has_attr('attr')) 1315 self.assertFalse(soup.foo.has_attr('attr2')) 1316 1317 1318 def test_attributes_come_out_in_alphabetical_order(self): 1319 markup = '<b a="1" z="5" m="3" f="2" y="4"></b>' 1320 self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>') 1321 1322 def test_string(self): 1323 # A tag that contains only a text node makes that node 1324 # available as .string. 1325 soup = self.soup("<b>foo</b>") 1326 self.assertEqual(soup.b.string, 'foo') 1327 1328 def test_empty_tag_has_no_string(self): 1329 # A tag with no children has no .stirng. 1330 soup = self.soup("<b></b>") 1331 self.assertEqual(soup.b.string, None) 1332 1333 def test_tag_with_multiple_children_has_no_string(self): 1334 # A tag with no children has no .string. 1335 soup = self.soup("<a>foo<b></b><b></b></b>") 1336 self.assertEqual(soup.b.string, None) 1337 1338 soup = self.soup("<a>foo<b></b>bar</b>") 1339 self.assertEqual(soup.b.string, None) 1340 1341 # Even if all the children are strings, due to trickery, 1342 # it won't work--but this would be a good optimization. 1343 soup = self.soup("<a>foo</b>") 1344 soup.a.insert(1, "bar") 1345 self.assertEqual(soup.a.string, None) 1346 1347 def test_tag_with_recursive_string_has_string(self): 1348 # A tag with a single child which has a .string inherits that 1349 # .string. 1350 soup = self.soup("<a><b>foo</b></a>") 1351 self.assertEqual(soup.a.string, "foo") 1352 self.assertEqual(soup.string, "foo") 1353 1354 def test_lack_of_string(self): 1355 """Only a tag containing a single text node has a .string.""" 1356 soup = self.soup("<b>f<i>e</i>o</b>") 1357 self.assertFalse(soup.b.string) 1358 1359 soup = self.soup("<b></b>") 1360 self.assertFalse(soup.b.string) 1361 1362 def test_all_text(self): 1363 """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" 1364 soup = self.soup("<a>a<b>r</b> <r> t </r></a>") 1365 self.assertEqual(soup.a.text, "ar t ") 1366 self.assertEqual(soup.a.get_text(strip=True), "art") 1367 self.assertEqual(soup.a.get_text(","), "a,r, , t ") 1368 self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") 1369 1370 def test_get_text_ignores_comments(self): 1371 soup = self.soup("foo<!--IGNORE-->bar") 1372 self.assertEqual(soup.get_text(), "foobar") 1373 1374 self.assertEqual( 1375 soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") 1376 self.assertEqual( 1377 soup.get_text(types=None), "fooIGNOREbar") 1378 1379 def test_all_strings_ignores_comments(self): 1380 soup = self.soup("foo<!--IGNORE-->bar") 1381 self.assertEqual(['foo', 'bar'], list(soup.strings)) 1382 1383class TestCDAtaListAttributes(SoupTest): 1384 1385 """Testing cdata-list attributes like 'class'. 1386 """ 1387 def test_single_value_becomes_list(self): 1388 soup = self.soup("<a class='foo'>") 1389 self.assertEqual(["foo"],soup.a['class']) 1390 1391 def test_multiple_values_becomes_list(self): 1392 soup = self.soup("<a class='foo bar'>") 1393 self.assertEqual(["foo", "bar"], soup.a['class']) 1394 1395 def test_multiple_values_separated_by_weird_whitespace(self): 1396 soup = self.soup("<a class='foo\tbar\nbaz'>") 1397 self.assertEqual(["foo", "bar", "baz"],soup.a['class']) 1398 1399 def test_attributes_joined_into_string_on_output(self): 1400 soup = self.soup("<a class='foo\tbar'>") 1401 self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) 1402 1403 def test_get_attribute_list(self): 1404 soup = self.soup("<a id='abc def'>") 1405 self.assertEqual(['abc def'], soup.a.get_attribute_list('id')) 1406 1407 def test_accept_charset(self): 1408 soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') 1409 self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) 1410 1411 def test_cdata_attribute_applying_only_to_one_tag(self): 1412 data = '<a accept-charset="ISO-8859-1 UTF-8"></a>' 1413 soup = self.soup(data) 1414 # We saw in another test that accept-charset is a cdata-list 1415 # attribute for the <form> tag. But it's not a cdata-list 1416 # attribute for any other tag. 1417 self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) 1418 1419 def test_string_has_immutable_name_property(self): 1420 string = self.soup("s").string 1421 self.assertEqual(None, string.name) 1422 def t(): 1423 string.name = 'foo' 1424 self.assertRaises(AttributeError, t) 1425 1426class TestPersistence(SoupTest): 1427 "Testing features like pickle and deepcopy." 1428 1429 def setUp(self): 1430 super(TestPersistence, self).setUp() 1431 self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" 1432"http://www.w3.org/TR/REC-html40/transitional.dtd"> 1433<html> 1434<head> 1435<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 1436<title>Beautiful Soup: We called him Tortoise because he taught us.</title> 1437<link rev="made" href="mailto:leonardr@segfault.org"> 1438<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> 1439<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> 1440<meta name="author" content="Leonard Richardson"> 1441</head> 1442<body> 1443<a href="foo">foo</a> 1444<a href="foo"><b>bar</b></a> 1445</body> 1446</html>""" 1447 self.tree = self.soup(self.page) 1448 1449 def test_pickle_and_unpickle_identity(self): 1450 # Pickling a tree, then unpickling it, yields a tree identical 1451 # to the original. 1452 dumped = pickle.dumps(self.tree, 2) 1453 loaded = pickle.loads(dumped) 1454 self.assertEqual(loaded.__class__, BeautifulSoup) 1455 self.assertEqual(loaded.decode(), self.tree.decode()) 1456 1457 def test_deepcopy_identity(self): 1458 # Making a deepcopy of a tree yields an identical tree. 1459 copied = copy.deepcopy(self.tree) 1460 self.assertEqual(copied.decode(), self.tree.decode()) 1461 1462 def test_copy_preserves_encoding(self): 1463 soup = BeautifulSoup(b'<p> </p>', 'html.parser') 1464 encoding = soup.original_encoding 1465 copy = soup.__copy__() 1466 self.assertEqual("<p> </p>", str(copy)) 1467 self.assertEqual(encoding, copy.original_encoding) 1468 1469 def test_unicode_pickle(self): 1470 # A tree containing Unicode characters can be pickled. 1471 html = "<b>\N{SNOWMAN}</b>" 1472 soup = self.soup(html) 1473 dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) 1474 loaded = pickle.loads(dumped) 1475 self.assertEqual(loaded.decode(), soup.decode()) 1476 1477 def test_copy_navigablestring_is_not_attached_to_tree(self): 1478 html = "<b>Foo<a></a></b><b>Bar</b>" 1479 soup = self.soup(html) 1480 s1 = soup.find(string="Foo") 1481 s2 = copy.copy(s1) 1482 self.assertEqual(s1, s2) 1483 self.assertEqual(None, s2.parent) 1484 self.assertEqual(None, s2.next_element) 1485 self.assertNotEqual(None, s1.next_sibling) 1486 self.assertEqual(None, s2.next_sibling) 1487 self.assertEqual(None, s2.previous_element) 1488 1489 def test_copy_navigablestring_subclass_has_same_type(self): 1490 html = "<b><!--Foo--></b>" 1491 soup = self.soup(html) 1492 s1 = soup.string 1493 s2 = copy.copy(s1) 1494 self.assertEqual(s1, s2) 1495 self.assertTrue(isinstance(s2, Comment)) 1496 1497 def test_copy_entire_soup(self): 1498 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" 1499 soup = self.soup(html) 1500 soup_copy = copy.copy(soup) 1501 self.assertEqual(soup, soup_copy) 1502 1503 def test_copy_tag_copies_contents(self): 1504 html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" 1505 soup = self.soup(html) 1506 div = soup.div 1507 div_copy = copy.copy(div) 1508 1509 # The two tags look the same, and evaluate to equal. 1510 self.assertEqual(str(div), str(div_copy)) 1511 self.assertEqual(div, div_copy) 1512 1513 # But they're not the same object. 1514 self.assertFalse(div is div_copy) 1515 1516 # And they don't have the same relation to the parse tree. The 1517 # copy is not associated with a parse tree at all. 1518 self.assertEqual(None, div_copy.parent) 1519 self.assertEqual(None, div_copy.previous_element) 1520 self.assertEqual(None, div_copy.find(string='Bar').next_element) 1521 self.assertNotEqual(None, div.find(string='Bar').next_element) 1522 1523class TestSubstitutions(SoupTest): 1524 1525 def test_default_formatter_is_minimal(self): 1526 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1527 soup = self.soup(markup) 1528 decoded = soup.decode(formatter="minimal") 1529 # The < is converted back into < but the e-with-acute is left alone. 1530 self.assertEqual( 1531 decoded, 1532 self.document_for( 1533 "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1534 1535 def test_formatter_html(self): 1536 markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1537 soup = self.soup(markup) 1538 decoded = soup.decode(formatter="html") 1539 self.assertEqual( 1540 decoded, 1541 self.document_for("<br/><b><<Sacré bleu!>></b>")) 1542 1543 def test_formatter_html5(self): 1544 markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1545 soup = self.soup(markup) 1546 decoded = soup.decode(formatter="html5") 1547 self.assertEqual( 1548 decoded, 1549 self.document_for("<br><b><<Sacré bleu!>></b>")) 1550 1551 def test_formatter_minimal(self): 1552 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1553 soup = self.soup(markup) 1554 decoded = soup.decode(formatter="minimal") 1555 # The < is converted back into < but the e-with-acute is left alone. 1556 self.assertEqual( 1557 decoded, 1558 self.document_for( 1559 "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1560 1561 def test_formatter_null(self): 1562 markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" 1563 soup = self.soup(markup) 1564 decoded = soup.decode(formatter=None) 1565 # Neither the angle brackets nor the e-with-acute are converted. 1566 # This is not valid HTML, but it's what the user wanted. 1567 self.assertEqual(decoded, 1568 self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) 1569 1570 def test_formatter_custom(self): 1571 markup = "<b><foo></b><b>bar</b><br/>" 1572 soup = self.soup(markup) 1573 decoded = soup.decode(formatter = lambda x: x.upper()) 1574 # Instead of normal entity conversion code, the custom 1575 # callable is called on every string. 1576 self.assertEqual( 1577 decoded, 1578 self.document_for("<b><FOO></b><b>BAR</b><br/>")) 1579 1580 def test_formatter_is_run_on_attribute_values(self): 1581 markup = '<a href="http://a.com?a=b&c=é">e</a>' 1582 soup = self.soup(markup) 1583 a = soup.a 1584 1585 expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' 1586 1587 self.assertEqual(expect_minimal, a.decode()) 1588 self.assertEqual(expect_minimal, a.decode(formatter="minimal")) 1589 1590 expect_html = '<a href="http://a.com?a=b&c=é">e</a>' 1591 self.assertEqual(expect_html, a.decode(formatter="html")) 1592 1593 self.assertEqual(markup, a.decode(formatter=None)) 1594 expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' 1595 self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) 1596 1597 def test_formatter_skips_script_tag_for_html_documents(self): 1598 doc = """ 1599 <script type="text/javascript"> 1600 console.log("< < hey > > "); 1601 </script> 1602""" 1603 encoded = BeautifulSoup(doc, 'html.parser').encode() 1604 self.assertTrue(b"< < hey > >" in encoded) 1605 1606 def test_formatter_skips_style_tag_for_html_documents(self): 1607 doc = """ 1608 <style type="text/css"> 1609 console.log("< < hey > > "); 1610 </style> 1611""" 1612 encoded = BeautifulSoup(doc, 'html.parser').encode() 1613 self.assertTrue(b"< < hey > >" in encoded) 1614 1615 def test_prettify_leaves_preformatted_text_alone(self): 1616 soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>") 1617 # Everything outside the <pre> tag is reformatted, but everything 1618 # inside is left alone. 1619 self.assertEqual( 1620 '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>', 1621 soup.div.prettify()) 1622 1623 def test_prettify_accepts_formatter_function(self): 1624 soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') 1625 pretty = soup.prettify(formatter = lambda x: x.upper()) 1626 self.assertTrue("FOO" in pretty) 1627 1628 def test_prettify_outputs_unicode_by_default(self): 1629 soup = self.soup("<a></a>") 1630 self.assertEqual(str, type(soup.prettify())) 1631 1632 def test_prettify_can_encode_data(self): 1633 soup = self.soup("<a></a>") 1634 self.assertEqual(bytes, type(soup.prettify("utf-8"))) 1635 1636 def test_html_entity_substitution_off_by_default(self): 1637 markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" 1638 soup = self.soup(markup) 1639 encoded = soup.b.encode("utf-8") 1640 self.assertEqual(encoded, markup.encode('utf-8')) 1641 1642 def test_encoding_substitution(self): 1643 # Here's the <meta> tag saying that a document is 1644 # encoded in Shift-JIS. 1645 meta_tag = ('<meta content="text/html; charset=x-sjis" ' 1646 'http-equiv="Content-type"/>') 1647 soup = self.soup(meta_tag) 1648 1649 # Parse the document, and the charset apprears unchanged. 1650 self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') 1651 1652 # Encode the document into some encoding, and the encoding is 1653 # substituted into the meta tag. 1654 utf_8 = soup.encode("utf-8") 1655 self.assertTrue(b"charset=utf-8" in utf_8) 1656 1657 euc_jp = soup.encode("euc_jp") 1658 self.assertTrue(b"charset=euc_jp" in euc_jp) 1659 1660 shift_jis = soup.encode("shift-jis") 1661 self.assertTrue(b"charset=shift-jis" in shift_jis) 1662 1663 utf_16_u = soup.encode("utf-16").decode("utf-16") 1664 self.assertTrue("charset=utf-16" in utf_16_u) 1665 1666 def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): 1667 markup = ('<head><meta content="text/html; charset=x-sjis" ' 1668 'http-equiv="Content-type"/></head><pre>foo</pre>') 1669 1670 # Beautiful Soup used to try to rewrite the meta tag even if the 1671 # meta tag got filtered out by the strainer. This test makes 1672 # sure that doesn't happen. 1673 strainer = SoupStrainer('pre') 1674 soup = self.soup(markup, parse_only=strainer) 1675 self.assertEqual(soup.contents[0].name, 'pre') 1676 1677class TestEncoding(SoupTest): 1678 """Test the ability to encode objects into strings.""" 1679 1680 def test_unicode_string_can_be_encoded(self): 1681 html = "<b>\N{SNOWMAN}</b>" 1682 soup = self.soup(html) 1683 self.assertEqual(soup.b.string.encode("utf-8"), 1684 "\N{SNOWMAN}".encode("utf-8")) 1685 1686 def test_tag_containing_unicode_string_can_be_encoded(self): 1687 html = "<b>\N{SNOWMAN}</b>" 1688 soup = self.soup(html) 1689 self.assertEqual( 1690 soup.b.encode("utf-8"), html.encode("utf-8")) 1691 1692 def test_encoding_substitutes_unrecognized_characters_by_default(self): 1693 html = "<b>\N{SNOWMAN}</b>" 1694 soup = self.soup(html) 1695 self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") 1696 1697 def test_encoding_can_be_made_strict(self): 1698 html = "<b>\N{SNOWMAN}</b>" 1699 soup = self.soup(html) 1700 self.assertRaises( 1701 UnicodeEncodeError, soup.encode, "ascii", errors="strict") 1702 1703 def test_decode_contents(self): 1704 html = "<b>\N{SNOWMAN}</b>" 1705 soup = self.soup(html) 1706 self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) 1707 1708 def test_encode_contents(self): 1709 html = "<b>\N{SNOWMAN}</b>" 1710 soup = self.soup(html) 1711 self.assertEqual( 1712 "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( 1713 encoding="utf8")) 1714 1715 def test_deprecated_renderContents(self): 1716 html = "<b>\N{SNOWMAN}</b>" 1717 soup = self.soup(html) 1718 self.assertEqual( 1719 "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) 1720 1721 def test_repr(self): 1722 html = "<b>\N{SNOWMAN}</b>" 1723 soup = self.soup(html) 1724 if PY3K: 1725 self.assertEqual(html, repr(soup)) 1726 else: 1727 self.assertEqual(b'<b>\\u2603</b>', repr(soup)) 1728 1729class TestFormatter(SoupTest): 1730 1731 def test_sort_attributes(self): 1732 # Test the ability to override Formatter.attributes() to, 1733 # e.g., disable the normal sorting of attributes. 1734 class UnsortedFormatter(Formatter): 1735 def attributes(self, tag): 1736 self.called_with = tag 1737 for k, v in sorted(tag.attrs.items()): 1738 if k == 'ignore': 1739 continue 1740 yield k,v 1741 1742 soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') 1743 formatter = UnsortedFormatter() 1744 decoded = soup.decode(formatter=formatter) 1745 1746 # attributes() was called on the <p> tag. It filtered out one 1747 # attribute and sorted the other two. 1748 self.assertEqual(formatter.called_with, soup.p) 1749 self.assertEqual('<p aval="2" cval="1"></p>', decoded) 1750 1751 1752class TestNavigableStringSubclasses(SoupTest): 1753 1754 def test_cdata(self): 1755 # None of the current builders turn CDATA sections into CData 1756 # objects, but you can create them manually. 1757 soup = self.soup("") 1758 cdata = CData("foo") 1759 soup.insert(1, cdata) 1760 self.assertEqual(str(soup), "<![CDATA[foo]]>") 1761 self.assertEqual(soup.find(text="foo"), "foo") 1762 self.assertEqual(soup.contents[0], "foo") 1763 1764 def test_cdata_is_never_formatted(self): 1765 """Text inside a CData object is passed into the formatter. 1766 1767 But the return value is ignored. 1768 """ 1769 1770 self.count = 0 1771 def increment(*args): 1772 self.count += 1 1773 return "BITTER FAILURE" 1774 1775 soup = self.soup("") 1776 cdata = CData("<><><>") 1777 soup.insert(1, cdata) 1778 self.assertEqual( 1779 b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) 1780 self.assertEqual(1, self.count) 1781 1782 def test_doctype_ends_in_newline(self): 1783 # Unlike other NavigableString subclasses, a DOCTYPE always ends 1784 # in a newline. 1785 doctype = Doctype("foo") 1786 soup = self.soup("") 1787 soup.insert(1, doctype) 1788 self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") 1789 1790 def test_declaration(self): 1791 d = Declaration("foo") 1792 self.assertEqual("<?foo?>", d.output_ready()) 1793 1794class TestSoupSelector(TreeTest): 1795 1796 HTML = """ 1797<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" 1798"http://www.w3.org/TR/html4/strict.dtd"> 1799<html> 1800<head> 1801<title>The title</title> 1802<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> 1803</head> 1804<body> 1805<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> 1806<div id="main" class="fancy"> 1807<div id="inner"> 1808<h1 id="header1">An H1</h1> 1809<p>Some text</p> 1810<p class="onep" id="p1">Some more text</p> 1811<h2 id="header2">An H2</h2> 1812<p class="class1 class2 class3" id="pmulti">Another</p> 1813<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> 1814<h2 id="header3">Another H2</h2> 1815<a id="me" href="http://simonwillison.net/" rel="me">me</a> 1816<span class="s1"> 1817<a href="#" id="s1a1">span1a1</a> 1818<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> 1819<span class="span2"> 1820<a href="#" id="s2a1">span2a1</a> 1821</span> 1822<span class="span3"></span> 1823<custom-dashed-tag class="dashed" id="dash2"/> 1824<div data-tag="dashedvalue" id="data1"/> 1825</span> 1826</div> 1827<x id="xid"> 1828<z id="zida"/> 1829<z id="zidab"/> 1830<z id="zidac"/> 1831</x> 1832<y id="yid"> 1833<z id="zidb"/> 1834</y> 1835<p lang="en" id="lang-en">English</p> 1836<p lang="en-gb" id="lang-en-gb">English UK</p> 1837<p lang="en-us" id="lang-en-us">English US</p> 1838<p lang="fr" id="lang-fr">French</p> 1839</div> 1840 1841<div id="footer"> 1842</div> 1843""" 1844 1845 def setUp(self): 1846 self.soup = BeautifulSoup(self.HTML, 'html.parser') 1847 1848 def assertSelects(self, selector, expected_ids, **kwargs): 1849 el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] 1850 el_ids.sort() 1851 expected_ids.sort() 1852 self.assertEqual(expected_ids, el_ids, 1853 "Selector %s, expected [%s], got [%s]" % ( 1854 selector, ', '.join(expected_ids), ', '.join(el_ids) 1855 ) 1856 ) 1857 1858 assertSelect = assertSelects 1859 1860 def assertSelectMultiple(self, *tests): 1861 for selector, expected_ids in tests: 1862 self.assertSelect(selector, expected_ids) 1863 1864 def test_one_tag_one(self): 1865 els = self.soup.select('title') 1866 self.assertEqual(len(els), 1) 1867 self.assertEqual(els[0].name, 'title') 1868 self.assertEqual(els[0].contents, ['The title']) 1869 1870 def test_one_tag_many(self): 1871 els = self.soup.select('div') 1872 self.assertEqual(len(els), 4) 1873 for div in els: 1874 self.assertEqual(div.name, 'div') 1875 1876 el = self.soup.select_one('div') 1877 self.assertEqual('main', el['id']) 1878 1879 def test_select_one_returns_none_if_no_match(self): 1880 match = self.soup.select_one('nonexistenttag') 1881 self.assertEqual(None, match) 1882 1883 1884 def test_tag_in_tag_one(self): 1885 els = self.soup.select('div div') 1886 self.assertSelects('div div', ['inner', 'data1']) 1887 1888 def test_tag_in_tag_many(self): 1889 for selector in ('html div', 'html body div', 'body div'): 1890 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) 1891 1892 1893 def test_limit(self): 1894 self.assertSelects('html div', ['main'], limit=1) 1895 self.assertSelects('html body div', ['inner', 'main'], limit=2) 1896 self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'], 1897 limit=10) 1898 1899 def test_tag_no_match(self): 1900 self.assertEqual(len(self.soup.select('del')), 0) 1901 1902 def test_invalid_tag(self): 1903 self.assertRaises(SyntaxError, self.soup.select, 'tag%t') 1904 1905 def test_select_dashed_tag_ids(self): 1906 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) 1907 1908 def test_select_dashed_by_id(self): 1909 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') 1910 self.assertEqual(dashed[0].name, 'custom-dashed-tag') 1911 self.assertEqual(dashed[0]['id'], 'dash2') 1912 1913 def test_dashed_tag_text(self): 1914 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') 1915 1916 def test_select_dashed_matches_find_all(self): 1917 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) 1918 1919 def test_header_tags(self): 1920 self.assertSelectMultiple( 1921 ('h1', ['header1']), 1922 ('h2', ['header2', 'header3']), 1923 ) 1924 1925 def test_class_one(self): 1926 for selector in ('.onep', 'p.onep', 'html p.onep'): 1927 els = self.soup.select(selector) 1928 self.assertEqual(len(els), 1) 1929 self.assertEqual(els[0].name, 'p') 1930 self.assertEqual(els[0]['class'], ['onep']) 1931 1932 def test_class_mismatched_tag(self): 1933 els = self.soup.select('div.onep') 1934 self.assertEqual(len(els), 0) 1935 1936 def test_one_id(self): 1937 for selector in ('div#inner', '#inner', 'div div#inner'): 1938 self.assertSelects(selector, ['inner']) 1939 1940 def test_bad_id(self): 1941 els = self.soup.select('#doesnotexist') 1942 self.assertEqual(len(els), 0) 1943 1944 def test_items_in_id(self): 1945 els = self.soup.select('div#inner p') 1946 self.assertEqual(len(els), 3) 1947 for el in els: 1948 self.assertEqual(el.name, 'p') 1949 self.assertEqual(els[1]['class'], ['onep']) 1950 self.assertFalse(els[0].has_attr('class')) 1951 1952 def test_a_bunch_of_emptys(self): 1953 for selector in ('div#main del', 'div#main div.oops', 'div div#main'): 1954 self.assertEqual(len(self.soup.select(selector)), 0) 1955 1956 def test_multi_class_support(self): 1957 for selector in ('.class1', 'p.class1', '.class2', 'p.class2', 1958 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): 1959 self.assertSelects(selector, ['pmulti']) 1960 1961 def test_multi_class_selection(self): 1962 for selector in ('.class1.class3', '.class3.class2', 1963 '.class1.class2.class3'): 1964 self.assertSelects(selector, ['pmulti']) 1965 1966 def test_child_selector(self): 1967 self.assertSelects('.s1 > a', ['s1a1', 's1a2']) 1968 self.assertSelects('.s1 > a span', ['s1a2s1']) 1969 1970 def test_child_selector_id(self): 1971 self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) 1972 1973 def test_attribute_equals(self): 1974 self.assertSelectMultiple( 1975 ('p[class="onep"]', ['p1']), 1976 ('p[id="p1"]', ['p1']), 1977 ('[class="onep"]', ['p1']), 1978 ('[id="p1"]', ['p1']), 1979 ('link[rel="stylesheet"]', ['l1']), 1980 ('link[type="text/css"]', ['l1']), 1981 ('link[href="blah.css"]', ['l1']), 1982 ('link[href="no-blah.css"]', []), 1983 ('[rel="stylesheet"]', ['l1']), 1984 ('[type="text/css"]', ['l1']), 1985 ('[href="blah.css"]', ['l1']), 1986 ('[href="no-blah.css"]', []), 1987 ('p[href="no-blah.css"]', []), 1988 ('[href="no-blah.css"]', []), 1989 ) 1990 1991 def test_attribute_tilde(self): 1992 self.assertSelectMultiple( 1993 ('p[class~="class1"]', ['pmulti']), 1994 ('p[class~="class2"]', ['pmulti']), 1995 ('p[class~="class3"]', ['pmulti']), 1996 ('[class~="class1"]', ['pmulti']), 1997 ('[class~="class2"]', ['pmulti']), 1998 ('[class~="class3"]', ['pmulti']), 1999 ('a[rel~="friend"]', ['bob']), 2000 ('a[rel~="met"]', ['bob']), 2001 ('[rel~="friend"]', ['bob']), 2002 ('[rel~="met"]', ['bob']), 2003 ) 2004 2005 def test_attribute_startswith(self): 2006 self.assertSelectMultiple( 2007 ('[rel^="style"]', ['l1']), 2008 ('link[rel^="style"]', ['l1']), 2009 ('notlink[rel^="notstyle"]', []), 2010 ('[rel^="notstyle"]', []), 2011 ('link[rel^="notstyle"]', []), 2012 ('link[href^="bla"]', ['l1']), 2013 ('a[href^="http://"]', ['bob', 'me']), 2014 ('[href^="http://"]', ['bob', 'me']), 2015 ('[id^="p"]', ['pmulti', 'p1']), 2016 ('[id^="m"]', ['me', 'main']), 2017 ('div[id^="m"]', ['main']), 2018 ('a[id^="m"]', ['me']), 2019 ('div[data-tag^="dashed"]', ['data1']) 2020 ) 2021 2022 def test_attribute_endswith(self): 2023 self.assertSelectMultiple( 2024 ('[href$=".css"]', ['l1']), 2025 ('link[href$=".css"]', ['l1']), 2026 ('link[id$="1"]', ['l1']), 2027 ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), 2028 ('div[id$="1"]', ['data1']), 2029 ('[id$="noending"]', []), 2030 ) 2031 2032 def test_attribute_contains(self): 2033 self.assertSelectMultiple( 2034 # From test_attribute_startswith 2035 ('[rel*="style"]', ['l1']), 2036 ('link[rel*="style"]', ['l1']), 2037 ('notlink[rel*="notstyle"]', []), 2038 ('[rel*="notstyle"]', []), 2039 ('link[rel*="notstyle"]', []), 2040 ('link[href*="bla"]', ['l1']), 2041 ('[href*="http://"]', ['bob', 'me']), 2042 ('[id*="p"]', ['pmulti', 'p1']), 2043 ('div[id*="m"]', ['main']), 2044 ('a[id*="m"]', ['me']), 2045 # From test_attribute_endswith 2046 ('[href*=".css"]', ['l1']), 2047 ('link[href*=".css"]', ['l1']), 2048 ('link[id*="1"]', ['l1']), 2049 ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), 2050 ('div[id*="1"]', ['data1']), 2051 ('[id*="noending"]', []), 2052 # New for this test 2053 ('[href*="."]', ['bob', 'me', 'l1']), 2054 ('a[href*="."]', ['bob', 'me']), 2055 ('link[href*="."]', ['l1']), 2056 ('div[id*="n"]', ['main', 'inner']), 2057 ('div[id*="nn"]', ['inner']), 2058 ('div[data-tag*="edval"]', ['data1']) 2059 ) 2060 2061 def test_attribute_exact_or_hypen(self): 2062 self.assertSelectMultiple( 2063 ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 2064 ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), 2065 ('p[lang|="fr"]', ['lang-fr']), 2066 ('p[lang|="gb"]', []), 2067 ) 2068 2069 def test_attribute_exists(self): 2070 self.assertSelectMultiple( 2071 ('[rel]', ['l1', 'bob', 'me']), 2072 ('link[rel]', ['l1']), 2073 ('a[rel]', ['bob', 'me']), 2074 ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), 2075 ('p[class]', ['p1', 'pmulti']), 2076 ('[blah]', []), 2077 ('p[blah]', []), 2078 ('div[data-tag]', ['data1']) 2079 ) 2080 2081 def test_quoted_space_in_selector_name(self): 2082 html = """<div style="display: wrong">nope</div> 2083 <div style="display: right">yes</div> 2084 """ 2085 soup = BeautifulSoup(html, 'html.parser') 2086 [chosen] = soup.select('div[style="display: right"]') 2087 self.assertEqual("yes", chosen.string) 2088 2089 def test_unsupported_pseudoclass(self): 2090 self.assertRaises( 2091 NotImplementedError, self.soup.select, "a:no-such-pseudoclass") 2092 2093 self.assertRaises( 2094 SyntaxError, self.soup.select, "a:nth-of-type(a)") 2095 2096 def test_nth_of_type(self): 2097 # Try to select first paragraph 2098 els = self.soup.select('div#inner p:nth-of-type(1)') 2099 self.assertEqual(len(els), 1) 2100 self.assertEqual(els[0].string, 'Some text') 2101 2102 # Try to select third paragraph 2103 els = self.soup.select('div#inner p:nth-of-type(3)') 2104 self.assertEqual(len(els), 1) 2105 self.assertEqual(els[0].string, 'Another') 2106 2107 # Try to select (non-existent!) fourth paragraph 2108 els = self.soup.select('div#inner p:nth-of-type(4)') 2109 self.assertEqual(len(els), 0) 2110 2111 # Zero will select no tags. 2112 els = self.soup.select('div p:nth-of-type(0)') 2113 self.assertEqual(len(els), 0) 2114 2115 def test_nth_of_type_direct_descendant(self): 2116 els = self.soup.select('div#inner > p:nth-of-type(1)') 2117 self.assertEqual(len(els), 1) 2118 self.assertEqual(els[0].string, 'Some text') 2119 2120 def test_id_child_selector_nth_of_type(self): 2121 self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) 2122 2123 def test_select_on_element(self): 2124 # Other tests operate on the tree; this operates on an element 2125 # within the tree. 2126 inner = self.soup.find("div", id="main") 2127 selected = inner.select("div") 2128 # The <div id="inner"> tag was selected. The <div id="footer"> 2129 # tag was not. 2130 self.assertSelectsIDs(selected, ['inner', 'data1']) 2131 2132 def test_overspecified_child_id(self): 2133 self.assertSelects(".fancy #inner", ['inner']) 2134 self.assertSelects(".normal #inner", []) 2135 2136 def test_adjacent_sibling_selector(self): 2137 self.assertSelects('#p1 + h2', ['header2']) 2138 self.assertSelects('#p1 + h2 + p', ['pmulti']) 2139 self.assertSelects('#p1 + #header2 + .class1', ['pmulti']) 2140 self.assertEqual([], self.soup.select('#p1 + p')) 2141 2142 def test_general_sibling_selector(self): 2143 self.assertSelects('#p1 ~ h2', ['header2', 'header3']) 2144 self.assertSelects('#p1 ~ #header2', ['header2']) 2145 self.assertSelects('#p1 ~ h2 + a', ['me']) 2146 self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me']) 2147 self.assertEqual([], self.soup.select('#inner ~ h2')) 2148 2149 def test_dangling_combinator(self): 2150 self.assertRaises(SyntaxError, self.soup.select, 'h1 >') 2151 2152 def test_sibling_combinator_wont_select_same_tag_twice(self): 2153 self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) 2154 2155 # Test the selector grouping operator (the comma) 2156 def test_multiple_select(self): 2157 self.assertSelects('x, y', ['xid', 'yid']) 2158 2159 def test_multiple_select_with_no_space(self): 2160 self.assertSelects('x,y', ['xid', 'yid']) 2161 2162 def test_multiple_select_with_more_space(self): 2163 self.assertSelects('x, y', ['xid', 'yid']) 2164 2165 def test_multiple_select_duplicated(self): 2166 self.assertSelects('x, x', ['xid']) 2167 2168 def test_multiple_select_sibling(self): 2169 self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) 2170 2171 def test_multiple_select_tag_and_direct_descendant(self): 2172 self.assertSelects('x, y > z', ['xid', 'zidb']) 2173 2174 def test_multiple_select_direct_descendant_and_tags(self): 2175 self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) 2176 2177 def test_multiple_select_indirect_descendant(self): 2178 self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) 2179 2180 def test_invalid_multiple_select(self): 2181 self.assertRaises(SyntaxError, self.soup.select, ',x, y') 2182 self.assertRaises(SyntaxError, self.soup.select, 'x,,y') 2183 2184 def test_multiple_select_attrs(self): 2185 self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) 2186 2187 def test_multiple_select_ids(self): 2188 self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) 2189 2190 def test_multiple_select_nested(self): 2191 self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) 2192 2193 def test_select_duplicate_elements(self): 2194 # When markup contains duplicate elements, a multiple select 2195 # will find all of them. 2196 markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' 2197 soup = BeautifulSoup(markup, 'html.parser') 2198 selected = soup.select(".c1, .c2") 2199 self.assertEqual(3, len(selected)) 2200 2201 # Verify that find_all finds the same elements, though because 2202 # of an implementation detail it finds them in a different 2203 # order. 2204 for element in soup.find_all(class_=['c1', 'c2']): 2205 assert element in selected 2206