1# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12from pdb import set_trace
13import copy
14import pickle
15import re
16import warnings
17from bs4 import BeautifulSoup
18from bs4.builder import (
19    builder_registry,
20    HTMLParserTreeBuilder,
21)
22from bs4.element import (
23    PY3K,
24    CData,
25    Comment,
26    Declaration,
27    Doctype,
28    Formatter,
29    NavigableString,
30    SoupStrainer,
31    Tag,
32)
33from bs4.testing import (
34    SoupTest,
35    skipIf,
36)
37
# Each flag is True when the builder registry's lookup for the named
# feature returns something other than None.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
40
class TreeTest(SoupTest):
    """Base class providing assertion helpers for tag-selection tests."""

    def assertSelects(self, tags, should_match):
        """Check the .string of every tag in `tags` against `should_match`.

        Meant for tests whose markup is made of tags that each wrap a
        single string, where some search mechanism is expected to pick
        out a particular sequence of those strings.
        """
        found_strings = [element.string for element in tags]
        self.assertEqual(found_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Check the 'id' of every tag in `tags` against `should_match`.

        Meant for tests whose markup is made of tags that each carry an
        'id' attribute, where some search mechanism is expected to pick
        out a particular sequence of those tags.
        """
        found_ids = [element['id'] for element in tags]
        self.assertEqual(found_ids, should_match)
60
61
class TestFind(TreeTest):
    """Smoke tests for the find() method.

    find() is simply find_all() with limit=1, so only a few
    representative cases are covered here.
    """

    def test_find_tag(self):
        # The markup has two <b> tags; find() hands back the first one.
        markup = "<a>1</a><b>2</b><a>3</a><b>4</b>"
        first_b = self.soup(markup).find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        # A non-ASCII string can be located by its exact text.
        doc = self.soup('<h1>Räksmörgås</h1>')
        match = doc.find(string='Räksmörgås')
        self.assertEqual(match, 'Räksmörgås')

    def test_unicode_attribute_find(self):
        # A non-ASCII attribute value can be used as a search term.
        doc = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        # The return value is not needed; the document is simply
        # converted to a string before it is searched.
        str(doc)
        heading = doc.find(id='Räksmörgås')
        self.assertEqual("here it is", heading.text)

    def test_find_everything(self):
        """Exercise the optimized path that returns every tag."""
        doc = self.soup("<a>foo</a><b>bar</b>")
        everything = doc.find_all()
        self.assertEqual(2, len(everything))

    def test_find_everything_with_name(self):
        """Exercise the optimized path that returns every tag of one name."""
        doc = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        anchors = doc.find_all('a')
        self.assertEqual(2, len(anchors))
92
class TestFindAll(TreeTest):
    """Core behavior of the find_all() method."""

    def test_find_all_text_nodes(self):
        """Text nodes, not just tags, can be searched for."""
        doc = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = ["Foo", "bar", '\xbb']

        # A plain string has to match exactly; both the 'string' and
        # the 'text' keyword are accepted.
        self.assertEqual(doc.find_all(string="bar"), ["bar"])
        self.assertEqual(doc.find_all(text="bar"), ["bar"])
        # A list matches any one of its members.
        wanted = ["Foo", "bar"]
        self.assertEqual(doc.find_all(text=wanted), ["Foo", "bar"])
        # A compiled regular expression is tried against each string.
        match_all = re.compile('.*')
        self.assertEqual(doc.find_all(text=match_all), every_string)
        # True matches every string.
        self.assertEqual(doc.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """The 'limit' argument caps how many results come back."""
        doc = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        all_five = ["1", "2", "3", "4", "5"]

        self.assertSelects(doc.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(doc.find_all('a', limit=1), ["1"])
        # A limit larger than the number of matches is harmless.
        self.assertSelects(doc.find_all('a', limit=10), all_five)

        # Zero is treated as "no limit".
        self.assertSelects(doc.find_all('a', limit=0), all_five)

    def test_calling_a_tag_is_calling_findall(self):
        # Calling the soup object, or a tag inside it, like a function
        # selects the same tags a find_all() call would.
        doc = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(doc('a', limit=1), ["1"])
        self.assertSelects(doc.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        doc = self.soup("<a></a>")
        # Build a list whose only member is the list itself.
        recursive_list = []
        recursive_list.append(recursive_list)

        # _normalize_search_value needs special handling for this case;
        # without it the search would recurse forever.
        self.assertEqual([], doc.find_all(recursive_list))

    def test_find_all_resultset(self):
        """Every find_all() call hands back a ResultSet."""
        doc = self.soup("<a></a>")

        # A ResultSet is recognized here by its 'source' attribute.
        by_name = doc.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        everything = doc.find_all(True)
        self.assertTrue(hasattr(everything, "source"))

        by_text = doc.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
150
151
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by names that carry a namespace prefix."""

    def test_find_by_namespaced_name(self):
        # Tag names and attribute names may both contain a colon.
        doc = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        sqrt_tag = doc.find("mathml:msqrt")
        self.assertEqual("4", sqrt_tag.string)
        filled_tag = doc.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled_tag.name)
158
159
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class (not TreeTest) in super() so the lookup starts
        # at this class's direct parent and no setUp in the chain is
        # skipped.
        super(TestFindAllByName, self).setUp()
        # Two top-level tags (<a>, <b>) and a <c> tag with an <a> nested
        # inside it.
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        # A tag name can be combined with a 'text' restriction given as
        # an exact string, as True, or as a regular expression.
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        # Calling the tree like a function selects the same tags as
        # find_all('a').
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        # A SoupStrainer can stand in for the tag name.
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        # A list of names matches tags with any of those names.
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        # A dictionary keyed by tag name selects the same tags as the
        # equivalent list of names.
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        # A regular expression is matched against each tag's name.
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])

    def test_find_with_multi_valued_attribute(self):
        soup = self.soup(
            "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
        )
        # A string, a regular expression, or a list of strings passed as
        # the second argument is compared against the 'class' value.
        r1 = soup.find('div', 'a d')
        r2 = soup.find('div', re.compile(r'a d'))
        r3, r4 = soup.find_all('div', ['a b', 'a d'])
        self.assertEqual('3', r1.string)
        self.assertEqual('3', r2.string)
        self.assertEqual('1', r3.string)
        self.assertEqual('3', r4.string)
237
238
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by their attribute values."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        # The search value may be given as UTF-8 bytes, as the decoded
        # string, or inside a list of candidate values.
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        # Each regular expression below matches one of the individual
        # class names.
        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # If the search fails to match the individual strings "foo" and "bar",
        # it will be tried against the combined string "foo bar".
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, ["Found it"])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        # A regular expression in the 'attrs' position is matched
        # against the class.
        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        # So is a function; it receives the class value.
        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        # A SoupStrainer built from an attribute dictionary can be
        # handed to find_all directly.
        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        # Both the tag name and the text have to match: "bar" appears
        # in the document, but only inside a <b> tag.
        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        # The second <a> tag holds "foo" two levels down and is still
        # matched.
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        # An attribute restriction and a text restriction must both hold.
        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
418
419
class TestSmooth(TreeTest):
    """Test Tag.smooth."""

    def test_smooth(self):
        doc = self.soup("<div>a</div>")
        container = doc.div
        # Tack two strings, two comments and one more string onto the
        # <div>, one append() at a time.
        extras = ("b", "c", Comment("Comment 1"), Comment("Comment 2"), "d")
        for extra in extras:
            container.append(extra)
        # Then add a <span> that itself holds two separate strings.
        builder = self.default_builder()
        inner = Tag(doc, builder, 'span')
        for digit in ('1', '2'):
            inner.append(digit)
        container.append(inner)

        # The tree now contains runs of adjacent NavigableStrings.
        # That is a normal state of affairs, but it means nothing in
        # HTML terms, so smoothing them out before output can be
        # desirable.

        # With two children, the <span> tag has no single .string.
        self.assertEqual(None, container.span.string)

        self.assertEqual(7, len(container.contents))
        container.smooth()
        self.assertEqual(5, len(container.contents))

        # The three leading strings of the <div> have been merged into
        # one string.
        self.assertEqual('abc', container.contents[0])

        # Smoothing recurses: the <span> tag was smoothed as well.
        self.assertEqual('12', container.span.string)

        # Comments are strings too, but the two comments were _not_
        # merged, since merging them would change what the HTML means.
        self.assertEqual('Comment 1', container.contents[1])
        self.assertEqual('Comment 2', container.contents[2])
462
463
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        container = tree.div
        # index() has to report each child's own position, even for a
        # child whose markup is identical to an earlier sibling's.
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))
        # Asking for the index of something that is not a child raises
        # ValueError.
        self.assertRaises(ValueError, tree.index, 1)
480
481
class TestParentOperations(TreeTest):
    """Walking and searching upward through an element's ancestors."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        # An empty <ul>, then three nested <ul> tags with a <b> tag at
        # the bottom. The tests start from that <b> tag.
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        # Follow .parent upward one level at a time.
        parent = self.start.parent
        self.assertEqual(parent['id'], 'bottom')
        grandparent = self.start.parent.parent
        self.assertEqual(grandparent['id'], 'middle')
        great_grandparent = self.start.parent.parent.parent
        self.assertEqual(great_grandparent['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        first_top_level = self.tree.contents[0]
        self.assertEqual(first_top_level.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        # Ancestors are returned innermost first.
        every_ul = self.start.find_parents('ul')
        self.assertSelectsIDs(every_ul, ['bottom', 'middle', 'top'])
        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')
        outermost = self.start.find_parent('ul', id='top')
        self.assertEqual(outermost['id'], 'top')

    def test_parent_of_text_element(self):
        string_node = self.tree.find(text="Start here")
        self.assertEqual(string_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        string_node = self.tree.find(text="Start here")
        nearest_ul = string_node.find_parent('ul')
        self.assertEqual(nearest_ul['id'], 'bottom')

    def test_parent_generator(self):
        # Collect the id of every ancestor that has one.
        ancestor_ids = []
        for ancestor in self.start.parents:
            if ancestor is None or 'id' not in ancestor.attrs:
                continue
            ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
531
532
class ProximityTest(TreeTest):
    """Shared fixture for the next/previous element navigation tests."""

    def setUp(self):
        # Name this class (not TreeTest) in super() so the lookup starts
        # at this class's direct parent and no setUp in the chain is
        # skipped.
        super(ProximityTest, self).setUp()
        # A small document with three consecutive <b> tags in its body.
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
539
540
class TestNextOperations(ProximityTest):
    """Test forward navigation and searching in document order."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        # Every test starts from the first <b> tag.
        self.start = self.tree.b

    def test_next(self):
        # The element after a tag is its own contents; after that comes
        # the following tag.
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        # The starting tag itself is not part of the results.
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        # The same searches work when starting from a string.
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
580
class TestPreviousOperations(ProximityTest):
    """Backward navigation and searching in document order."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        # Every test starts from the final string in the document.
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        # The element before a string is the tag holding it; before
        # that comes the previous tag's string.
        one_back = self.end.previous_element
        self.assertEqual(one_back['id'], "3")
        two_back = self.end.previous_element.previous_element
        self.assertEqual(two_back, "Two")

    def test_previous_of_first_item_is_none(self):
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is meant to sit outside the next/previous
        # chain, but the check for that is disabled.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # "Three" is included because the <b> tag that contains the
        # "Three" node counts as a predecessor of that node.
        earlier_b_tags = self.end.find_all_previous('b')
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])
        first_only = self.end.find_all_previous(id=1)
        self.assertSelects(first_only, ["One"])

    def test_find_previous(self):
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')
        earlier_string = self.end.find_previous(text="One")
        self.assertEqual(earlier_string, "One")

    def test_find_previous_for_text_element(self):
        string_node = self.tree.find(text="Three")
        nearest_b = string_node.find_previous("b")
        self.assertEqual(nearest_b.string, "Three")
        earlier_b_tags = string_node.find_all_previous("b")
        self.assertSelects(earlier_b_tags, ["Three", "Two", "One"])

    def test_previous_generator(self):
        origin = self.tree.find(text="One")
        predecessors = list(origin.previous_elements)

        # Exactly four elements precede "One": its own <b> tag, then
        # the <body>, <head> and <html> tags.
        b_tag, body_tag, head_tag, html_tag = predecessors
        self.assertEqual(b_tag['id'], '1')
        self.assertEqual(body_tag.name, "body")
        self.assertEqual(head_tag.name, "head")
        self.assertEqual(html_tag.name, "html")
630
631
class SiblingTest(TreeTest):
    """Shared fixture for the sibling navigation tests."""

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # The layout above is only there for readability and would get
        # in the way of the tests, so drop every newline together with
        # the whitespace that follows it.
        compact_markup = re.sub(r"\n\s*", "", markup)
        self.tree = self.soup(compact_markup)
652
653
class TestNextSibling(SiblingTest):
    """Forward navigation and searching among siblings."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        # Every test starts from the first top-level <span>.
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        following = self.start.next_sibling
        self.assertEqual(following['id'], '2')
        two_along = self.start.next_sibling.next_sibling
        self.assertEqual(two_along['id'], '3')

        # next_element, by contrast, descends into the tag's children.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        # The <html> tag, an only child, and a last child all lack a
        # next sibling.
        self.assertEqual(self.tree.html.next_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.next_sibling, None)

        final_span = self.tree.find(id="4")
        self.assertEqual(final_span.next_sibling, None)

    def test_find_next_sibling(self):
        following_span = self.start.find_next_sibling('span')
        self.assertEqual(following_span['id'], '2')

    def test_next_siblings(self):
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        third_only = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(third_only, ['3'])

    def test_next_sibling_for_text_element(self):
        # Sibling navigation also works when starting from a string.
        doc = self.soup("Foo<b>bar</b>baz")
        origin = doc.find(text="Foo")
        self.assertEqual(origin.next_sibling.name, 'b')
        self.assertEqual(origin.next_sibling.next_sibling, 'baz')

        self.assertSelects(origin.find_next_siblings('b'), ['bar'])
        self.assertEqual(origin.find_next_sibling(text="baz"), "baz")
        self.assertEqual(origin.find_next_sibling(text="nonesuch"), None)
697
698
class TestPreviousSibling(SiblingTest):
    """Backward navigation and searching among siblings."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        # Every test starts from the last top-level <span>.
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        preceding = self.end.previous_sibling
        self.assertEqual(preceding['id'], '3')
        two_back = self.end.previous_sibling.previous_sibling
        self.assertEqual(two_back['id'], '2')

        # previous_element, by contrast, lands on the preceding tag's
        # child.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        # The <html> tag, an only child, and a first child all lack a
        # previous sibling.
        self.assertEqual(self.tree.html.previous_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.previous_sibling, None)

        leading_span = self.tree.find(id="1")
        self.assertEqual(leading_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        preceding_span = self.end.find_previous_sibling('span')
        self.assertEqual(preceding_span['id'], '3')

    def test_previous_siblings(self):
        # Results come back nearest sibling first.
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        first_only = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(first_only, ['1'])

    def test_previous_sibling_for_text_element(self):
        # Sibling navigation also works when starting from a string.
        doc = self.soup("Foo<b>bar</b>baz")
        origin = doc.find(text="baz")
        self.assertEqual(origin.previous_sibling.name, 'b')
        self.assertEqual(origin.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(origin.find_previous_siblings('b'), ['bar'])
        self.assertEqual(origin.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(origin.find_previous_sibling(text="nonesuch"), None)
742
743
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        soup = self.soup("")
        # Attributes may be given as keyword arguments, through the
        # 'attrs' dictionary, or both; they end up merged.
        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
        self.assertIsInstance(new_tag, Tag)
        self.assertEqual("foo", new_tag.name)
        self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
        # A freshly created tag is not attached to anything.
        self.assertEqual(None, new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        # The XML half only runs when an XML builder is registered.
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertIsInstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        # The second argument selects the string class to instantiate.
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertIsInstance(s, Comment)
785
class TestTreeModification(SoupTest):
    """Tests of the methods that change a parsed tree in place:
    attribute assignment, insert/append/extend, replace_with, wrap,
    unwrap, extract, clear and assignment to .string.
    """

    def test_attribute_modification(self):
        # Attributes are changed, removed and added with
        # dictionary-style access on the tag.
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del soup.a['id']
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        # Tags built directly with the Tag constructor can be inserted
        # into an existing tree.
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
                <p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
        a = soup.a
        a.extract()
        # Once extracted, the tag has no parent, so there is nothing
        # to replace it in or unwrap it into.
        self.assertEqual(None, a.parent)
        self.assertRaises(ValueError, a.unwrap)
        self.assertRaises(ValueError, a.replace_with, soup.c)

    def test_replace_tag_with_itself(self):
        # Replacing a tag with itself leaves the document unchanged.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_insert_beautifulsoup_object_inserts_children(self):
        """Inserting one BeautifulSoup object into another actually inserts all
        of its children -- you'll never combine BeautifulSoup objects.
        """
        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")

        text = "<p>p2</p><p>p3</p>"
        to_insert = self.soup(text)
        soup.insert(1, to_insert)

        # Use a TestCase assertion rather than a bare assert so the
        # check still runs when Python is started with -O.
        for i in soup.descendants:
            self.assertFalse(isinstance(i, BeautifulSoup))

        p1, p2, p3, p4 = list(soup.children)
        self.assertEqual("And now, a word:", p1.string)
        self.assertEqual("p2", p2.string)
        self.assertEqual("p3", p3.string)
        self.assertEqual("And we're back.", p4.string)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replaceWith('')
        right.replaceWith('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        # Replacing the last node of the document keeps the
        # next_element/previous_element chain consistent.
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder()
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        # Appending the current last child leaves the markup unchanged.
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_extend(self):
        # extend() appends every tag in the list, in order, moving each
        # one out of its previous position.
        data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
        soup = self.soup(data)
        tags = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
        soup.a.extend(tags)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

        # Can't insert an element before itself.
        b = soup.b
        self.assertRaises(ValueError, b.insert_before, b)

        # Can't insert before if an element has no parent.
        b.extract()
        self.assertRaises(ValueError, b.insert_before, "nope")

        # Can insert an identical element
        soup = self.soup("<a>")
        soup.a.insert_before(soup.new_tag("a"))

    def test_insert_multiple_before(self):
        # insert_before() accepts several elements and keeps them in
        # the order given.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ", " ", "QUUX")
        soup.a.insert_before("QUUX", " ", "BAZ")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))

        soup.a.insert_before(soup.b, "FOO")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

        # Can't insert an element after itself.
        b = soup.b
        self.assertRaises(ValueError, b.insert_after, b)

        # Can't insert after if an element has no parent.
        b.extract()
        self.assertRaises(ValueError, b.insert_after, "nope")

        # Can insert an identical element. This must call
        # insert_after(); insert_before() is covered by its own test.
        soup = self.soup("<a>")
        soup.a.insert_after(soup.new_tag("a"))

    def test_insert_multiple_after(self):
        # insert_after() accepts several elements and keeps them in
        # the order given.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ", " ", "QUUX")
        soup.a.insert_after("QUUX", " ", "BAZ")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
        soup.b.insert_after(soup.a, "FOO ")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        # Neither the string nor the tag has a parent, and the
        # BeautifulSoup object itself rejects the call outright.
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        # Neither the string nor the tag has a parent, and the
        # BeautifulSoup object itself rejects the call outright.
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
                "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        # Replacing the second <b> with the first moves the first one.
        show.replace_with(no)
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        # unwrap() removes the tag but leaves its contents in place.
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        # wrap() returns the wrapper, which takes the string's place
        # in the tree.
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(extracted.parent, None)
        self.assertEqual(extracted.previous_element, None)
        self.assertEqual(extracted.next_element.next_element, None)

        # The gap where the extracted tag used to be has been mended.
        content_1 = soup.find(text="Some content. ")
        content_2 = soup.find(text=" More content.")
        self.assertEqual(content_1.next_element, content_2)
        self.assertEqual(content_1.next_sibling, content_2)
        self.assertEqual(content_2.previous_element, content_1)
        self.assertEqual(content_2.previous_sibling, content_1)

    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

    def test_extract_multiples_of_same_tag(self):
        soup = self.soup("""
<html>
<head>
<script>foo</script>
</head>
<body>
 <script>bar</script>
 <a></a>
</body>
<script>baz</script>
</html>""")
        # Extract the first remaining <script> once for every <script>
        # found up front. A plain loop is used because only the side
        # effect matters, not the extracted values.
        for _ in soup.find_all("script"):
            soup.script.extract()
        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))

    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
        # <body> sits between two identical "\n" strings.
        soup = self.soup(
            '<html>\n'
            '<body>hi</body>\n'
            '</html>')
        soup.find('body').extract()
        self.assertEqual(None, soup.find('body'))

    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
        # clear using extract()
        a = soup.a
        soup.p.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(a, "contents"))

        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
        soup.a.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])
        # Assigning .string replaces whatever the tag contained before.
        soup.b.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])

    def test_string_set_does_not_affect_original_string(self):
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        soup.b.string = soup.c.string
        # "bar" now appears in <b> and is still present in <c>.
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")

    def test_set_string_preserves_class_of_string(self):
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        self.assertTrue(isinstance(soup.a.string, CData))
1272
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        """The old .fooTag spelling still works but issues a warning."""
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record every warning, even one that was already issued
            # earlier or would be silenced by the filters in effect
            # when the test runs; otherwise w could be empty and the
            # w[0] lookup below would raise IndexError.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the ``in``
        operator. has_attr() checks the tag's attributes and ``in``
        checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with more than one child has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments are included when the types argument asks for
        # them, or when it is None.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1382
class TestCDAtaListAttributes(SoupTest):
    """Testing cdata-list attributes like 'class'."""

    def test_single_value_becomes_list(self):
        # Even a single class name is exposed as a one-item list.
        anchor = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], anchor['class'])

    def test_multiple_values_becomes_list(self):
        anchor = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], anchor['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines separate values just as spaces do.
        anchor = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], anchor['class'])

    def test_attributes_joined_into_string_on_output(self):
        # On output the values are joined with single spaces.
        anchor = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', anchor.encode())

    def test_get_attribute_list(self):
        # 'id' is not split on whitespace, but get_attribute_list()
        # still hands back a list.
        anchor = self.soup("<a id='abc def'>").a
        self.assertEqual(['abc def'], anchor.get_attribute_list('id'))

    def test_accept_charset(self):
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        anchor = self.soup(markup).a
        # We saw in another test that accept-charset is a cdata-list
        # attribute for the <form> tag. But it's not a cdata-list
        # attribute for any other tag.
        self.assertEqual('ISO-8859-1 UTF-8', anchor['accept-charset'])

    def test_string_has_immutable_name_property(self):
        text_node = self.soup("s").string
        self.assertEqual(None, text_node.name)
        # Assigning to .name on a string is rejected.
        with self.assertRaises(AttributeError):
            text_node.name = 'foo'
1425
class TestPersistence(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        """Parse a small sample page into self.tree for the tests."""
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # A tree that goes through a pickle round trip comes back as
        # a BeautifulSoup object with the same markup.
        serialized = pickle.dumps(self.tree, 2)
        restored = pickle.loads(serialized)
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # A deep copy of a tree renders to the same markup.
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_copy_preserves_encoding(self):
        original = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
        detected_encoding = original.original_encoding
        duplicate = original.__copy__()
        self.assertEqual("<p> </p>", str(duplicate))
        self.assertEqual(detected_encoding, duplicate.original_encoding)

    def test_unicode_pickle(self):
        # Non-ASCII characters survive a pickle round trip.
        markup = "<b>\N{SNOWMAN}</b>"
        original = self.soup(markup)
        serialized = pickle.dumps(original, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(serialized)
        self.assertEqual(restored.decode(), original.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        markup = "<b>Foo<a></a></b><b>Bar</b>"
        tree = self.soup(markup)
        attached = tree.find(string="Foo")
        detached = copy.copy(attached)
        # The copy compares equal to the original but has none of its
        # links into the tree.
        self.assertEqual(attached, detached)
        self.assertEqual(None, detached.parent)
        self.assertEqual(None, detached.next_element)
        self.assertNotEqual(None, attached.next_sibling)
        self.assertEqual(None, detached.next_sibling)
        self.assertEqual(None, detached.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        markup = "<b><!--Foo--></b>"
        comment = self.soup(markup).string
        comment_copy = copy.copy(comment)
        self.assertEqual(comment, comment_copy)
        self.assertTrue(isinstance(comment_copy, Comment))

    def test_copy_entire_soup(self):
        # A shallow copy of the whole soup compares equal to it.
        markup = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        tree = self.soup(markup)
        self.assertEqual(tree, copy.copy(tree))

    def test_copy_tag_copies_contents(self):
        markup = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        original_div = self.soup(markup).div
        copied_div = copy.copy(original_div)

        # The two tags look the same, and evaluate to equal.
        self.assertEqual(str(original_div), str(copied_div))
        self.assertEqual(original_div, copied_div)

        # But they're not the same object.
        self.assertFalse(original_div is copied_div)

        # And they don't have the same relation to the parse tree. The
        # copy is not associated with a parse tree at all.
        self.assertEqual(None, copied_div.parent)
        self.assertEqual(None, copied_div.previous_element)
        self.assertEqual(None, copied_div.find(string='Bar').next_element)
        self.assertNotEqual(None, original_div.find(string='Bar').next_element)
1522
class TestSubstitutions(SoupTest):
    """Tests of the entity and encoding substitutions made on output."""

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # No formatter is passed here, so that this test exercises the
        # default instead of duplicating test_formatter_minimal.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        # The e-with-acute becomes a named entity and <br> is output
        # as a self-closing tag.
        self.assertEqual(
            decoded,
            self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_html5(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html5")
        # Same entity substitution as "html", but <br> has no trailing slash.
        self.assertEqual(
            decoded,
            self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b><br/>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        # Calling decode() with no formatter gives the "minimal" output.
        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        doc = """
  <script type="text/javascript">
   console.log("< < hey > > ");
  </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        # The angle brackets inside <script> come out unescaped.
        self.assertIn(b"< < hey > >", encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        doc = """
  <style type="text/css">
   console.log("< < hey > > ");
  </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        # The angle brackets inside <style> come out unescaped.
        self.assertIn(b"< < hey > >", encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>")
        # Everything outside the <pre> and <textarea> tags is
        # reformatted, but everything inside them is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter_function(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter=lambda x: x.upper())
        self.assertIn("FOO", pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertIn(b"charset=utf-8", utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertIn(b"charset=euc_jp", euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertIn(b"charset=shift-jis", shift_jis)

        # UTF-16 output is decoded again before the charset is looked for.
        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertIn("charset=utf-16", utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1676
class TestEncoding(SoupTest):
    """Check that strings and tags can be turned into encoded output."""

    def test_unicode_string_can_be_encoded(self):
        # A string pulled out of the tree encodes like a plain string.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        expected = "\N{SNOWMAN}".encode("utf-8")
        actual = tree.b.string.encode("utf-8")
        self.assertEqual(actual, expected)

    def test_tag_containing_unicode_string_can_be_encoded(self):
        # Encoding the tag gives back the encoded markup.
        markup = "<b>\N{SNOWMAN}</b>"
        tree = self.soup(markup)
        actual = tree.b.encode("utf-8")
        self.assertEqual(actual, markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # The snowman can't be written in ASCII, so a numeric
        # character reference is output in its place.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        actual = tree.b.encode("ascii")
        self.assertEqual(actual, b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        # With errors="strict" the unencodable character raises.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        with self.assertRaises(UnicodeEncodeError):
            tree.encode("ascii", errors="strict")

    def test_decode_contents(self):
        # Only what is inside the tag is returned, as text.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        inner = tree.b.decode_contents()
        self.assertEqual("\N{SNOWMAN}", inner)

    def test_encode_contents(self):
        # Only what is inside the tag is returned, as bytes.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        inner = tree.b.encode_contents(encoding="utf8")
        self.assertEqual("\N{SNOWMAN}".encode("utf8"), inner)

    def test_deprecated_renderContents(self):
        # With no arguments, renderContents() gives UTF-8 bytes.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        inner = tree.b.renderContents()
        self.assertEqual("\N{SNOWMAN}".encode("utf8"), inner)

    def test_repr(self):
        markup = "<b>\N{SNOWMAN}</b>"
        tree = self.soup(markup)
        # The expected repr depends on the PY3K flag: the markup itself
        # when it is set, an escaped bytestring otherwise.
        expected = markup if PY3K else b'<b>\\u2603</b>'
        self.assertEqual(expected, repr(tree))
1728
class TestFormatter(SoupTest):
    """Tests of customizing output by subclassing Formatter."""

    def test_sort_attributes(self):
        # Formatter.attributes() can be overridden to decide which
        # attributes are output, and in what order.
        class FilteringFormatter(Formatter):
            def attributes(self, tag):
                # Record the tag so the test can check what was passed in.
                self.called_with = tag
                for name, value in sorted(tag.attrs.items()):
                    if name != 'ignore':
                        yield name, value

        markup = '<p cval="1" aval="2" ignore="ignored"></p>'
        soup = self.soup(markup)
        custom = FilteringFormatter()
        output = soup.decode(formatter=custom)

        # attributes() was called on the <p> tag. The 'ignore'
        # attribute is gone and the remaining two come out sorted.
        self.assertEqual(custom.called_with, soup.p)
        self.assertEqual('<p aval="2" cval="1"></p>', output)

1750
1751
class TestNavigableStringSubclasses(SoupTest):
    """Tests of how the NavigableString subclasses are output."""

    def test_cdata(self):
        # None of the current builders turn CDATA sections into CData
        # objects, but you can create them manually.
        soup = self.soup("")
        cdata = CData("foo")
        soup.insert(1, cdata)
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        # Use the 'string' argument, as the rest of this file does,
        # rather than its older alias 'text'.
        self.assertEqual(soup.find(string="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0

        def increment(*args):
            # Count the calls, and return something that would be
            # obvious if it ever showed up in the output.
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        cdata = CData("<><><>")
        soup.insert(1, cdata)
        self.assertEqual(
            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always ends
        # in a newline.
        doctype = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, doctype)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        # A Declaration is output between "<?" and "?>".
        d = Declaration("foo")
        self.assertEqual("<?foo?>", d.output_ready())
1793
class TestSoupSelector(TreeTest):
    """Tests of CSS selector support: select() and select_one().

    Every test except the ones that build their own soup runs against
    the HTML document below, parsed with html.parser in setUp().
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        # NOTE: this replaces the soup() helper inherited from the base
        # class with a parsed document, for the lifetime of each test.
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assertSelects(self, selector, expected_ids, **kwargs):
        """Assert that `selector` selects the tags with `expected_ids`.

        The two lists of IDs are compared after sorting, so the order
        in which tags are selected does not matter. Any keyword
        arguments are passed on to select().
        """
        el_ids = sorted(
            el['id'] for el in self.soup.select(selector, **kwargs))
        # Sort a copy, so that the caller's list is not reordered as a
        # side effect of making the assertion.
        expected_ids = sorted(expected_ids)
        self.assertEqual(expected_ids, el_ids,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected_ids), ', '.join(el_ids)
            )
        )

    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        """Run assertSelect on each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, ['The title'])

    def test_one_tag_many(self):
        els = self.soup.select('div')
        self.assertEqual(len(els), 4)
        for div in els:
            self.assertEqual(div.name, 'div')

        # select_one() gives the first of those matches.
        el = self.soup.select_one('div')
        self.assertEqual('main', el['id'])

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        self.assertIsNone(match)

    def test_tag_in_tag_one(self):
        self.assertSelects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_limit(self):
        self.assertSelects('html div', ['main'], limit=1)
        self.assertSelects('html body div', ['inner', 'main'], limit=2)
        # A limit larger than the number of matches returns them all.
        self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'],
                           limit=10)

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(SyntaxError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
        self.assertEqual(dashed[0]['id'], 'dash2')

    def test_dashed_tag_text(self):
        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')

    def test_select_dashed_matches_find_all(self):
        self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))

    def test_header_tags(self):
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_quoted_space_in_selector_name(self):
        html = """<div style="display: wrong">nope</div>
        <div style="display: right">yes</div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Unpacking into a one-element list also checks that there is
        # exactly one match.
        [chosen] = soup.select('div[style="display: right"]')
        self.assertEqual("yes", chosen.string)

    def test_unsupported_pseudoclass(self):
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
            SyntaxError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Zero will select no tags.
        els = self.soup.select('div p:nth-of-type(0)')
        self.assertEqual(len(els), 0)

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

    def test_id_child_selector_nth_of_type(self):
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        main = self.soup.find("div", id="main")
        selected = main.select("div")
        # The <div> tags inside <div id="main"> were selected. The
        # <div id="footer"> tag was not.
        self.assertSelectsIDs(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(SyntaxError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assertSelects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assertSelects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        self.assertSelects('x,    y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assertSelects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assertSelects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assertSelects('div x,y,  z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        self.assertRaises(SyntaxError, self.soup.select, ',x, y')
        self.assertRaises(SyntaxError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])

    def test_select_duplicate_elements(self):
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, 'html.parser')
        selected = soup.select(".c1, .c2")
        self.assertEqual(3, len(selected))

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=['c1', 'c2']):
            # A bare assert would be stripped when running under -O.
            self.assertIn(element, selected)
2206