1from functools import partial
2from importlib import reload
3from io import BytesIO, StringIO
4import os
5from pathlib import Path
6import re
7import threading
8from urllib.error import URLError
9
10import numpy as np
11import pytest
12
13from pandas.compat import is_platform_windows
14from pandas.errors import ParserError
15import pandas.util._test_decorators as td
16
17from pandas import (
18    DataFrame,
19    MultiIndex,
20    Series,
21    Timestamp,
22    date_range,
23    read_csv,
24    to_datetime,
25)
26import pandas._testing as tm
27
28from pandas.io.common import file_path_to_url
29import pandas.io.html
30from pandas.io.html import read_html
31
32HERE = os.path.dirname(__file__)
33
34
@pytest.fixture(
    params=[
        "chinese_utf-16.html",
        "chinese_utf-32.html",
        "chinese_utf-8.html",
        "letz_latin1.html",
    ]
)
def html_encoding_file(request, datapath):
    """Return the data path of one encoded HTML file per fixture parameter."""
    encoding_dir = ("io", "data", "html_encoding")
    return datapath(*encoding_dir, request.param)
46
47
def assert_framelist_equal(list1, list2, *args, **kwargs):
    """
    Assert that two lists hold pairwise-equal, non-empty DataFrames.

    Extra positional and keyword arguments are forwarded to
    ``tm.assert_frame_equal`` for every pair of frames.
    """
    size1, size2 = len(list1), len(list2)
    assert size1 == size2, (
        f"lists are not of equal size len(list1) == {size1}, len(list2) == {size2}"
    )

    pairs = list(zip(list1, list2))
    assert all(
        isinstance(left, DataFrame) and isinstance(right, DataFrame)
        for left, right in pairs
    ), "not all list elements are DataFrames"

    for left, right in pairs:
        tm.assert_frame_equal(left, right, *args, **kwargs)
        # The frames are equal at this point, so checking one side is enough.
        assert not left.empty, "frames are both empty"
66
67
@td.skip_if_no("bs4")
@td.skip_if_no("html5lib")
def test_bs4_version_fails(monkeypatch, datapath):
    """``read_html`` must raise ImportError when bs4 reports version 4.2."""
    import bs4

    spam_path = datapath("io", "data", "html", "spam.html")
    # Make the installed bs4 report version "4.2" for the duration of the test.
    monkeypatch.setattr(bs4, "__version__", "4.2")
    with pytest.raises(ImportError, match="Pandas requires version"):
        read_html(spam_path, flavor="bs4")
76
77
def test_invalid_flavor():
    """An unknown ``flavor`` string must be rejected with a ValueError."""
    flavor = "invalid flavor"
    # The braces are escaped because ``match`` is used as a regular expression.
    expected_msg = rf"\{{{flavor}\}} is not a valid set of flavors"

    with pytest.raises(ValueError, match=expected_msg):
        read_html("google.com", match="google", flavor=flavor)
85
86
@td.skip_if_no("bs4")
@td.skip_if_no("lxml")
@td.skip_if_no("html5lib")
def test_same_ordering(datapath):
    """The lxml and bs4 flavors must return equal frame lists for the same file."""
    filename = datapath("io", "data", "html", "valid_markup.html")
    dfs_lxml, dfs_bs4 = (
        read_html(filename, index_col=0, flavor=[parser]) for parser in ("lxml", "bs4")
    )
    assert_framelist_equal(dfs_lxml, dfs_bs4)
95
96
97@pytest.mark.parametrize(
98    "flavor",
99    [
100        pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]),
101        pytest.param("lxml", marks=td.skip_if_no("lxml")),
102    ],
103    scope="class",
104)
105class TestReadHtml:
106    @pytest.fixture(autouse=True)
107    def set_files(self, datapath):
108        self.spam_data = datapath("io", "data", "html", "spam.html")
109        self.spam_data_kwargs = {}
110        self.spam_data_kwargs["encoding"] = "UTF-8"
111        self.banklist_data = datapath("io", "data", "html", "banklist.html")
112
113    @pytest.fixture(autouse=True, scope="function")
114    def set_defaults(self, flavor, request):
115        self.read_html = partial(read_html, flavor=flavor)
116        yield
117
118    def test_to_html_compat(self):
119        df = (
120            tm.makeCustomDataframe(
121                4,
122                3,
123                data_gen_f=lambda *args: np.random.rand(),
124                c_idx_names=False,
125                r_idx_names=False,
126            )
127            .applymap("{:.3f}".format)
128            .astype(float)
129        )
130        out = df.to_html()
131        res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
132        tm.assert_frame_equal(res, df)
133
134    @pytest.mark.xfail(reason="Html file was removed")
135    @tm.network
136    def test_banklist_url_positional_match(self):
137        url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
138        # Passing match argument as positional should cause a FutureWarning.
139        with tm.assert_produces_warning(FutureWarning):
140            df1 = self.read_html(
141                url, "First Federal Bank of Florida", attrs={"id": "table"}
142            )
143        with tm.assert_produces_warning(FutureWarning):
144            df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})
145
146        assert_framelist_equal(df1, df2)
147
148    @pytest.mark.xfail(reason="Html file was removed")
149    @tm.network
150    def test_banklist_url(self):
151        url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
152        df1 = self.read_html(
153            url, match="First Federal Bank of Florida", attrs={"id": "table"}
154        )
155        df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"})
156
157        assert_framelist_equal(df1, df2)
158
159    @tm.network
160    def test_spam_url(self):
161        url = (
162            "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
163            "pandas/tests/io/data/html/spam.html"
164        )
165        df1 = self.read_html(url, match=".*Water.*")
166        df2 = self.read_html(url, match="Unit")
167
168        assert_framelist_equal(df1, df2)
169
170    @pytest.mark.slow
171    def test_banklist(self):
172        df1 = self.read_html(
173            self.banklist_data, match=".*Florida.*", attrs={"id": "table"}
174        )
175        df2 = self.read_html(
176            self.banklist_data, match="Metcalf Bank", attrs={"id": "table"}
177        )
178
179        assert_framelist_equal(df1, df2)
180
181    def test_spam(self):
182        df1 = self.read_html(self.spam_data, match=".*Water.*")
183        df2 = self.read_html(self.spam_data, match="Unit")
184        assert_framelist_equal(df1, df2)
185
186        assert df1[0].iloc[0, 0] == "Proximates"
187        assert df1[0].columns[0] == "Nutrient"
188
189    def test_spam_no_match(self):
190        dfs = self.read_html(self.spam_data)
191        for df in dfs:
192            assert isinstance(df, DataFrame)
193
194    def test_banklist_no_match(self):
195        dfs = self.read_html(self.banklist_data, attrs={"id": "table"})
196        for df in dfs:
197            assert isinstance(df, DataFrame)
198
199    def test_spam_header(self):
200        df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0]
201        assert df.columns[0] == "Proximates"
202        assert not df.empty
203
204    def test_skiprows_int(self):
205        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
206        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)
207
208        assert_framelist_equal(df1, df2)
209
210    def test_skiprows_range(self):
211        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2))
212        df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2))
213
214        assert_framelist_equal(df1, df2)
215
216    def test_skiprows_list(self):
217        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2])
218        df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1])
219
220        assert_framelist_equal(df1, df2)
221
222    def test_skiprows_set(self):
223        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2})
224        df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1})
225
226        assert_framelist_equal(df1, df2)
227
228    def test_skiprows_slice(self):
229        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
230        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)
231
232        assert_framelist_equal(df1, df2)
233
234    def test_skiprows_slice_short(self):
235        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2))
236        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2))
237
238        assert_framelist_equal(df1, df2)
239
240    def test_skiprows_slice_long(self):
241        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5))
242        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1))
243
244        assert_framelist_equal(df1, df2)
245
246    def test_skiprows_ndarray(self):
247        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2))
248        df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2))
249
250        assert_framelist_equal(df1, df2)
251
252    def test_skiprows_invalid(self):
253        with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
254            self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf")
255
256    def test_index(self):
257        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
258        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
259        assert_framelist_equal(df1, df2)
260
261    def test_header_and_index_no_types(self):
262        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
263        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
264        assert_framelist_equal(df1, df2)
265
266    def test_header_and_index_with_types(self):
267        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
268        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
269        assert_framelist_equal(df1, df2)
270
271    def test_infer_types(self):
272
273        # 10892 infer_types removed
274        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
275        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
276        assert_framelist_equal(df1, df2)
277
278    def test_string_io(self):
279        with open(self.spam_data, **self.spam_data_kwargs) as f:
280            data1 = StringIO(f.read())
281
282        with open(self.spam_data, **self.spam_data_kwargs) as f:
283            data2 = StringIO(f.read())
284
285        df1 = self.read_html(data1, match=".*Water.*")
286        df2 = self.read_html(data2, match="Unit")
287        assert_framelist_equal(df1, df2)
288
289    def test_string(self):
290        with open(self.spam_data, **self.spam_data_kwargs) as f:
291            data = f.read()
292
293        df1 = self.read_html(data, match=".*Water.*")
294        df2 = self.read_html(data, match="Unit")
295
296        assert_framelist_equal(df1, df2)
297
298    def test_file_like(self):
299        with open(self.spam_data, **self.spam_data_kwargs) as f:
300            df1 = self.read_html(f, match=".*Water.*")
301
302        with open(self.spam_data, **self.spam_data_kwargs) as f:
303            df2 = self.read_html(f, match="Unit")
304
305        assert_framelist_equal(df1, df2)
306
307    @tm.network
308    def test_bad_url_protocol(self):
309        with pytest.raises(URLError):
310            self.read_html("git://github.com", match=".*Water.*")
311
312    @tm.network
313    @pytest.mark.slow
314    def test_invalid_url(self):
315        try:
316            with pytest.raises(URLError):
317                self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*")
318        except ValueError as e:
319            assert "No tables found" in str(e)
320
321    @pytest.mark.slow
322    def test_file_url(self):
323        url = self.banklist_data
324        dfs = self.read_html(
325            file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
326        )
327        assert isinstance(dfs, list)
328        for df in dfs:
329            assert isinstance(df, DataFrame)
330
331    @pytest.mark.slow
332    def test_invalid_table_attrs(self):
333        url = self.banklist_data
334        with pytest.raises(ValueError, match="No tables found"):
335            self.read_html(
336                url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"}
337            )
338
339    def _bank_data(self, *args, **kwargs):
340        return self.read_html(
341            self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs
342        )
343
344    @pytest.mark.slow
345    def test_multiindex_header(self):
346        df = self._bank_data(header=[0, 1])[0]
347        assert isinstance(df.columns, MultiIndex)
348
349    @pytest.mark.slow
350    def test_multiindex_index(self):
351        df = self._bank_data(index_col=[0, 1])[0]
352        assert isinstance(df.index, MultiIndex)
353
354    @pytest.mark.slow
355    def test_multiindex_header_index(self):
356        df = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
357        assert isinstance(df.columns, MultiIndex)
358        assert isinstance(df.index, MultiIndex)
359
360    @pytest.mark.slow
361    def test_multiindex_header_skiprows_tuples(self):
362        df = self._bank_data(header=[0, 1], skiprows=1)[0]
363        assert isinstance(df.columns, MultiIndex)
364
365    @pytest.mark.slow
366    def test_multiindex_header_skiprows(self):
367        df = self._bank_data(header=[0, 1], skiprows=1)[0]
368        assert isinstance(df.columns, MultiIndex)
369
370    @pytest.mark.slow
371    def test_multiindex_header_index_skiprows(self):
372        df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
373        assert isinstance(df.index, MultiIndex)
374        assert isinstance(df.columns, MultiIndex)
375
376    @pytest.mark.slow
377    def test_regex_idempotency(self):
378        url = self.banklist_data
379        dfs = self.read_html(
380            file_path_to_url(os.path.abspath(url)),
381            match=re.compile(re.compile("Florida")),
382            attrs={"id": "table"},
383        )
384        assert isinstance(dfs, list)
385        for df in dfs:
386            assert isinstance(df, DataFrame)
387
388    def test_negative_skiprows(self):
389        msg = r"\(you passed a negative value\)"
390        with pytest.raises(ValueError, match=msg):
391            self.read_html(self.spam_data, match="Water", skiprows=-1)
392
393    @tm.network
394    def test_multiple_matches(self):
395        url = "https://docs.python.org/2/"
396        dfs = self.read_html(url, match="Python")
397        assert len(dfs) > 1
398
399    @tm.network
400    def test_python_docs_table(self):
401        url = "https://docs.python.org/2/"
402        dfs = self.read_html(url, match="Python")
403        zz = [df.iloc[0, 0][0:4] for df in dfs]
404        assert sorted(zz) == sorted(["Repo", "What"])
405
406    def test_empty_tables(self):
407        """
408        Make sure that read_html ignores empty tables.
409        """
410        html = """
411            <table>
412                <thead>
413                    <tr>
414                        <th>A</th>
415                        <th>B</th>
416                    </tr>
417                </thead>
418                <tbody>
419                    <tr>
420                        <td>1</td>
421                        <td>2</td>
422                    </tr>
423                </tbody>
424            </table>
425            <table>
426                <tbody>
427                </tbody>
428            </table>
429        """
430        result = self.read_html(html)
431        assert len(result) == 1
432
433    def test_multiple_tbody(self):
434        # GH-20690
435        # Read all tbody tags within a single table.
436        result = self.read_html(
437            """<table>
438            <thead>
439                <tr>
440                    <th>A</th>
441                    <th>B</th>
442                </tr>
443            </thead>
444            <tbody>
445                <tr>
446                    <td>1</td>
447                    <td>2</td>
448                </tr>
449            </tbody>
450            <tbody>
451                <tr>
452                    <td>3</td>
453                    <td>4</td>
454                </tr>
455            </tbody>
456        </table>"""
457        )[0]
458
459        expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"])
460
461        tm.assert_frame_equal(result, expected)
462
463    def test_header_and_one_column(self):
464        """
465        Don't fail with bs4 when there is a header and only one column
466        as described in issue #9178
467        """
468        result = self.read_html(
469            """<table>
470                <thead>
471                    <tr>
472                        <th>Header</th>
473                    </tr>
474                </thead>
475                <tbody>
476                    <tr>
477                        <td>first</td>
478                    </tr>
479                </tbody>
480            </table>"""
481        )[0]
482
483        expected = DataFrame(data={"Header": "first"}, index=[0])
484
485        tm.assert_frame_equal(result, expected)
486
487    def test_thead_without_tr(self):
488        """
489        Ensure parser adds <tr> within <thead> on malformed HTML.
490        """
491        result = self.read_html(
492            """<table>
493            <thead>
494                <tr>
495                    <th>Country</th>
496                    <th>Municipality</th>
497                    <th>Year</th>
498                </tr>
499            </thead>
500            <tbody>
501                <tr>
502                    <td>Ukraine</td>
503                    <th>Odessa</th>
504                    <td>1944</td>
505                </tr>
506            </tbody>
507        </table>"""
508        )[0]
509
510        expected = DataFrame(
511            data=[["Ukraine", "Odessa", 1944]],
512            columns=["Country", "Municipality", "Year"],
513        )
514
515        tm.assert_frame_equal(result, expected)
516
517    def test_tfoot_read(self):
518        """
519        Make sure that read_html reads tfoot, containing td or th.
520        Ignores empty tfoot
521        """
522        data_template = """<table>
523            <thead>
524                <tr>
525                    <th>A</th>
526                    <th>B</th>
527                </tr>
528            </thead>
529            <tbody>
530                <tr>
531                    <td>bodyA</td>
532                    <td>bodyB</td>
533                </tr>
534            </tbody>
535            <tfoot>
536                {footer}
537            </tfoot>
538        </table>"""
539
540        expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"])
541
542        expected2 = DataFrame(
543            data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"]
544        )
545
546        data1 = data_template.format(footer="")
547        data2 = data_template.format(footer="<tr><td>footA</td><th>footB</th></tr>")
548
549        result1 = self.read_html(data1)[0]
550        result2 = self.read_html(data2)[0]
551
552        tm.assert_frame_equal(result1, expected1)
553        tm.assert_frame_equal(result2, expected2)
554
555    def test_parse_header_of_non_string_column(self):
556        # GH5048: if header is specified explicitly, an int column should be
557        # parsed as int while its header is parsed as str
558        result = self.read_html(
559            """
560            <table>
561                <tr>
562                    <td>S</td>
563                    <td>I</td>
564                </tr>
565                <tr>
566                    <td>text</td>
567                    <td>1944</td>
568                </tr>
569            </table>
570        """,
571            header=0,
572        )[0]
573
574        expected = DataFrame([["text", 1944]], columns=("S", "I"))
575
576        tm.assert_frame_equal(result, expected)
577
578    @pytest.mark.slow
579    def test_banklist_header(self, datapath):
580        from pandas.io.html import _remove_whitespace
581
582        def try_remove_ws(x):
583            try:
584                return _remove_whitespace(x)
585            except AttributeError:
586                return x
587
588        df = self.read_html(self.banklist_data, match="Metcalf", attrs={"id": "table"})[
589            0
590        ]
591        ground_truth = read_csv(
592            datapath("io", "data", "csv", "banklist.csv"),
593            converters={"Updated Date": Timestamp, "Closing Date": Timestamp},
594        )
595        assert df.shape == ground_truth.shape
596        old = [
597            "First Vietnamese American BankIn Vietnamese",
598            "Westernbank Puerto RicoEn Espanol",
599            "R-G Premier Bank of Puerto RicoEn Espanol",
600            "EurobankEn Espanol",
601            "Sanderson State BankEn Espanol",
602            "Washington Mutual Bank(Including its subsidiary Washington "
603            "Mutual Bank FSB)",
604            "Silver State BankEn Espanol",
605            "AmTrade International BankEn Espanol",
606            "Hamilton Bank, NAEn Espanol",
607            "The Citizens Savings BankPioneer Community Bank, Inc.",
608        ]
609        new = [
610            "First Vietnamese American Bank",
611            "Westernbank Puerto Rico",
612            "R-G Premier Bank of Puerto Rico",
613            "Eurobank",
614            "Sanderson State Bank",
615            "Washington Mutual Bank",
616            "Silver State Bank",
617            "AmTrade International Bank",
618            "Hamilton Bank, NA",
619            "The Citizens Savings Bank",
620        ]
621        dfnew = df.applymap(try_remove_ws).replace(old, new)
622        gtnew = ground_truth.applymap(try_remove_ws)
623        converted = dfnew._convert(datetime=True, numeric=True)
624        date_cols = ["Closing Date", "Updated Date"]
625        converted[date_cols] = converted[date_cols].apply(to_datetime)
626        tm.assert_frame_equal(converted, gtnew)
627
628    @pytest.mark.slow
629    def test_gold_canyon(self):
630        gc = "Gold Canyon"
631        with open(self.banklist_data) as f:
632            raw_text = f.read()
633
634        assert gc in raw_text
635        df = self.read_html(
636            self.banklist_data, match="Gold Canyon", attrs={"id": "table"}
637        )[0]
638        assert gc in df.to_string()
639
640    def test_different_number_of_cols(self):
641        expected = self.read_html(
642            """<table>
643                        <thead>
644                            <tr style="text-align: right;">
645                            <th></th>
646                            <th>C_l0_g0</th>
647                            <th>C_l0_g1</th>
648                            <th>C_l0_g2</th>
649                            <th>C_l0_g3</th>
650                            <th>C_l0_g4</th>
651                            </tr>
652                        </thead>
653                        <tbody>
654                            <tr>
655                            <th>R_l0_g0</th>
656                            <td> 0.763</td>
657                            <td> 0.233</td>
658                            <td> nan</td>
659                            <td> nan</td>
660                            <td> nan</td>
661                            </tr>
662                            <tr>
663                            <th>R_l0_g1</th>
664                            <td> 0.244</td>
665                            <td> 0.285</td>
666                            <td> 0.392</td>
667                            <td> 0.137</td>
668                            <td> 0.222</td>
669                            </tr>
670                        </tbody>
671                    </table>""",
672            index_col=0,
673        )[0]
674
675        result = self.read_html(
676            """<table>
677                    <thead>
678                        <tr style="text-align: right;">
679                        <th></th>
680                        <th>C_l0_g0</th>
681                        <th>C_l0_g1</th>
682                        <th>C_l0_g2</th>
683                        <th>C_l0_g3</th>
684                        <th>C_l0_g4</th>
685                        </tr>
686                    </thead>
687                    <tbody>
688                        <tr>
689                        <th>R_l0_g0</th>
690                        <td> 0.763</td>
691                        <td> 0.233</td>
692                        </tr>
693                        <tr>
694                        <th>R_l0_g1</th>
695                        <td> 0.244</td>
696                        <td> 0.285</td>
697                        <td> 0.392</td>
698                        <td> 0.137</td>
699                        <td> 0.222</td>
700                        </tr>
701                    </tbody>
702                 </table>""",
703            index_col=0,
704        )[0]
705
706        tm.assert_frame_equal(result, expected)
707
708    def test_colspan_rowspan_1(self):
709        # GH17054
710        result = self.read_html(
711            """
712            <table>
713                <tr>
714                    <th>A</th>
715                    <th colspan="1">B</th>
716                    <th rowspan="1">C</th>
717                </tr>
718                <tr>
719                    <td>a</td>
720                    <td>b</td>
721                    <td>c</td>
722                </tr>
723            </table>
724        """
725        )[0]
726
727        expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"])
728
729        tm.assert_frame_equal(result, expected)
730
731    def test_colspan_rowspan_copy_values(self):
732        # GH17054
733
734        # In ASCII, with lowercase letters being copies:
735        #
736        # X x Y Z W
737        # A B b z C
738
739        result = self.read_html(
740            """
741            <table>
742                <tr>
743                    <td colspan="2">X</td>
744                    <td>Y</td>
745                    <td rowspan="2">Z</td>
746                    <td>W</td>
747                </tr>
748                <tr>
749                    <td>A</td>
750                    <td colspan="2">B</td>
751                    <td>C</td>
752                </tr>
753            </table>
754        """,
755            header=0,
756        )[0]
757
758        expected = DataFrame(
759            data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"]
760        )
761
762        tm.assert_frame_equal(result, expected)
763
764    def test_colspan_rowspan_both_not_1(self):
765        # GH17054
766
767        # In ASCII, with lowercase letters being copies:
768        #
769        # A B b b C
770        # a b b b D
771
772        result = self.read_html(
773            """
774            <table>
775                <tr>
776                    <td rowspan="2">A</td>
777                    <td rowspan="2" colspan="3">B</td>
778                    <td>C</td>
779                </tr>
780                <tr>
781                    <td>D</td>
782                </tr>
783            </table>
784        """,
785            header=0,
786        )[0]
787
788        expected = DataFrame(
789            data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"]
790        )
791
792        tm.assert_frame_equal(result, expected)
793
794    def test_rowspan_at_end_of_row(self):
795        # GH17054
796
797        # In ASCII, with lowercase letters being copies:
798        #
799        # A B
800        # C b
801
802        result = self.read_html(
803            """
804            <table>
805                <tr>
806                    <td>A</td>
807                    <td rowspan="2">B</td>
808                </tr>
809                <tr>
810                    <td>C</td>
811                </tr>
812            </table>
813        """,
814            header=0,
815        )[0]
816
817        expected = DataFrame(data=[["C", "B"]], columns=["A", "B"])
818
819        tm.assert_frame_equal(result, expected)
820
821    def test_rowspan_only_rows(self):
822        # GH17054
823
824        result = self.read_html(
825            """
826            <table>
827                <tr>
828                    <td rowspan="3">A</td>
829                    <td rowspan="3">B</td>
830                </tr>
831            </table>
832        """,
833            header=0,
834        )[0]
835
836        expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"])
837
838        tm.assert_frame_equal(result, expected)
839
840    def test_header_inferred_from_rows_with_only_th(self):
841        # GH17054
842        result = self.read_html(
843            """
844            <table>
845                <tr>
846                    <th>A</th>
847                    <th>B</th>
848                </tr>
849                <tr>
850                    <th>a</th>
851                    <th>b</th>
852                </tr>
853                <tr>
854                    <td>1</td>
855                    <td>2</td>
856                </tr>
857            </table>
858        """
859        )[0]
860
861        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
862        expected = DataFrame(data=[[1, 2]], columns=columns)
863
864        tm.assert_frame_equal(result, expected)
865
866    def test_parse_dates_list(self):
867        df = DataFrame({"date": date_range("1/1/2001", periods=10)})
868        expected = df.to_html()
869        res = self.read_html(expected, parse_dates=[1], index_col=0)
870        tm.assert_frame_equal(df, res[0])
871        res = self.read_html(expected, parse_dates=["date"], index_col=0)
872        tm.assert_frame_equal(df, res[0])
873
874    def test_parse_dates_combine(self):
875        raw_dates = Series(date_range("1/1/2001", periods=10))
876        df = DataFrame(
877            {
878                "date": raw_dates.map(lambda x: str(x.date())),
879                "time": raw_dates.map(lambda x: str(x.time())),
880            }
881        )
882        res = self.read_html(
883            df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1
884        )
885        newdf = DataFrame({"datetime": raw_dates})
886        tm.assert_frame_equal(newdf, res[0])
887
888    def test_wikipedia_states_table(self, datapath):
889        data = datapath("io", "data", "html", "wikipedia_states.html")
890        assert os.path.isfile(data), f"{repr(data)} is not a file"
891        assert os.path.getsize(data), f"{repr(data)} is an empty file"
892        result = self.read_html(data, match="Arizona", header=1)[0]
893        assert result.shape == (60, 12)
894        assert "Unnamed" in result.columns[-1]
895        assert result["sq mi"].dtype == np.dtype("float64")
896        assert np.allclose(result.loc[0, "sq mi"], 665384.04)
897
898    def test_wikipedia_states_multiindex(self, datapath):
899        data = datapath("io", "data", "html", "wikipedia_states.html")
900        result = self.read_html(data, match="Arizona", index_col=0)[0]
901        assert result.shape == (60, 11)
902        assert "Unnamed" in result.columns[-1][1]
903        assert result.columns.nlevels == 2
904        assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04)
905
906    def test_parser_error_on_empty_header_row(self):
907        msg = (
908            r"Passed header=\[0,1\] are too many "
909            r"rows for this multi_index of columns"
910        )
911        with pytest.raises(ParserError, match=msg):
912            self.read_html(
913                """
914                <table>
915                    <thead>
916                        <tr><th></th><th></tr>
917                        <tr><th>A</th><th>B</th></tr>
918                    </thead>
919                    <tbody>
920                        <tr><td>a</td><td>b</td></tr>
921                    </tbody>
922                </table>
923            """,
924                header=[0, 1],
925            )
926
927    def test_decimal_rows(self):
928        # GH 12907
929        result = self.read_html(
930            """<html>
931            <body>
932             <table>
933                <thead>
934                    <tr>
935                        <th>Header</th>
936                    </tr>
937                </thead>
938                <tbody>
939                    <tr>
940                        <td>1100#101</td>
941                    </tr>
942                </tbody>
943            </table>
944            </body>
945        </html>""",
946            decimal="#",
947        )[0]
948
949        expected = DataFrame(data={"Header": 1100.101}, index=[0])
950
951        assert result["Header"].dtype == np.dtype("float64")
952        tm.assert_frame_equal(result, expected)
953
954    def test_bool_header_arg(self):
955        # GH 6114
956        for arg in [True, False]:
957            with pytest.raises(TypeError):
958                self.read_html(self.spam_data, header=arg)
959
960    def test_converters(self):
961        # GH 13461
962        result = self.read_html(
963            """<table>
964                 <thead>
965                   <tr>
966                     <th>a</th>
967                    </tr>
968                 </thead>
969                 <tbody>
970                   <tr>
971                     <td> 0.763</td>
972                   </tr>
973                   <tr>
974                     <td> 0.244</td>
975                   </tr>
976                 </tbody>
977               </table>""",
978            converters={"a": str},
979        )[0]
980
981        expected = DataFrame({"a": ["0.763", "0.244"]})
982
983        tm.assert_frame_equal(result, expected)
984
985    def test_na_values(self):
986        # GH 13461
987        result = self.read_html(
988            """<table>
989                 <thead>
990                   <tr>
991                     <th>a</th>
992                   </tr>
993                 </thead>
994                 <tbody>
995                   <tr>
996                     <td> 0.763</td>
997                   </tr>
998                   <tr>
999                     <td> 0.244</td>
1000                   </tr>
1001                 </tbody>
1002               </table>""",
1003            na_values=[0.244],
1004        )[0]
1005
1006        expected = DataFrame({"a": [0.763, np.nan]})
1007
1008        tm.assert_frame_equal(result, expected)
1009
1010    def test_keep_default_na(self):
1011        html_data = """<table>
1012                        <thead>
1013                            <tr>
1014                            <th>a</th>
1015                            </tr>
1016                        </thead>
1017                        <tbody>
1018                            <tr>
1019                            <td> N/A</td>
1020                            </tr>
1021                            <tr>
1022                            <td> NA</td>
1023                            </tr>
1024                        </tbody>
1025                    </table>"""
1026
1027        expected_df = DataFrame({"a": ["N/A", "NA"]})
1028        html_df = self.read_html(html_data, keep_default_na=False)[0]
1029        tm.assert_frame_equal(expected_df, html_df)
1030
1031        expected_df = DataFrame({"a": [np.nan, np.nan]})
1032        html_df = self.read_html(html_data, keep_default_na=True)[0]
1033        tm.assert_frame_equal(expected_df, html_df)
1034
1035    def test_preserve_empty_rows(self):
1036        result = self.read_html(
1037            """
1038            <table>
1039                <tr>
1040                    <th>A</th>
1041                    <th>B</th>
1042                </tr>
1043                <tr>
1044                    <td>a</td>
1045                    <td>b</td>
1046                </tr>
1047                <tr>
1048                    <td></td>
1049                    <td></td>
1050                </tr>
1051            </table>
1052        """
1053        )[0]
1054
1055        expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"])
1056
1057        tm.assert_frame_equal(result, expected)
1058
1059    def test_ignore_empty_rows_when_inferring_header(self):
1060        result = self.read_html(
1061            """
1062            <table>
1063                <thead>
1064                    <tr><th></th><th></tr>
1065                    <tr><th>A</th><th>B</th></tr>
1066                    <tr><th>a</th><th>b</th></tr>
1067                </thead>
1068                <tbody>
1069                    <tr><td>1</td><td>2</td></tr>
1070                </tbody>
1071            </table>
1072        """
1073        )[0]
1074
1075        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
1076        expected = DataFrame(data=[[1, 2]], columns=columns)
1077
1078        tm.assert_frame_equal(result, expected)
1079
1080    def test_multiple_header_rows(self):
1081        # Issue #13434
1082        expected_df = DataFrame(
1083            data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]
1084        )
1085        expected_df.columns = [
1086            ["Unnamed: 0_level_0", "Age", "Party"],
1087            ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"],
1088        ]
1089        html = expected_df.to_html(index=False)
1090        html_df = self.read_html(html)[0]
1091        tm.assert_frame_equal(expected_df, html_df)
1092
1093    def test_works_on_valid_markup(self, datapath):
1094        filename = datapath("io", "data", "html", "valid_markup.html")
1095        dfs = self.read_html(filename, index_col=0)
1096        assert isinstance(dfs, list)
1097        assert isinstance(dfs[0], DataFrame)
1098
1099    @pytest.mark.slow
1100    def test_fallback_success(self, datapath):
1101        banklist_data = datapath("io", "data", "html", "banklist.html")
1102        self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])
1103
1104    def test_to_html_timestamp(self):
1105        rng = date_range("2000-01-01", periods=10)
1106        df = DataFrame(np.random.randn(10, 4), index=rng)
1107
1108        result = df.to_html()
1109        assert "2000-01-01" in result
1110
1111    @pytest.mark.parametrize(
1112        "displayed_only,exp0,exp1",
1113        [
1114            (True, DataFrame(["foo"]), None),
1115            (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"])),
1116        ],
1117    )
1118    def test_displayed_only(self, displayed_only, exp0, exp1):
1119        # GH 20027
1120        data = StringIO(
1121            """<html>
1122          <body>
1123            <table>
1124              <tr>
1125                <td>
1126                  foo
1127                  <span style="display:none;text-align:center">bar</span>
1128                  <span style="display:none">baz</span>
1129                  <span style="display: none">qux</span>
1130                </td>
1131              </tr>
1132            </table>
1133            <table style="display: none">
1134              <tr>
1135                <td>foo</td>
1136              </tr>
1137            </table>
1138          </body>
1139        </html>"""
1140        )
1141
1142        dfs = self.read_html(data, displayed_only=displayed_only)
1143        tm.assert_frame_equal(dfs[0], exp0)
1144
1145        if exp1 is not None:
1146            tm.assert_frame_equal(dfs[1], exp1)
1147        else:
1148            assert len(dfs) == 1  # Should not parse hidden table
1149
1150    def test_encode(self, html_encoding_file):
1151        base_path = os.path.basename(html_encoding_file)
1152        root = os.path.splitext(base_path)[0]
1153        _, encoding = root.split("_")
1154
1155        try:
1156            with open(html_encoding_file, "rb") as fobj:
1157                from_string = self.read_html(
1158                    fobj.read(), encoding=encoding, index_col=0
1159                ).pop()
1160
1161            with open(html_encoding_file, "rb") as fobj:
1162                from_file_like = self.read_html(
1163                    BytesIO(fobj.read()), encoding=encoding, index_col=0
1164                ).pop()
1165
1166            from_filename = self.read_html(
1167                html_encoding_file, encoding=encoding, index_col=0
1168            ).pop()
1169            tm.assert_frame_equal(from_string, from_file_like)
1170            tm.assert_frame_equal(from_string, from_filename)
1171        except Exception:
1172            # seems utf-16/32 fail on windows
1173            if is_platform_windows():
1174                if "16" in encoding or "32" in encoding:
1175                    pytest.skip()
1176            raise
1177
1178    def test_parse_failure_unseekable(self):
1179        # Issue #17975
1180
1181        if self.read_html.keywords.get("flavor") == "lxml":
1182            pytest.skip("Not applicable for lxml")
1183
1184        class UnseekableStringIO(StringIO):
1185            def seekable(self):
1186                return False
1187
1188        bad = UnseekableStringIO(
1189            """
1190            <table><tr><td>spam<foobr />eggs</td></tr></table>"""
1191        )
1192
1193        assert self.read_html(bad)
1194
1195        with pytest.raises(ValueError, match="passed a non-rewindable file object"):
1196            self.read_html(bad)
1197
1198    def test_parse_failure_rewinds(self):
1199        # Issue #17975
1200
1201        class MockFile:
1202            def __init__(self, data):
1203                self.data = data
1204                self.at_end = False
1205
1206            def read(self, size=None):
1207                data = "" if self.at_end else self.data
1208                self.at_end = True
1209                return data
1210
1211            def seek(self, offset):
1212                self.at_end = False
1213
1214            def seekable(self):
1215                return True
1216
1217        good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
1218        bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")
1219
1220        assert self.read_html(good)
1221        assert self.read_html(bad)
1222
1223    @pytest.mark.slow
1224    def test_importcheck_thread_safety(self, datapath):
1225        # see gh-16928
1226
1227        class ErrorThread(threading.Thread):
1228            def run(self):
1229                try:
1230                    super().run()
1231                except Exception as err:
1232                    self.err = err
1233                else:
1234                    self.err = None
1235
1236        # force import check by reinitalising global vars in html.py
1237        reload(pandas.io.html)
1238
1239        filename = datapath("io", "data", "html", "valid_markup.html")
1240        helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
1241        helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))
1242
1243        helper_thread1.start()
1244        helper_thread2.start()
1245
1246        while helper_thread1.is_alive() or helper_thread2.is_alive():
1247            pass
1248        assert None is helper_thread1.err is helper_thread2.err
1249
1250    def test_parse_path_object(self, datapath):
1251        # GH 37705
1252        file_path_string = datapath("io", "data", "html", "spam.html")
1253        file_path = Path(file_path_string)
1254        df1 = self.read_html(file_path_string)[0]
1255        df2 = self.read_html(file_path)[0]
1256        tm.assert_frame_equal(df1, df2)
1257