1from datetime import datetime, time
2from functools import partial
3import os
4from pathlib import Path
5from urllib.error import URLError
6from zipfile import BadZipFile
7
8import numpy as np
9import pytest
10
11import pandas.util._test_decorators as td
12
13import pandas as pd
14from pandas import DataFrame, Index, MultiIndex, Series
15import pandas._testing as tm
16from pandas.tests.io.excel import xlrd_version
17
18read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
19engine_params = [
20    # Add any engines to test here
21    # When defusedxml is installed it triggers deprecation warnings for
22    # xlrd and openpyxl, so catch those here
23    pytest.param(
24        "xlrd",
25        marks=[
26            td.skip_if_no("xlrd"),
27        ],
28    ),
29    pytest.param(
30        "openpyxl",
31        marks=[
32            td.skip_if_no("openpyxl"),
33            pytest.mark.filterwarnings("ignore:.*html argument"),
34        ],
35    ),
36    pytest.param(
37        None,
38        marks=[
39            td.skip_if_no("xlrd"),
40        ],
41    ),
42    pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
43    pytest.param("odf", marks=td.skip_if_no("odf")),
44]
45
46
47def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
48    """
49    Filter out invalid (engine, ext) pairs instead of skipping, as that
50    produces 500+ pytest.skips.
51    """
52    engine = engine.values[0]
53    if engine == "openpyxl" and read_ext == ".xls":
54        return False
55    if engine == "odf" and read_ext != ".ods":
56        return False
57    if read_ext == ".ods" and engine != "odf":
58        return False
59    if engine == "pyxlsb" and read_ext != ".xlsb":
60        return False
61    if read_ext == ".xlsb" and engine != "pyxlsb":
62        return False
63    if (
64        engine == "xlrd"
65        and xlrd_version is not None
66        and xlrd_version >= "2"
67        and read_ext != ".xls"
68    ):
69        return False
70    return True
71
72
73def _transfer_marks(engine, read_ext):
74    """
75    engine gives us a pytest.param objec with some marks, read_ext is just
76    a string.  We need to generate a new pytest.param inheriting the marks.
77    """
78    values = engine.values + (read_ext,)
79    new_param = pytest.param(values, marks=engine.marks)
80    return new_param
81
82
83@pytest.fixture(
84    autouse=True,
85    params=[
86        _transfer_marks(eng, ext)
87        for eng in engine_params
88        for ext in read_ext_params
89        if _is_valid_engine_ext_pair(eng, ext)
90    ],
91)
92def engine_and_read_ext(request):
93    """
94    Fixture for Excel reader engine and read_ext, only including valid pairs.
95    """
96    return request.param
97
98
99@pytest.fixture
100def engine(engine_and_read_ext):
101    engine, read_ext = engine_and_read_ext
102    return engine
103
104
105@pytest.fixture
106def read_ext(engine_and_read_ext):
107    engine, read_ext = engine_and_read_ext
108    return read_ext
109
110
111class TestReaders:
112    @pytest.fixture(autouse=True)
113    def cd_and_set_engine(self, engine, datapath, monkeypatch):
114        """
115        Change directory and set engine for read_excel calls.
116        """
117        func = partial(pd.read_excel, engine=engine)
118        monkeypatch.chdir(datapath("io", "data", "excel"))
119        monkeypatch.setattr(pd, "read_excel", func)
120
121    def test_usecols_int(self, read_ext, df_ref):
122        df_ref = df_ref.reindex(columns=["A", "B", "C"])
123
124        # usecols as int
125        msg = "Passing an integer for `usecols`"
126        with pytest.raises(ValueError, match=msg):
127            pd.read_excel(
128                "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
129            )
130
131        # usecols as int
132        with pytest.raises(ValueError, match=msg):
133            pd.read_excel(
134                "test1" + read_ext,
135                sheet_name="Sheet2",
136                skiprows=[1],
137                index_col=0,
138                usecols=3,
139            )
140
141    def test_usecols_list(self, read_ext, df_ref):
142        if pd.read_excel.keywords["engine"] == "pyxlsb":
143            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
144
145        df_ref = df_ref.reindex(columns=["B", "C"])
146        df1 = pd.read_excel(
147            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
148        )
149        df2 = pd.read_excel(
150            "test1" + read_ext,
151            sheet_name="Sheet2",
152            skiprows=[1],
153            index_col=0,
154            usecols=[0, 2, 3],
155        )
156
157        # TODO add index to xls file)
158        tm.assert_frame_equal(df1, df_ref, check_names=False)
159        tm.assert_frame_equal(df2, df_ref, check_names=False)
160
161    def test_usecols_str(self, read_ext, df_ref):
162        if pd.read_excel.keywords["engine"] == "pyxlsb":
163            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
164
165        df1 = df_ref.reindex(columns=["A", "B", "C"])
166        df2 = pd.read_excel(
167            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
168        )
169        df3 = pd.read_excel(
170            "test1" + read_ext,
171            sheet_name="Sheet2",
172            skiprows=[1],
173            index_col=0,
174            usecols="A:D",
175        )
176
177        # TODO add index to xls, read xls ignores index name ?
178        tm.assert_frame_equal(df2, df1, check_names=False)
179        tm.assert_frame_equal(df3, df1, check_names=False)
180
181        df1 = df_ref.reindex(columns=["B", "C"])
182        df2 = pd.read_excel(
183            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
184        )
185        df3 = pd.read_excel(
186            "test1" + read_ext,
187            sheet_name="Sheet2",
188            skiprows=[1],
189            index_col=0,
190            usecols="A,C,D",
191        )
192        # TODO add index to xls file
193        tm.assert_frame_equal(df2, df1, check_names=False)
194        tm.assert_frame_equal(df3, df1, check_names=False)
195
196        df1 = df_ref.reindex(columns=["B", "C"])
197        df2 = pd.read_excel(
198            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
199        )
200        df3 = pd.read_excel(
201            "test1" + read_ext,
202            sheet_name="Sheet2",
203            skiprows=[1],
204            index_col=0,
205            usecols="A,C:D",
206        )
207        tm.assert_frame_equal(df2, df1, check_names=False)
208        tm.assert_frame_equal(df3, df1, check_names=False)
209
210    @pytest.mark.parametrize(
211        "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
212    )
213    def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref):
214        if pd.read_excel.keywords["engine"] == "pyxlsb":
215            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
216
217        expected = df_ref[["A", "C"]]
218        result = pd.read_excel(
219            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
220        )
221        tm.assert_frame_equal(result, expected, check_names=False)
222
223    @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
224    def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
225        expected = df_ref[["B", "D"]]
226        expected.index = range(len(expected))
227
228        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
229        tm.assert_frame_equal(result, expected, check_names=False)
230
231    def test_read_excel_without_slicing(self, read_ext, df_ref):
232        if pd.read_excel.keywords["engine"] == "pyxlsb":
233            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
234
235        expected = df_ref
236        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
237        tm.assert_frame_equal(result, expected, check_names=False)
238
239    def test_usecols_excel_range_str(self, read_ext, df_ref):
240        if pd.read_excel.keywords["engine"] == "pyxlsb":
241            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
242
243        expected = df_ref[["C", "D"]]
244        result = pd.read_excel(
245            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
246        )
247        tm.assert_frame_equal(result, expected, check_names=False)
248
249    def test_usecols_excel_range_str_invalid(self, read_ext):
250        msg = "Invalid column name: E1"
251
252        with pytest.raises(ValueError, match=msg):
253            pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1")
254
255    def test_index_col_label_error(self, read_ext):
256        msg = "list indices must be integers.*, not str"
257
258        with pytest.raises(TypeError, match=msg):
259            pd.read_excel(
260                "test1" + read_ext,
261                sheet_name="Sheet1",
262                index_col=["A"],
263                usecols=["A", "C"],
264            )
265
266    def test_index_col_empty(self, read_ext):
267        # see gh-9208
268        result = pd.read_excel(
269            "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"]
270        )
271        expected = DataFrame(
272            columns=["D", "E", "F"],
273            index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]),
274        )
275        tm.assert_frame_equal(result, expected)
276
277    @pytest.mark.parametrize("index_col", [None, 2])
278    def test_index_col_with_unnamed(self, read_ext, index_col):
279        # see gh-18792
280        result = pd.read_excel(
281            "test1" + read_ext, sheet_name="Sheet4", index_col=index_col
282        )
283        expected = DataFrame(
284            [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
285        )
286        if index_col:
287            expected = expected.set_index(expected.columns[index_col])
288
289        tm.assert_frame_equal(result, expected)
290
291    def test_usecols_pass_non_existent_column(self, read_ext):
292        msg = (
293            "Usecols do not match columns, "
294            "columns expected but not found: " + r"\['E'\]"
295        )
296
297        with pytest.raises(ValueError, match=msg):
298            pd.read_excel("test1" + read_ext, usecols=["E"])
299
300    def test_usecols_wrong_type(self, read_ext):
301        msg = (
302            "'usecols' must either be list-like of "
303            "all strings, all unicode, all integers or a callable."
304        )
305
306        with pytest.raises(ValueError, match=msg):
307            pd.read_excel("test1" + read_ext, usecols=["E1", 0])
308
309    def test_excel_stop_iterator(self, read_ext):
310
311        parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1")
312        expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
313        tm.assert_frame_equal(parsed, expected)
314
315    def test_excel_cell_error_na(self, read_ext):
316        if pd.read_excel.keywords["engine"] == "pyxlsb":
317            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
318
319        parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1")
320        expected = DataFrame([[np.nan]], columns=["Test"])
321        tm.assert_frame_equal(parsed, expected)
322
323    def test_excel_table(self, read_ext, df_ref):
324        if pd.read_excel.keywords["engine"] == "pyxlsb":
325            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
326
327        df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
328        df2 = pd.read_excel(
329            "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
330        )
331        # TODO add index to file
332        tm.assert_frame_equal(df1, df_ref, check_names=False)
333        tm.assert_frame_equal(df2, df_ref, check_names=False)
334
335        df3 = pd.read_excel(
336            "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
337        )
338        tm.assert_frame_equal(df3, df1.iloc[:-1])
339
340    def test_reader_special_dtypes(self, read_ext):
341        if pd.read_excel.keywords["engine"] == "pyxlsb":
342            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
343
344        expected = DataFrame.from_dict(
345            {
346                "IntCol": [1, 2, -3, 4, 0],
347                "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
348                "BoolCol": [True, False, True, True, False],
349                "StrCol": [1, 2, 3, 4, 5],
350                # GH5394 - this is why convert_float isn't vectorized
351                "Str2Col": ["a", 3, "c", "d", "e"],
352                "DateCol": [
353                    datetime(2013, 10, 30),
354                    datetime(2013, 10, 31),
355                    datetime(1905, 1, 1),
356                    datetime(2013, 12, 14),
357                    datetime(2015, 3, 14),
358                ],
359            },
360        )
361        basename = "test_types"
362
363        # should read in correctly and infer types
364        actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
365        tm.assert_frame_equal(actual, expected)
366
367        # if not coercing number, then int comes in as float
368        float_expected = expected.copy()
369        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
370        float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
371        actual = pd.read_excel(
372            basename + read_ext, sheet_name="Sheet1", convert_float=False
373        )
374        tm.assert_frame_equal(actual, float_expected)
375
376        # check setting Index (assuming xls and xlsx are the same here)
377        for icol, name in enumerate(expected.columns):
378            actual = pd.read_excel(
379                basename + read_ext, sheet_name="Sheet1", index_col=icol
380            )
381            exp = expected.set_index(name)
382            tm.assert_frame_equal(actual, exp)
383
384        # convert_float and converters should be different but both accepted
385        expected["StrCol"] = expected["StrCol"].apply(str)
386        actual = pd.read_excel(
387            basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
388        )
389        tm.assert_frame_equal(actual, expected)
390
391        no_convert_float = float_expected.copy()
392        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
393        actual = pd.read_excel(
394            basename + read_ext,
395            sheet_name="Sheet1",
396            convert_float=False,
397            converters={"StrCol": str},
398        )
399        tm.assert_frame_equal(actual, no_convert_float)
400
401    # GH8212 - support for converters and missing values
402    def test_reader_converters(self, read_ext):
403
404        basename = "test_converters"
405
406        expected = DataFrame.from_dict(
407            {
408                "IntCol": [1, 2, -3, -1000, 0],
409                "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005],
410                "BoolCol": ["Found", "Found", "Found", "Not found", "Found"],
411                "StrCol": ["1", np.nan, "3", "4", "5"],
412            }
413        )
414
415        converters = {
416            "IntCol": lambda x: int(x) if x != "" else -1000,
417            "FloatCol": lambda x: 10 * x if x else np.nan,
418            2: lambda x: "Found" if x != "" else "Not found",
419            3: lambda x: str(x) if x else "",
420        }
421
422        # should read in correctly and set types of single cells (not array
423        # dtypes)
424        actual = pd.read_excel(
425            basename + read_ext, sheet_name="Sheet1", converters=converters
426        )
427        tm.assert_frame_equal(actual, expected)
428
429    def test_reader_dtype(self, read_ext):
430        # GH 8212
431        basename = "testdtype"
432        actual = pd.read_excel(basename + read_ext)
433
434        expected = DataFrame(
435            {
436                "a": [1, 2, 3, 4],
437                "b": [2.5, 3.5, 4.5, 5.5],
438                "c": [1, 2, 3, 4],
439                "d": [1.0, 2.0, np.nan, 4.0],
440            }
441        ).reindex(columns=["a", "b", "c", "d"])
442
443        tm.assert_frame_equal(actual, expected)
444
445        actual = pd.read_excel(
446            basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str}
447        )
448
449        expected["a"] = expected["a"].astype("float64")
450        expected["b"] = expected["b"].astype("float32")
451        expected["c"] = ["001", "002", "003", "004"]
452        tm.assert_frame_equal(actual, expected)
453
454        msg = "Unable to convert column d to type int64"
455        with pytest.raises(ValueError, match=msg):
456            pd.read_excel(basename + read_ext, dtype={"d": "int64"})
457
458    @pytest.mark.parametrize(
459        "dtype,expected",
460        [
461            (
462                None,
463                DataFrame(
464                    {
465                        "a": [1, 2, 3, 4],
466                        "b": [2.5, 3.5, 4.5, 5.5],
467                        "c": [1, 2, 3, 4],
468                        "d": [1.0, 2.0, np.nan, 4.0],
469                    }
470                ),
471            ),
472            (
473                {"a": "float64", "b": "float32", "c": str, "d": str},
474                DataFrame(
475                    {
476                        "a": Series([1, 2, 3, 4], dtype="float64"),
477                        "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
478                        "c": ["001", "002", "003", "004"],
479                        "d": ["1", "2", np.nan, "4"],
480                    }
481                ),
482            ),
483        ],
484    )
485    def test_reader_dtype_str(self, read_ext, dtype, expected):
486        # see gh-20377
487        basename = "testdtype"
488
489        actual = pd.read_excel(basename + read_ext, dtype=dtype)
490        tm.assert_frame_equal(actual, expected)
491
492    def test_reader_spaces(self, read_ext):
493        # see gh-32207
494        basename = "test_spaces"
495
496        actual = pd.read_excel(basename + read_ext)
497        expected = DataFrame(
498            {
499                "testcol": [
500                    "this is great",
501                    "4    spaces",
502                    "1 trailing ",
503                    " 1 leading",
504                    "2  spaces  multiple  times",
505                ]
506            }
507        )
508        tm.assert_frame_equal(actual, expected)
509
510    # gh-36122, gh-35802
511    @pytest.mark.parametrize(
512        "basename,expected",
513        [
514            ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
515            ("gh-36122", DataFrame(columns=["got 2nd sa"])),
516        ],
517    )
518    def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
519        # see gh-35802
520        engine = pd.read_excel.keywords["engine"]
521        if engine != "odf":
522            pytest.skip(f"Skipped for engine: {engine}")
523
524        actual = pd.read_excel(basename + read_ext)
525        tm.assert_frame_equal(actual, expected)
526
527    def test_reading_all_sheets(self, read_ext):
528        # Test reading all sheet names by setting sheet_name to None,
529        # Ensure a dict is returned.
530        # See PR #9450
531        basename = "test_multisheet"
532        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
533        # ensure this is not alphabetical to test order preservation
534        expected_keys = ["Charlie", "Alpha", "Beta"]
535        tm.assert_contains_all(expected_keys, dfs.keys())
536        # Issue 9930
537        # Ensure sheet order is preserved
538        assert expected_keys == list(dfs.keys())
539
540    def test_reading_multiple_specific_sheets(self, read_ext):
541        # Test reading specific sheet names by specifying a mixed list
542        # of integers and strings, and confirm that duplicated sheet
543        # references (positions/names) are removed properly.
544        # Ensure a dict is returned
545        # See PR #9450
546        basename = "test_multisheet"
547        # Explicitly request duplicates. Only the set should be returned.
548        expected_keys = [2, "Charlie", "Charlie"]
549        dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys)
550        expected_keys = list(set(expected_keys))
551        tm.assert_contains_all(expected_keys, dfs.keys())
552        assert len(expected_keys) == len(dfs.keys())
553
554    def test_reading_all_sheets_with_blank(self, read_ext):
555        # Test reading all sheet names by setting sheet_name to None,
556        # In the case where some sheets are blank.
557        # Issue #11711
558        basename = "blank_with_header"
559        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
560        expected_keys = ["Sheet1", "Sheet2", "Sheet3"]
561        tm.assert_contains_all(expected_keys, dfs.keys())
562
563    # GH6403
564    def test_read_excel_blank(self, read_ext):
565        actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1")
566        tm.assert_frame_equal(actual, DataFrame())
567
568    def test_read_excel_blank_with_header(self, read_ext):
569        expected = DataFrame(columns=["col_1", "col_2"])
570        actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1")
571        tm.assert_frame_equal(actual, expected)
572
573    def test_date_conversion_overflow(self, read_ext):
574        # GH 10001 : pandas.ExcelFile ignore parse_dates=False
575        if pd.read_excel.keywords["engine"] == "pyxlsb":
576            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
577
578        expected = DataFrame(
579            [
580                [pd.Timestamp("2016-03-12"), "Marc Johnson"],
581                [pd.Timestamp("2016-03-16"), "Jack Black"],
582                [1e20, "Timothy Brown"],
583            ],
584            columns=["DateColWithBigInt", "StringCol"],
585        )
586
587        if pd.read_excel.keywords["engine"] == "openpyxl":
588            pytest.xfail("Maybe not supported by openpyxl")
589
590        if pd.read_excel.keywords["engine"] is None:
591            # GH 35029
592            pytest.xfail("Defaults to openpyxl, maybe not supported")
593
594        result = pd.read_excel("testdateoverflow" + read_ext)
595        tm.assert_frame_equal(result, expected)
596
597    def test_sheet_name(self, read_ext, df_ref):
598        if pd.read_excel.keywords["engine"] == "pyxlsb":
599            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
600        filename = "test1"
601        sheet_name = "Sheet1"
602
603        if pd.read_excel.keywords["engine"] == "openpyxl":
604            pytest.xfail("Maybe not supported by openpyxl")
605
606        df1 = pd.read_excel(
607            filename + read_ext, sheet_name=sheet_name, index_col=0
608        )  # doc
609        df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
610
611        tm.assert_frame_equal(df1, df_ref, check_names=False)
612        tm.assert_frame_equal(df2, df_ref, check_names=False)
613
614    def test_excel_read_buffer(self, read_ext):
615
616        pth = "test1" + read_ext
617        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0)
618        with open(pth, "rb") as f:
619            actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
620            tm.assert_frame_equal(expected, actual)
621
622    def test_bad_engine_raises(self, read_ext):
623        bad_engine = "foo"
624        with pytest.raises(ValueError, match="Unknown engine: foo"):
625            pd.read_excel("", engine=bad_engine)
626
627    @pytest.mark.parametrize(
628        "sheet_name",
629        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
630    )
631    def test_bad_sheetname_raises(self, read_ext, sheet_name):
632        # GH 39250
633        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
634        with pytest.raises(ValueError, match=msg):
635            pd.read_excel("blank" + read_ext, sheet_name=sheet_name)
636
637    def test_missing_file_raises(self, read_ext):
638        bad_file = f"foo{read_ext}"
639        # CI tests with zh_CN.utf8, translates to "No such file or directory"
640        with pytest.raises(
641            FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)"
642        ):
643            pd.read_excel(bad_file)
644
645    def test_corrupt_bytes_raises(self, read_ext, engine):
646        bad_stream = b"foo"
647        if engine is None or engine == "xlrd":
648            error = ValueError
649            msg = "File is not a recognized excel file"
650        else:
651            error = BadZipFile
652            msg = "File is not a zip file"
653        with pytest.raises(error, match=msg):
654            pd.read_excel(bad_stream)
655
656    @tm.network
657    def test_read_from_http_url(self, read_ext):
658        url = (
659            "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
660            "pandas/tests/io/data/excel/test1" + read_ext
661        )
662        url_table = pd.read_excel(url)
663        local_table = pd.read_excel("test1" + read_ext)
664        tm.assert_frame_equal(url_table, local_table)
665
666    @td.skip_if_not_us_locale
667    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
668        # Bucket "pandas-test" created in tests/io/conftest.py
669        with open("test1" + read_ext, "rb") as f:
670            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
671
672        url = "s3://pandas-test/test1" + read_ext
673
674        url_table = pd.read_excel(url, storage_options=s3so)
675        local_table = pd.read_excel("test1" + read_ext)
676        tm.assert_frame_equal(url_table, local_table)
677
678    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
679        # GH 38788
680        # Bucket "pandas-test" created in tests/io/conftest.py
681        with open("test1" + read_ext, "rb") as f:
682            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
683
684        import s3fs
685
686        s3 = s3fs.S3FileSystem(**s3so)
687
688        with s3.open("s3://pandas-test/test1" + read_ext) as f:
689            url_table = pd.read_excel(f)
690
691        local_table = pd.read_excel("test1" + read_ext)
692        tm.assert_frame_equal(url_table, local_table)
693
694    @pytest.mark.slow
695    def test_read_from_file_url(self, read_ext, datapath):
696
697        # FILE
698        localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
699        local_table = pd.read_excel(localtable)
700
701        try:
702            url_table = pd.read_excel("file://localhost/" + localtable)
703        except URLError:
704            # fails on some systems
705            import platform
706
707            platform_info = " ".join(platform.uname()).strip()
708            pytest.skip(f"failing on {platform_info}")
709
710        tm.assert_frame_equal(url_table, local_table)
711
712    def test_read_from_pathlib_path(self, read_ext):
713
714        # GH12655
715        from pathlib import Path
716
717        str_path = "test1" + read_ext
718        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
719
720        path_obj = Path("test1" + read_ext)
721        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
722
723        tm.assert_frame_equal(expected, actual)
724
725    @td.skip_if_no("py.path")
726    @td.check_file_leaks
727    def test_read_from_py_localpath(self, read_ext):
728
729        # GH12655
730        from py.path import local as LocalPath
731
732        str_path = os.path.join("test1" + read_ext)
733        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
734
735        path_obj = LocalPath().join("test1" + read_ext)
736        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
737
738        tm.assert_frame_equal(expected, actual)
739
740    @td.check_file_leaks
741    def test_close_from_py_localpath(self, read_ext):
742
743        # GH31467
744        str_path = os.path.join("test1" + read_ext)
745        with open(str_path, "rb") as f:
746            x = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
747            del x
748            # should not throw an exception because the passed file was closed
749            f.read()
750
751    def test_reader_seconds(self, read_ext):
752        if pd.read_excel.keywords["engine"] == "pyxlsb":
753            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
754
755        # Test reading times with and without milliseconds. GH5945.
756        expected = DataFrame.from_dict(
757            {
758                "Time": [
759                    time(1, 2, 3),
760                    time(2, 45, 56, 100000),
761                    time(4, 29, 49, 200000),
762                    time(6, 13, 42, 300000),
763                    time(7, 57, 35, 400000),
764                    time(9, 41, 28, 500000),
765                    time(11, 25, 21, 600000),
766                    time(13, 9, 14, 700000),
767                    time(14, 53, 7, 800000),
768                    time(16, 37, 0, 900000),
769                    time(18, 20, 54),
770                ]
771            }
772        )
773
774        actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
775        tm.assert_frame_equal(actual, expected)
776
777        actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
778        tm.assert_frame_equal(actual, expected)
779
780    def test_read_excel_multiindex(self, read_ext):
781        # see gh-4679
782        if pd.read_excel.keywords["engine"] == "pyxlsb":
783            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
784
785        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
786        mi_file = "testmultiindex" + read_ext
787
788        # "mi_column" sheet
789        expected = DataFrame(
790            [
791                [1, 2.5, pd.Timestamp("2015-01-01"), True],
792                [2, 3.5, pd.Timestamp("2015-01-02"), False],
793                [3, 4.5, pd.Timestamp("2015-01-03"), False],
794                [4, 5.5, pd.Timestamp("2015-01-04"), True],
795            ],
796            columns=mi,
797        )
798
799        actual = pd.read_excel(
800            mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
801        )
802        tm.assert_frame_equal(actual, expected)
803
804        # "mi_index" sheet
805        expected.index = mi
806        expected.columns = ["a", "b", "c", "d"]
807
808        actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
809        tm.assert_frame_equal(actual, expected, check_names=False)
810
811        # "both" sheet
812        expected.columns = mi
813
814        actual = pd.read_excel(
815            mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
816        )
817        tm.assert_frame_equal(actual, expected, check_names=False)
818
819        # "mi_index_name" sheet
820        expected.columns = ["a", "b", "c", "d"]
821        expected.index = mi.set_names(["ilvl1", "ilvl2"])
822
823        actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
824        tm.assert_frame_equal(actual, expected)
825
826        # "mi_column_name" sheet
827        expected.index = list(range(4))
828        expected.columns = mi.set_names(["c1", "c2"])
829        actual = pd.read_excel(
830            mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
831        )
832        tm.assert_frame_equal(actual, expected)
833
834        # see gh-11317
835        # "name_with_int" sheet
836        expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])
837
838        actual = pd.read_excel(
839            mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
840        )
841        tm.assert_frame_equal(actual, expected)
842
843        # "both_name" sheet
844        expected.columns = mi.set_names(["c1", "c2"])
845        expected.index = mi.set_names(["ilvl1", "ilvl2"])
846
847        actual = pd.read_excel(
848            mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
849        )
850        tm.assert_frame_equal(actual, expected)
851
852        # "both_skiprows" sheet
853        actual = pd.read_excel(
854            mi_file,
855            sheet_name="both_name_skiprows",
856            index_col=[0, 1],
857            header=[0, 1],
858            skiprows=2,
859        )
860        tm.assert_frame_equal(actual, expected)
861
862    def test_read_excel_multiindex_header_only(self, read_ext):
863        # see gh-11733.
864        #
865        # Don't try to parse a header name if there isn't one.
866        mi_file = "testmultiindex" + read_ext
867        result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1])
868
869        exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
870        expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
871        tm.assert_frame_equal(result, expected)
872
873    def test_excel_old_index_format(self, read_ext):
874        # see gh-4679
875        filename = "test_index_name_pre17" + read_ext
876
877        # We detect headers to determine if index names exist, so
878        # that "index" name in the "names" version of the data will
879        # now be interpreted as rows that include null data.
880        data = np.array(
881            [
882                [None, None, None, None, None],
883                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
884                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
885                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
886                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
887                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
888            ]
889        )
890        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
891        mi = MultiIndex(
892            levels=[
893                ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
894                ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
895            ],
896            codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
897            names=[None, None],
898        )
899        si = Index(
900            ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
901        )
902
903        expected = DataFrame(data, index=si, columns=columns)
904
905        actual = pd.read_excel(filename, sheet_name="single_names", index_col=0)
906        tm.assert_frame_equal(actual, expected)
907
908        expected.index = mi
909
910        actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1])
911        tm.assert_frame_equal(actual, expected)
912
913        # The analogous versions of the "names" version data
914        # where there are explicitly no names for the indices.
915        data = np.array(
916            [
917                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
918                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
919                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
920                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
921                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
922            ]
923        )
924        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
925        mi = MultiIndex(
926            levels=[
927                ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
928                ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
929            ],
930            codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
931            names=[None, None],
932        )
933        si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)
934
935        expected = DataFrame(data, index=si, columns=columns)
936
937        actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0)
938        tm.assert_frame_equal(actual, expected)
939
940        expected.index = mi
941
942        actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
943        tm.assert_frame_equal(actual, expected, check_names=False)
944
945    def test_read_excel_bool_header_arg(self, read_ext):
946        # GH 6114
947        msg = "Passing a bool to header is invalid"
948        for arg in [True, False]:
949            with pytest.raises(TypeError, match=msg):
950                pd.read_excel("test1" + read_ext, header=arg)
951
952    def test_read_excel_skiprows(self, read_ext):
953        # GH 4903
954        if pd.read_excel.keywords["engine"] == "pyxlsb":
955            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
956
957        actual = pd.read_excel(
958            "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
959        )
960        expected = DataFrame(
961            [
962                [1, 2.5, pd.Timestamp("2015-01-01"), True],
963                [2, 3.5, pd.Timestamp("2015-01-02"), False],
964                [3, 4.5, pd.Timestamp("2015-01-03"), False],
965                [4, 5.5, pd.Timestamp("2015-01-04"), True],
966            ],
967            columns=["a", "b", "c", "d"],
968        )
969        tm.assert_frame_equal(actual, expected)
970
971        actual = pd.read_excel(
972            "testskiprows" + read_ext,
973            sheet_name="skiprows_list",
974            skiprows=np.array([0, 2]),
975        )
976        tm.assert_frame_equal(actual, expected)
977
978        # GH36435
979        actual = pd.read_excel(
980            "testskiprows" + read_ext,
981            sheet_name="skiprows_list",
982            skiprows=lambda x: x in [0, 2],
983        )
984        tm.assert_frame_equal(actual, expected)
985
986        actual = pd.read_excel(
987            "testskiprows" + read_ext,
988            sheet_name="skiprows_list",
989            skiprows=3,
990            names=["a", "b", "c", "d"],
991        )
992        expected = DataFrame(
993            [
994                # [1, 2.5, pd.Timestamp("2015-01-01"), True],
995                [2, 3.5, pd.Timestamp("2015-01-02"), False],
996                [3, 4.5, pd.Timestamp("2015-01-03"), False],
997                [4, 5.5, pd.Timestamp("2015-01-04"), True],
998            ],
999            columns=["a", "b", "c", "d"],
1000        )
1001        tm.assert_frame_equal(actual, expected)
1002
1003    def test_read_excel_nrows(self, read_ext):
1004        # GH 16645
1005        num_rows_to_pull = 5
1006        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
1007        expected = pd.read_excel("test1" + read_ext)
1008        expected = expected[:num_rows_to_pull]
1009        tm.assert_frame_equal(actual, expected)
1010
1011    def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
1012        # GH 16645
1013        expected = pd.read_excel("test1" + read_ext)
1014        num_records_in_file = len(expected)
1015        num_rows_to_pull = num_records_in_file + 10
1016        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
1017        tm.assert_frame_equal(actual, expected)
1018
1019    def test_read_excel_nrows_non_integer_parameter(self, read_ext):
1020        # GH 16645
1021        msg = "'nrows' must be an integer >=0"
1022        with pytest.raises(ValueError, match=msg):
1023            pd.read_excel("test1" + read_ext, nrows="5")
1024
1025    def test_read_excel_squeeze(self, read_ext):
1026        # GH 12157
1027        f = "test_squeeze" + read_ext
1028
1029        actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
1030        expected = Series([2, 3, 4], [4, 5, 6], name="b")
1031        expected.index.name = "a"
1032        tm.assert_series_equal(actual, expected)
1033
1034        actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
1035        expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
1036        tm.assert_frame_equal(actual, expected)
1037
1038        actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
1039        expected = Series([1, 2, 3], name="a")
1040        tm.assert_series_equal(actual, expected)
1041
1042    def test_deprecated_kwargs(self, read_ext):
1043        with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
1044            pd.read_excel("test1" + read_ext, "Sheet1", 0)
1045
1046        pd.read_excel("test1" + read_ext)
1047
1048    def test_no_header_with_list_index_col(self, read_ext):
1049        # GH 31783
1050        file_name = "testmultiindex" + read_ext
1051        data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
1052        idx = MultiIndex.from_tuples(
1053            [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
1054        )
1055        expected = DataFrame(data, index=idx, columns=(2, 3))
1056        result = pd.read_excel(
1057            file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
1058        )
1059        tm.assert_frame_equal(expected, result)
1060
1061
1062class TestExcelFileRead:
1063    @pytest.fixture(autouse=True)
1064    def cd_and_set_engine(self, engine, datapath, monkeypatch):
1065        """
1066        Change directory and set engine for ExcelFile objects.
1067        """
1068        func = partial(pd.ExcelFile, engine=engine)
1069        monkeypatch.chdir(datapath("io", "data", "excel"))
1070        monkeypatch.setattr(pd, "ExcelFile", func)
1071
1072    def test_excel_passes_na(self, read_ext):
1073        with pd.ExcelFile("test4" + read_ext) as excel:
1074            parsed = pd.read_excel(
1075                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
1076            )
1077        expected = DataFrame(
1078            [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
1079        )
1080        tm.assert_frame_equal(parsed, expected)
1081
1082        with pd.ExcelFile("test4" + read_ext) as excel:
1083            parsed = pd.read_excel(
1084                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
1085            )
1086        expected = DataFrame(
1087            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
1088        )
1089        tm.assert_frame_equal(parsed, expected)
1090
1091        # 13967
1092        with pd.ExcelFile("test5" + read_ext) as excel:
1093            parsed = pd.read_excel(
1094                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
1095            )
1096        expected = DataFrame(
1097            [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
1098        )
1099        tm.assert_frame_equal(parsed, expected)
1100
1101        with pd.ExcelFile("test5" + read_ext) as excel:
1102            parsed = pd.read_excel(
1103                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
1104            )
1105        expected = DataFrame(
1106            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
1107        )
1108        tm.assert_frame_equal(parsed, expected)
1109
1110    @pytest.mark.parametrize("na_filter", [None, True, False])
1111    def test_excel_passes_na_filter(self, read_ext, na_filter):
1112        # gh-25453
1113        kwargs = {}
1114
1115        if na_filter is not None:
1116            kwargs["na_filter"] = na_filter
1117
1118        with pd.ExcelFile("test5" + read_ext) as excel:
1119            parsed = pd.read_excel(
1120                excel,
1121                sheet_name="Sheet1",
1122                keep_default_na=True,
1123                na_values=["apple"],
1124                **kwargs,
1125            )
1126
1127        if na_filter is False:
1128            expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
1129        else:
1130            expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]
1131
1132        expected = DataFrame(expected, columns=["Test"])
1133        tm.assert_frame_equal(parsed, expected)
1134
1135    def test_excel_table_sheet_by_index(self, read_ext, df_ref):
1136        # For some reason pd.read_excel has no attribute 'keywords' here.
1137        # Skipping based on read_ext instead.
1138        if read_ext == ".xlsb":
1139            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
1140
1141        with pd.ExcelFile("test1" + read_ext) as excel:
1142            df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
1143            df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
1144        tm.assert_frame_equal(df1, df_ref, check_names=False)
1145        tm.assert_frame_equal(df2, df_ref, check_names=False)
1146
1147        with pd.ExcelFile("test1" + read_ext) as excel:
1148            df1 = excel.parse(0, index_col=0)
1149            df2 = excel.parse(1, skiprows=[1], index_col=0)
1150        tm.assert_frame_equal(df1, df_ref, check_names=False)
1151        tm.assert_frame_equal(df2, df_ref, check_names=False)
1152
1153        with pd.ExcelFile("test1" + read_ext) as excel:
1154            df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
1155        tm.assert_frame_equal(df3, df1.iloc[:-1])
1156
1157        with pd.ExcelFile("test1" + read_ext) as excel:
1158            df3 = excel.parse(0, index_col=0, skipfooter=1)
1159
1160        tm.assert_frame_equal(df3, df1.iloc[:-1])
1161
1162    def test_sheet_name(self, read_ext, df_ref):
1163        # For some reason pd.read_excel has no attribute 'keywords' here.
1164        # Skipping based on read_ext instead.
1165        if read_ext == ".xlsb":
1166            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
1167
1168        filename = "test1"
1169        sheet_name = "Sheet1"
1170
1171        with pd.ExcelFile(filename + read_ext) as excel:
1172            df1_parse = excel.parse(sheet_name=sheet_name, index_col=0)  # doc
1173
1174        with pd.ExcelFile(filename + read_ext) as excel:
1175            df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)
1176
1177        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
1178        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
1179
1180    @pytest.mark.parametrize(
1181        "sheet_name",
1182        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
1183    )
1184    def test_bad_sheetname_raises(self, read_ext, sheet_name):
1185        # GH 39250
1186        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
1187        with pytest.raises(ValueError, match=msg):
1188            with pd.ExcelFile("blank" + read_ext) as excel:
1189                excel.parse(sheet_name=sheet_name)
1190
1191    def test_excel_read_buffer(self, engine, read_ext):
1192        pth = "test1" + read_ext
1193        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine)
1194
1195        with open(pth, "rb") as f:
1196            with pd.ExcelFile(f) as xls:
1197                actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0)
1198
1199        tm.assert_frame_equal(expected, actual)
1200
1201    def test_reader_closes_file(self, engine, read_ext):
1202        with open("test1" + read_ext, "rb") as f:
1203            with pd.ExcelFile(f) as xlsx:
1204                # parses okay
1205                pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine)
1206
1207        assert f.closed
1208
1209    def test_conflicting_excel_engines(self, read_ext):
1210        # GH 26566
1211        msg = "Engine should not be specified when passing an ExcelFile"
1212
1213        with pd.ExcelFile("test1" + read_ext) as xl:
1214            with pytest.raises(ValueError, match=msg):
1215                pd.read_excel(xl, engine="foo")
1216
1217    def test_excel_read_binary(self, engine, read_ext):
1218        # GH 15914
1219        expected = pd.read_excel("test1" + read_ext, engine=engine)
1220
1221        with open("test1" + read_ext, "rb") as f:
1222            data = f.read()
1223
1224        actual = pd.read_excel(data, engine=engine)
1225        tm.assert_frame_equal(expected, actual)
1226
1227    def test_excel_read_binary_via_read_excel(self, read_ext, engine):
1228        # GH 38424
1229        if read_ext == ".xlsb" and engine == "pyxlsb":
1230            pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
1231        with open("test1" + read_ext, "rb") as f:
1232            result = pd.read_excel(f)
1233        expected = pd.read_excel("test1" + read_ext, engine=engine)
1234        tm.assert_frame_equal(result, expected)
1235
1236    @pytest.mark.skipif(
1237        xlrd_version is not None and xlrd_version >= "2",
1238        reason="xlrd no longer supports xlsx",
1239    )
1240    def test_excel_high_surrogate(self, engine):
1241        # GH 23809
1242        expected = DataFrame(["\udc88"], columns=["Column1"])
1243
1244        # should not produce a segmentation violation
1245        actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
1246        tm.assert_frame_equal(expected, actual)
1247
1248    @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
1249    def test_header_with_index_col(self, engine, filename):
1250        # GH 33476
1251        idx = Index(["Z"], name="I2")
1252        cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
1253        expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
1254        result = pd.read_excel(
1255            filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
1256        )
1257        tm.assert_frame_equal(expected, result)
1258
1259    def test_read_datetime_multiindex(self, engine, read_ext):
1260        # GH 34748
1261        if engine == "pyxlsb":
1262            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
1263
1264        f = "test_datetime_mi" + read_ext
1265        with pd.ExcelFile(f) as excel:
1266            actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
1267        expected_column_index = MultiIndex.from_tuples(
1268            [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
1269            names=[
1270                pd.to_datetime("02/29/2020").to_pydatetime(),
1271                pd.to_datetime("03/01/2020").to_pydatetime(),
1272            ],
1273        )
1274        expected = DataFrame([], columns=expected_column_index)
1275
1276        tm.assert_frame_equal(expected, actual)
1277
1278    def test_corrupt_files_closed(self, request, engine, read_ext):
1279        # GH41778
1280        errors = (BadZipFile, ValueError)
1281        if engine is None:
1282            pytest.skip()
1283        elif engine == "xlrd":
1284            import xlrd
1285
1286            errors = (BadZipFile, ValueError, xlrd.biffh.XLRDError)
1287
1288        with tm.ensure_clean(f"corrupt{read_ext}") as file:
1289            Path(file).write_text("corrupt")
1290            with tm.assert_produces_warning(False):
1291                try:
1292                    pd.ExcelFile(file, engine=engine)
1293                except errors:
1294                    pass
1295