1from datetime import datetime, time 2from functools import partial 3import os 4from pathlib import Path 5from urllib.error import URLError 6from zipfile import BadZipFile 7 8import numpy as np 9import pytest 10 11import pandas.util._test_decorators as td 12 13import pandas as pd 14from pandas import DataFrame, Index, MultiIndex, Series 15import pandas._testing as tm 16from pandas.tests.io.excel import xlrd_version 17 18read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] 19engine_params = [ 20 # Add any engines to test here 21 # When defusedxml is installed it triggers deprecation warnings for 22 # xlrd and openpyxl, so catch those here 23 pytest.param( 24 "xlrd", 25 marks=[ 26 td.skip_if_no("xlrd"), 27 ], 28 ), 29 pytest.param( 30 "openpyxl", 31 marks=[ 32 td.skip_if_no("openpyxl"), 33 pytest.mark.filterwarnings("ignore:.*html argument"), 34 ], 35 ), 36 pytest.param( 37 None, 38 marks=[ 39 td.skip_if_no("xlrd"), 40 ], 41 ), 42 pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), 43 pytest.param("odf", marks=td.skip_if_no("odf")), 44] 45 46 47def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: 48 """ 49 Filter out invalid (engine, ext) pairs instead of skipping, as that 50 produces 500+ pytest.skips. 51 """ 52 engine = engine.values[0] 53 if engine == "openpyxl" and read_ext == ".xls": 54 return False 55 if engine == "odf" and read_ext != ".ods": 56 return False 57 if read_ext == ".ods" and engine != "odf": 58 return False 59 if engine == "pyxlsb" and read_ext != ".xlsb": 60 return False 61 if read_ext == ".xlsb" and engine != "pyxlsb": 62 return False 63 if ( 64 engine == "xlrd" 65 and xlrd_version is not None 66 and xlrd_version >= "2" 67 and read_ext != ".xls" 68 ): 69 return False 70 return True 71 72 73def _transfer_marks(engine, read_ext): 74 """ 75 engine gives us a pytest.param objec with some marks, read_ext is just 76 a string. We need to generate a new pytest.param inheriting the marks. 77 """ 78 values = engine.values + (read_ext,) 79 new_param = pytest.param(values, marks=engine.marks) 80 return new_param 81 82 83@pytest.fixture( 84 autouse=True, 85 params=[ 86 _transfer_marks(eng, ext) 87 for eng in engine_params 88 for ext in read_ext_params 89 if _is_valid_engine_ext_pair(eng, ext) 90 ], 91) 92def engine_and_read_ext(request): 93 """ 94 Fixture for Excel reader engine and read_ext, only including valid pairs. 95 """ 96 return request.param 97 98 99@pytest.fixture 100def engine(engine_and_read_ext): 101 engine, read_ext = engine_and_read_ext 102 return engine 103 104 105@pytest.fixture 106def read_ext(engine_and_read_ext): 107 engine, read_ext = engine_and_read_ext 108 return read_ext 109 110 111class TestReaders: 112 @pytest.fixture(autouse=True) 113 def cd_and_set_engine(self, engine, datapath, monkeypatch): 114 """ 115 Change directory and set engine for read_excel calls. 116 """ 117 func = partial(pd.read_excel, engine=engine) 118 monkeypatch.chdir(datapath("io", "data", "excel")) 119 monkeypatch.setattr(pd, "read_excel", func) 120 121 def test_usecols_int(self, read_ext, df_ref): 122 df_ref = df_ref.reindex(columns=["A", "B", "C"]) 123 124 # usecols as int 125 msg = "Passing an integer for `usecols`" 126 with pytest.raises(ValueError, match=msg): 127 pd.read_excel( 128 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3 129 ) 130 131 # usecols as int 132 with pytest.raises(ValueError, match=msg): 133 pd.read_excel( 134 "test1" + read_ext, 135 sheet_name="Sheet2", 136 skiprows=[1], 137 index_col=0, 138 usecols=3, 139 ) 140 141 def test_usecols_list(self, read_ext, df_ref): 142 if pd.read_excel.keywords["engine"] == "pyxlsb": 143 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 144 145 df_ref = df_ref.reindex(columns=["B", "C"]) 146 df1 = pd.read_excel( 147 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3] 148 ) 149 df2 = pd.read_excel( 150 "test1" + read_ext, 151 sheet_name="Sheet2", 152 skiprows=[1], 153 index_col=0, 154 usecols=[0, 2, 3], 155 ) 156 157 # TODO add index to xls file) 158 tm.assert_frame_equal(df1, df_ref, check_names=False) 159 tm.assert_frame_equal(df2, df_ref, check_names=False) 160 161 def test_usecols_str(self, read_ext, df_ref): 162 if pd.read_excel.keywords["engine"] == "pyxlsb": 163 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 164 165 df1 = df_ref.reindex(columns=["A", "B", "C"]) 166 df2 = pd.read_excel( 167 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D" 168 ) 169 df3 = pd.read_excel( 170 "test1" + read_ext, 171 sheet_name="Sheet2", 172 skiprows=[1], 173 index_col=0, 174 usecols="A:D", 175 ) 176 177 # TODO add index to xls, read xls ignores index name ? 178 tm.assert_frame_equal(df2, df1, check_names=False) 179 tm.assert_frame_equal(df3, df1, check_names=False) 180 181 df1 = df_ref.reindex(columns=["B", "C"]) 182 df2 = pd.read_excel( 183 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D" 184 ) 185 df3 = pd.read_excel( 186 "test1" + read_ext, 187 sheet_name="Sheet2", 188 skiprows=[1], 189 index_col=0, 190 usecols="A,C,D", 191 ) 192 # TODO add index to xls file 193 tm.assert_frame_equal(df2, df1, check_names=False) 194 tm.assert_frame_equal(df3, df1, check_names=False) 195 196 df1 = df_ref.reindex(columns=["B", "C"]) 197 df2 = pd.read_excel( 198 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D" 199 ) 200 df3 = pd.read_excel( 201 "test1" + read_ext, 202 sheet_name="Sheet2", 203 skiprows=[1], 204 index_col=0, 205 usecols="A,C:D", 206 ) 207 tm.assert_frame_equal(df2, df1, check_names=False) 208 tm.assert_frame_equal(df3, df1, check_names=False) 209 210 @pytest.mark.parametrize( 211 "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] 212 ) 213 def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): 214 if pd.read_excel.keywords["engine"] == "pyxlsb": 215 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 216 217 expected = df_ref[["A", "C"]] 218 result = pd.read_excel( 219 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols 220 ) 221 tm.assert_frame_equal(result, expected, check_names=False) 222 223 @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) 224 def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): 225 expected = df_ref[["B", "D"]] 226 expected.index = range(len(expected)) 227 228 result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) 229 tm.assert_frame_equal(result, expected, check_names=False) 230 231 def test_read_excel_without_slicing(self, read_ext, df_ref): 232 if pd.read_excel.keywords["engine"] == "pyxlsb": 233 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 234 235 expected = df_ref 236 result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) 237 tm.assert_frame_equal(result, expected, check_names=False) 238 239 def test_usecols_excel_range_str(self, read_ext, df_ref): 240 if pd.read_excel.keywords["engine"] == "pyxlsb": 241 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 242 243 expected = df_ref[["C", "D"]] 244 result = pd.read_excel( 245 "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E" 246 ) 247 tm.assert_frame_equal(result, expected, check_names=False) 248 249 def test_usecols_excel_range_str_invalid(self, read_ext): 250 msg = "Invalid column name: E1" 251 252 with pytest.raises(ValueError, match=msg): 253 pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1") 254 255 def test_index_col_label_error(self, read_ext): 256 msg = "list indices must be integers.*, not str" 257 258 with pytest.raises(TypeError, match=msg): 259 pd.read_excel( 260 "test1" + read_ext, 261 sheet_name="Sheet1", 262 index_col=["A"], 263 usecols=["A", "C"], 264 ) 265 266 def test_index_col_empty(self, read_ext): 267 # see gh-9208 268 result = pd.read_excel( 269 "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"] 270 ) 271 expected = DataFrame( 272 columns=["D", "E", "F"], 273 index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]), 274 ) 275 tm.assert_frame_equal(result, expected) 276 277 @pytest.mark.parametrize("index_col", [None, 2]) 278 def test_index_col_with_unnamed(self, read_ext, index_col): 279 # see gh-18792 280 result = pd.read_excel( 281 "test1" + read_ext, sheet_name="Sheet4", index_col=index_col 282 ) 283 expected = DataFrame( 284 [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"] 285 ) 286 if index_col: 287 expected = expected.set_index(expected.columns[index_col]) 288 289 tm.assert_frame_equal(result, expected) 290 291 def test_usecols_pass_non_existent_column(self, read_ext): 292 msg = ( 293 "Usecols do not match columns, " 294 "columns expected but not found: " + r"\['E'\]" 295 ) 296 297 with pytest.raises(ValueError, match=msg): 298 pd.read_excel("test1" + read_ext, usecols=["E"]) 299 300 def test_usecols_wrong_type(self, read_ext): 301 msg = ( 302 "'usecols' must either be list-like of " 303 "all strings, all unicode, all integers or a callable." 304 ) 305 306 with pytest.raises(ValueError, match=msg): 307 pd.read_excel("test1" + read_ext, usecols=["E1", 0]) 308 309 def test_excel_stop_iterator(self, read_ext): 310 311 parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1") 312 expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) 313 tm.assert_frame_equal(parsed, expected) 314 315 def test_excel_cell_error_na(self, read_ext): 316 if pd.read_excel.keywords["engine"] == "pyxlsb": 317 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 318 319 parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") 320 expected = DataFrame([[np.nan]], columns=["Test"]) 321 tm.assert_frame_equal(parsed, expected) 322 323 def test_excel_table(self, read_ext, df_ref): 324 if pd.read_excel.keywords["engine"] == "pyxlsb": 325 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 326 327 df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) 328 df2 = pd.read_excel( 329 "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0 330 ) 331 # TODO add index to file 332 tm.assert_frame_equal(df1, df_ref, check_names=False) 333 tm.assert_frame_equal(df2, df_ref, check_names=False) 334 335 df3 = pd.read_excel( 336 "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1 337 ) 338 tm.assert_frame_equal(df3, df1.iloc[:-1]) 339 340 def test_reader_special_dtypes(self, read_ext): 341 if pd.read_excel.keywords["engine"] == "pyxlsb": 342 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 343 344 expected = DataFrame.from_dict( 345 { 346 "IntCol": [1, 2, -3, 4, 0], 347 "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005], 348 "BoolCol": [True, False, True, True, False], 349 "StrCol": [1, 2, 3, 4, 5], 350 # GH5394 - this is why convert_float isn't vectorized 351 "Str2Col": ["a", 3, "c", "d", "e"], 352 "DateCol": [ 353 datetime(2013, 10, 30), 354 datetime(2013, 10, 31), 355 datetime(1905, 1, 1), 356 datetime(2013, 12, 14), 357 datetime(2015, 3, 14), 358 ], 359 }, 360 ) 361 basename = "test_types" 362 363 # should read in correctly and infer types 364 actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1") 365 tm.assert_frame_equal(actual, expected) 366 367 # if not coercing number, then int comes in as float 368 float_expected = expected.copy() 369 float_expected["IntCol"] = float_expected["IntCol"].astype(float) 370 float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 371 actual = pd.read_excel( 372 basename + read_ext, sheet_name="Sheet1", convert_float=False 373 ) 374 tm.assert_frame_equal(actual, float_expected) 375 376 # check setting Index (assuming xls and xlsx are the same here) 377 for icol, name in enumerate(expected.columns): 378 actual = pd.read_excel( 379 basename + read_ext, sheet_name="Sheet1", index_col=icol 380 ) 381 exp = expected.set_index(name) 382 tm.assert_frame_equal(actual, exp) 383 384 # convert_float and converters should be different but both accepted 385 expected["StrCol"] = expected["StrCol"].apply(str) 386 actual = pd.read_excel( 387 basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str} 388 ) 389 tm.assert_frame_equal(actual, expected) 390 391 no_convert_float = float_expected.copy() 392 no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) 393 actual = pd.read_excel( 394 basename + read_ext, 395 sheet_name="Sheet1", 396 convert_float=False, 397 converters={"StrCol": str}, 398 ) 399 tm.assert_frame_equal(actual, no_convert_float) 400 401 # GH8212 - support for converters and missing values 402 def test_reader_converters(self, read_ext): 403 404 basename = "test_converters" 405 406 expected = DataFrame.from_dict( 407 { 408 "IntCol": [1, 2, -3, -1000, 0], 409 "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005], 410 "BoolCol": ["Found", "Found", "Found", "Not found", "Found"], 411 "StrCol": ["1", np.nan, "3", "4", "5"], 412 } 413 ) 414 415 converters = { 416 "IntCol": lambda x: int(x) if x != "" else -1000, 417 "FloatCol": lambda x: 10 * x if x else np.nan, 418 2: lambda x: "Found" if x != "" else "Not found", 419 3: lambda x: str(x) if x else "", 420 } 421 422 # should read in correctly and set types of single cells (not array 423 # dtypes) 424 actual = pd.read_excel( 425 basename + read_ext, sheet_name="Sheet1", converters=converters 426 ) 427 tm.assert_frame_equal(actual, expected) 428 429 def test_reader_dtype(self, read_ext): 430 # GH 8212 431 basename = "testdtype" 432 actual = pd.read_excel(basename + read_ext) 433 434 expected = DataFrame( 435 { 436 "a": [1, 2, 3, 4], 437 "b": [2.5, 3.5, 4.5, 5.5], 438 "c": [1, 2, 3, 4], 439 "d": [1.0, 2.0, np.nan, 4.0], 440 } 441 ).reindex(columns=["a", "b", "c", "d"]) 442 443 tm.assert_frame_equal(actual, expected) 444 445 actual = pd.read_excel( 446 basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str} 447 ) 448 449 expected["a"] = expected["a"].astype("float64") 450 expected["b"] = expected["b"].astype("float32") 451 expected["c"] = ["001", "002", "003", "004"] 452 tm.assert_frame_equal(actual, expected) 453 454 msg = "Unable to convert column d to type int64" 455 with pytest.raises(ValueError, match=msg): 456 pd.read_excel(basename + read_ext, dtype={"d": "int64"}) 457 458 @pytest.mark.parametrize( 459 "dtype,expected", 460 [ 461 ( 462 None, 463 DataFrame( 464 { 465 "a": [1, 2, 3, 4], 466 "b": [2.5, 3.5, 4.5, 5.5], 467 "c": [1, 2, 3, 4], 468 "d": [1.0, 2.0, np.nan, 4.0], 469 } 470 ), 471 ), 472 ( 473 {"a": "float64", "b": "float32", "c": str, "d": str}, 474 DataFrame( 475 { 476 "a": Series([1, 2, 3, 4], dtype="float64"), 477 "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), 478 "c": ["001", "002", "003", "004"], 479 "d": ["1", "2", np.nan, "4"], 480 } 481 ), 482 ), 483 ], 484 ) 485 def test_reader_dtype_str(self, read_ext, dtype, expected): 486 # see gh-20377 487 basename = "testdtype" 488 489 actual = pd.read_excel(basename + read_ext, dtype=dtype) 490 tm.assert_frame_equal(actual, expected) 491 492 def test_reader_spaces(self, read_ext): 493 # see gh-32207 494 basename = "test_spaces" 495 496 actual = pd.read_excel(basename + read_ext) 497 expected = DataFrame( 498 { 499 "testcol": [ 500 "this is great", 501 "4 spaces", 502 "1 trailing ", 503 " 1 leading", 504 "2 spaces multiple times", 505 ] 506 } 507 ) 508 tm.assert_frame_equal(actual, expected) 509 510 # gh-36122, gh-35802 511 @pytest.mark.parametrize( 512 "basename,expected", 513 [ 514 ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})), 515 ("gh-36122", DataFrame(columns=["got 2nd sa"])), 516 ], 517 ) 518 def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): 519 # see gh-35802 520 engine = pd.read_excel.keywords["engine"] 521 if engine != "odf": 522 pytest.skip(f"Skipped for engine: {engine}") 523 524 actual = pd.read_excel(basename + read_ext) 525 tm.assert_frame_equal(actual, expected) 526 527 def test_reading_all_sheets(self, read_ext): 528 # Test reading all sheet names by setting sheet_name to None, 529 # Ensure a dict is returned. 530 # See PR #9450 531 basename = "test_multisheet" 532 dfs = pd.read_excel(basename + read_ext, sheet_name=None) 533 # ensure this is not alphabetical to test order preservation 534 expected_keys = ["Charlie", "Alpha", "Beta"] 535 tm.assert_contains_all(expected_keys, dfs.keys()) 536 # Issue 9930 537 # Ensure sheet order is preserved 538 assert expected_keys == list(dfs.keys()) 539 540 def test_reading_multiple_specific_sheets(self, read_ext): 541 # Test reading specific sheet names by specifying a mixed list 542 # of integers and strings, and confirm that duplicated sheet 543 # references (positions/names) are removed properly. 544 # Ensure a dict is returned 545 # See PR #9450 546 basename = "test_multisheet" 547 # Explicitly request duplicates. Only the set should be returned. 548 expected_keys = [2, "Charlie", "Charlie"] 549 dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) 550 expected_keys = list(set(expected_keys)) 551 tm.assert_contains_all(expected_keys, dfs.keys()) 552 assert len(expected_keys) == len(dfs.keys()) 553 554 def test_reading_all_sheets_with_blank(self, read_ext): 555 # Test reading all sheet names by setting sheet_name to None, 556 # In the case where some sheets are blank. 557 # Issue #11711 558 basename = "blank_with_header" 559 dfs = pd.read_excel(basename + read_ext, sheet_name=None) 560 expected_keys = ["Sheet1", "Sheet2", "Sheet3"] 561 tm.assert_contains_all(expected_keys, dfs.keys()) 562 563 # GH6403 564 def test_read_excel_blank(self, read_ext): 565 actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1") 566 tm.assert_frame_equal(actual, DataFrame()) 567 568 def test_read_excel_blank_with_header(self, read_ext): 569 expected = DataFrame(columns=["col_1", "col_2"]) 570 actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1") 571 tm.assert_frame_equal(actual, expected) 572 573 def test_date_conversion_overflow(self, read_ext): 574 # GH 10001 : pandas.ExcelFile ignore parse_dates=False 575 if pd.read_excel.keywords["engine"] == "pyxlsb": 576 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 577 578 expected = DataFrame( 579 [ 580 [pd.Timestamp("2016-03-12"), "Marc Johnson"], 581 [pd.Timestamp("2016-03-16"), "Jack Black"], 582 [1e20, "Timothy Brown"], 583 ], 584 columns=["DateColWithBigInt", "StringCol"], 585 ) 586 587 if pd.read_excel.keywords["engine"] == "openpyxl": 588 pytest.xfail("Maybe not supported by openpyxl") 589 590 if pd.read_excel.keywords["engine"] is None: 591 # GH 35029 592 pytest.xfail("Defaults to openpyxl, maybe not supported") 593 594 result = pd.read_excel("testdateoverflow" + read_ext) 595 tm.assert_frame_equal(result, expected) 596 597 def test_sheet_name(self, read_ext, df_ref): 598 if pd.read_excel.keywords["engine"] == "pyxlsb": 599 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 600 filename = "test1" 601 sheet_name = "Sheet1" 602 603 if pd.read_excel.keywords["engine"] == "openpyxl": 604 pytest.xfail("Maybe not supported by openpyxl") 605 606 df1 = pd.read_excel( 607 filename + read_ext, sheet_name=sheet_name, index_col=0 608 ) # doc 609 df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) 610 611 tm.assert_frame_equal(df1, df_ref, check_names=False) 612 tm.assert_frame_equal(df2, df_ref, check_names=False) 613 614 def test_excel_read_buffer(self, read_ext): 615 616 pth = "test1" + read_ext 617 expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0) 618 with open(pth, "rb") as f: 619 actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0) 620 tm.assert_frame_equal(expected, actual) 621 622 def test_bad_engine_raises(self, read_ext): 623 bad_engine = "foo" 624 with pytest.raises(ValueError, match="Unknown engine: foo"): 625 pd.read_excel("", engine=bad_engine) 626 627 @pytest.mark.parametrize( 628 "sheet_name", 629 [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], 630 ) 631 def test_bad_sheetname_raises(self, read_ext, sheet_name): 632 # GH 39250 633 msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" 634 with pytest.raises(ValueError, match=msg): 635 pd.read_excel("blank" + read_ext, sheet_name=sheet_name) 636 637 def test_missing_file_raises(self, read_ext): 638 bad_file = f"foo{read_ext}" 639 # CI tests with zh_CN.utf8, translates to "No such file or directory" 640 with pytest.raises( 641 FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)" 642 ): 643 pd.read_excel(bad_file) 644 645 def test_corrupt_bytes_raises(self, read_ext, engine): 646 bad_stream = b"foo" 647 if engine is None or engine == "xlrd": 648 error = ValueError 649 msg = "File is not a recognized excel file" 650 else: 651 error = BadZipFile 652 msg = "File is not a zip file" 653 with pytest.raises(error, match=msg): 654 pd.read_excel(bad_stream) 655 656 @tm.network 657 def test_read_from_http_url(self, read_ext): 658 url = ( 659 "https://raw.githubusercontent.com/pandas-dev/pandas/master/" 660 "pandas/tests/io/data/excel/test1" + read_ext 661 ) 662 url_table = pd.read_excel(url) 663 local_table = pd.read_excel("test1" + read_ext) 664 tm.assert_frame_equal(url_table, local_table) 665 666 @td.skip_if_not_us_locale 667 def test_read_from_s3_url(self, read_ext, s3_resource, s3so): 668 # Bucket "pandas-test" created in tests/io/conftest.py 669 with open("test1" + read_ext, "rb") as f: 670 s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) 671 672 url = "s3://pandas-test/test1" + read_ext 673 674 url_table = pd.read_excel(url, storage_options=s3so) 675 local_table = pd.read_excel("test1" + read_ext) 676 tm.assert_frame_equal(url_table, local_table) 677 678 def test_read_from_s3_object(self, read_ext, s3_resource, s3so): 679 # GH 38788 680 # Bucket "pandas-test" created in tests/io/conftest.py 681 with open("test1" + read_ext, "rb") as f: 682 s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) 683 684 import s3fs 685 686 s3 = s3fs.S3FileSystem(**s3so) 687 688 with s3.open("s3://pandas-test/test1" + read_ext) as f: 689 url_table = pd.read_excel(f) 690 691 local_table = pd.read_excel("test1" + read_ext) 692 tm.assert_frame_equal(url_table, local_table) 693 694 @pytest.mark.slow 695 def test_read_from_file_url(self, read_ext, datapath): 696 697 # FILE 698 localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext) 699 local_table = pd.read_excel(localtable) 700 701 try: 702 url_table = pd.read_excel("file://localhost/" + localtable) 703 except URLError: 704 # fails on some systems 705 import platform 706 707 platform_info = " ".join(platform.uname()).strip() 708 pytest.skip(f"failing on {platform_info}") 709 710 tm.assert_frame_equal(url_table, local_table) 711 712 def test_read_from_pathlib_path(self, read_ext): 713 714 # GH12655 715 from pathlib import Path 716 717 str_path = "test1" + read_ext 718 expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) 719 720 path_obj = Path("test1" + read_ext) 721 actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0) 722 723 tm.assert_frame_equal(expected, actual) 724 725 @td.skip_if_no("py.path") 726 @td.check_file_leaks 727 def test_read_from_py_localpath(self, read_ext): 728 729 # GH12655 730 from py.path import local as LocalPath 731 732 str_path = os.path.join("test1" + read_ext) 733 expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) 734 735 path_obj = LocalPath().join("test1" + read_ext) 736 actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0) 737 738 tm.assert_frame_equal(expected, actual) 739 740 @td.check_file_leaks 741 def test_close_from_py_localpath(self, read_ext): 742 743 # GH31467 744 str_path = os.path.join("test1" + read_ext) 745 with open(str_path, "rb") as f: 746 x = pd.read_excel(f, sheet_name="Sheet1", index_col=0) 747 del x 748 # should not throw an exception because the passed file was closed 749 f.read() 750 751 def test_reader_seconds(self, read_ext): 752 if pd.read_excel.keywords["engine"] == "pyxlsb": 753 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 754 755 # Test reading times with and without milliseconds. GH5945. 756 expected = DataFrame.from_dict( 757 { 758 "Time": [ 759 time(1, 2, 3), 760 time(2, 45, 56, 100000), 761 time(4, 29, 49, 200000), 762 time(6, 13, 42, 300000), 763 time(7, 57, 35, 400000), 764 time(9, 41, 28, 500000), 765 time(11, 25, 21, 600000), 766 time(13, 9, 14, 700000), 767 time(14, 53, 7, 800000), 768 time(16, 37, 0, 900000), 769 time(18, 20, 54), 770 ] 771 } 772 ) 773 774 actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") 775 tm.assert_frame_equal(actual, expected) 776 777 actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") 778 tm.assert_frame_equal(actual, expected) 779 780 def test_read_excel_multiindex(self, read_ext): 781 # see gh-4679 782 if pd.read_excel.keywords["engine"] == "pyxlsb": 783 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 784 785 mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) 786 mi_file = "testmultiindex" + read_ext 787 788 # "mi_column" sheet 789 expected = DataFrame( 790 [ 791 [1, 2.5, pd.Timestamp("2015-01-01"), True], 792 [2, 3.5, pd.Timestamp("2015-01-02"), False], 793 [3, 4.5, pd.Timestamp("2015-01-03"), False], 794 [4, 5.5, pd.Timestamp("2015-01-04"), True], 795 ], 796 columns=mi, 797 ) 798 799 actual = pd.read_excel( 800 mi_file, sheet_name="mi_column", header=[0, 1], index_col=0 801 ) 802 tm.assert_frame_equal(actual, expected) 803 804 # "mi_index" sheet 805 expected.index = mi 806 expected.columns = ["a", "b", "c", "d"] 807 808 actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) 809 tm.assert_frame_equal(actual, expected, check_names=False) 810 811 # "both" sheet 812 expected.columns = mi 813 814 actual = pd.read_excel( 815 mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] 816 ) 817 tm.assert_frame_equal(actual, expected, check_names=False) 818 819 # "mi_index_name" sheet 820 expected.columns = ["a", "b", "c", "d"] 821 expected.index = mi.set_names(["ilvl1", "ilvl2"]) 822 823 actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1]) 824 tm.assert_frame_equal(actual, expected) 825 826 # "mi_column_name" sheet 827 expected.index = list(range(4)) 828 expected.columns = mi.set_names(["c1", "c2"]) 829 actual = pd.read_excel( 830 mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 831 ) 832 tm.assert_frame_equal(actual, expected) 833 834 # see gh-11317 835 # "name_with_int" sheet 836 expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) 837 838 actual = pd.read_excel( 839 mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1] 840 ) 841 tm.assert_frame_equal(actual, expected) 842 843 # "both_name" sheet 844 expected.columns = mi.set_names(["c1", "c2"]) 845 expected.index = mi.set_names(["ilvl1", "ilvl2"]) 846 847 actual = pd.read_excel( 848 mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1] 849 ) 850 tm.assert_frame_equal(actual, expected) 851 852 # "both_skiprows" sheet 853 actual = pd.read_excel( 854 mi_file, 855 sheet_name="both_name_skiprows", 856 index_col=[0, 1], 857 header=[0, 1], 858 skiprows=2, 859 ) 860 tm.assert_frame_equal(actual, expected) 861 862 def test_read_excel_multiindex_header_only(self, read_ext): 863 # see gh-11733. 864 # 865 # Don't try to parse a header name if there isn't one. 866 mi_file = "testmultiindex" + read_ext 867 result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1]) 868 869 exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) 870 expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) 871 tm.assert_frame_equal(result, expected) 872 873 def test_excel_old_index_format(self, read_ext): 874 # see gh-4679 875 filename = "test_index_name_pre17" + read_ext 876 877 # We detect headers to determine if index names exist, so 878 # that "index" name in the "names" version of the data will 879 # now be interpreted as rows that include null data. 880 data = np.array( 881 [ 882 [None, None, None, None, None], 883 ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], 884 ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], 885 ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], 886 ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], 887 ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], 888 ] 889 ) 890 columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] 891 mi = MultiIndex( 892 levels=[ 893 ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], 894 ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], 895 ], 896 codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], 897 names=[None, None], 898 ) 899 si = Index( 900 ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None 901 ) 902 903 expected = DataFrame(data, index=si, columns=columns) 904 905 actual = pd.read_excel(filename, sheet_name="single_names", index_col=0) 906 tm.assert_frame_equal(actual, expected) 907 908 expected.index = mi 909 910 actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1]) 911 tm.assert_frame_equal(actual, expected) 912 913 # The analogous versions of the "names" version data 914 # where there are explicitly no names for the indices. 915 data = np.array( 916 [ 917 ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], 918 ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], 919 ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], 920 ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], 921 ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], 922 ] 923 ) 924 columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] 925 mi = MultiIndex( 926 levels=[ 927 ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], 928 ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], 929 ], 930 codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], 931 names=[None, None], 932 ) 933 si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) 934 935 expected = DataFrame(data, index=si, columns=columns) 936 937 actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0) 938 tm.assert_frame_equal(actual, expected) 939 940 expected.index = mi 941 942 actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1]) 943 tm.assert_frame_equal(actual, expected, check_names=False) 944 945 def test_read_excel_bool_header_arg(self, read_ext): 946 # GH 6114 947 msg = "Passing a bool to header is invalid" 948 for arg in [True, False]: 949 with pytest.raises(TypeError, match=msg): 950 pd.read_excel("test1" + read_ext, header=arg) 951 952 def test_read_excel_skiprows(self, read_ext): 953 # GH 4903 954 if pd.read_excel.keywords["engine"] == "pyxlsb": 955 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 956 957 actual = pd.read_excel( 958 "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] 959 ) 960 expected = DataFrame( 961 [ 962 [1, 2.5, pd.Timestamp("2015-01-01"), True], 963 [2, 3.5, pd.Timestamp("2015-01-02"), False], 964 [3, 4.5, pd.Timestamp("2015-01-03"), False], 965 [4, 5.5, pd.Timestamp("2015-01-04"), True], 966 ], 967 columns=["a", "b", "c", "d"], 968 ) 969 tm.assert_frame_equal(actual, expected) 970 971 actual = pd.read_excel( 972 "testskiprows" + read_ext, 973 sheet_name="skiprows_list", 974 skiprows=np.array([0, 2]), 975 ) 976 tm.assert_frame_equal(actual, expected) 977 978 # GH36435 979 actual = pd.read_excel( 980 "testskiprows" + read_ext, 981 sheet_name="skiprows_list", 982 skiprows=lambda x: x in [0, 2], 983 ) 984 tm.assert_frame_equal(actual, expected) 985 986 actual = pd.read_excel( 987 "testskiprows" + read_ext, 988 sheet_name="skiprows_list", 989 skiprows=3, 990 names=["a", "b", "c", "d"], 991 ) 992 expected = DataFrame( 993 [ 994 # [1, 2.5, pd.Timestamp("2015-01-01"), True], 995 [2, 3.5, pd.Timestamp("2015-01-02"), False], 996 [3, 4.5, pd.Timestamp("2015-01-03"), False], 997 [4, 5.5, pd.Timestamp("2015-01-04"), True], 998 ], 999 columns=["a", "b", "c", "d"], 1000 ) 1001 tm.assert_frame_equal(actual, expected) 1002 1003 def test_read_excel_nrows(self, read_ext): 1004 # GH 16645 1005 num_rows_to_pull = 5 1006 actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) 1007 expected = pd.read_excel("test1" + read_ext) 1008 expected = expected[:num_rows_to_pull] 1009 tm.assert_frame_equal(actual, expected) 1010 1011 def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): 1012 # GH 16645 1013 expected = pd.read_excel("test1" + read_ext) 1014 num_records_in_file = len(expected) 1015 num_rows_to_pull = num_records_in_file + 10 1016 actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) 1017 tm.assert_frame_equal(actual, expected) 1018 1019 def test_read_excel_nrows_non_integer_parameter(self, read_ext): 1020 # GH 16645 1021 msg = "'nrows' must be an integer >=0" 1022 with pytest.raises(ValueError, match=msg): 1023 pd.read_excel("test1" + read_ext, nrows="5") 1024 1025 def test_read_excel_squeeze(self, read_ext): 1026 # GH 12157 1027 f = "test_squeeze" + read_ext 1028 1029 actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True) 1030 expected = Series([2, 3, 4], [4, 5, 6], name="b") 1031 expected.index.name = "a" 1032 tm.assert_series_equal(actual, expected) 1033 1034 actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True) 1035 expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) 1036 tm.assert_frame_equal(actual, expected) 1037 1038 actual = pd.read_excel(f, sheet_name="one_column", squeeze=True) 1039 expected = Series([1, 2, 3], name="a") 1040 tm.assert_series_equal(actual, expected) 1041 1042 def test_deprecated_kwargs(self, read_ext): 1043 with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): 1044 pd.read_excel("test1" + read_ext, "Sheet1", 0) 1045 1046 pd.read_excel("test1" + read_ext) 1047 1048 def test_no_header_with_list_index_col(self, read_ext): 1049 # GH 31783 1050 file_name = "testmultiindex" + read_ext 1051 data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] 1052 idx = MultiIndex.from_tuples( 1053 [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1) 1054 ) 1055 expected = DataFrame(data, index=idx, columns=(2, 3)) 1056 result = pd.read_excel( 1057 file_name, sheet_name="index_col_none", index_col=[0, 1], header=None 1058 ) 1059 tm.assert_frame_equal(expected, result) 1060 1061 1062class TestExcelFileRead: 1063 @pytest.fixture(autouse=True) 1064 def cd_and_set_engine(self, engine, datapath, monkeypatch): 1065 """ 1066 Change directory and set engine for ExcelFile objects. 1067 """ 1068 func = partial(pd.ExcelFile, engine=engine) 1069 monkeypatch.chdir(datapath("io", "data", "excel")) 1070 monkeypatch.setattr(pd, "ExcelFile", func) 1071 1072 def test_excel_passes_na(self, read_ext): 1073 with pd.ExcelFile("test4" + read_ext) as excel: 1074 parsed = pd.read_excel( 1075 excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"] 1076 ) 1077 expected = DataFrame( 1078 [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"] 1079 ) 1080 tm.assert_frame_equal(parsed, expected) 1081 1082 with pd.ExcelFile("test4" + read_ext) as excel: 1083 parsed = pd.read_excel( 1084 excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"] 1085 ) 1086 expected = DataFrame( 1087 [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] 1088 ) 1089 tm.assert_frame_equal(parsed, expected) 1090 1091 # 13967 1092 with pd.ExcelFile("test5" + read_ext) as excel: 1093 parsed = pd.read_excel( 1094 excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"] 1095 ) 1096 expected = DataFrame( 1097 [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"] 1098 ) 1099 tm.assert_frame_equal(parsed, expected) 1100 1101 with pd.ExcelFile("test5" + read_ext) as excel: 1102 parsed = pd.read_excel( 1103 excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"] 1104 ) 1105 expected = DataFrame( 1106 [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] 1107 ) 1108 tm.assert_frame_equal(parsed, expected) 1109 1110 @pytest.mark.parametrize("na_filter", [None, True, False]) 1111 def test_excel_passes_na_filter(self, read_ext, na_filter): 1112 # gh-25453 1113 kwargs = {} 1114 1115 if na_filter is not None: 1116 kwargs["na_filter"] = na_filter 1117 1118 with pd.ExcelFile("test5" + read_ext) as excel: 1119 parsed = pd.read_excel( 1120 excel, 1121 sheet_name="Sheet1", 1122 keep_default_na=True, 1123 na_values=["apple"], 1124 **kwargs, 1125 ) 1126 1127 if na_filter is False: 1128 expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]] 1129 else: 1130 expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]] 1131 1132 expected = DataFrame(expected, columns=["Test"]) 1133 tm.assert_frame_equal(parsed, expected) 1134 1135 def test_excel_table_sheet_by_index(self, read_ext, df_ref): 1136 # For some reason pd.read_excel has no attribute 'keywords' here. 1137 # Skipping based on read_ext instead. 1138 if read_ext == ".xlsb": 1139 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 1140 1141 with pd.ExcelFile("test1" + read_ext) as excel: 1142 df1 = pd.read_excel(excel, sheet_name=0, index_col=0) 1143 df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0) 1144 tm.assert_frame_equal(df1, df_ref, check_names=False) 1145 tm.assert_frame_equal(df2, df_ref, check_names=False) 1146 1147 with pd.ExcelFile("test1" + read_ext) as excel: 1148 df1 = excel.parse(0, index_col=0) 1149 df2 = excel.parse(1, skiprows=[1], index_col=0) 1150 tm.assert_frame_equal(df1, df_ref, check_names=False) 1151 tm.assert_frame_equal(df2, df_ref, check_names=False) 1152 1153 with pd.ExcelFile("test1" + read_ext) as excel: 1154 df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1) 1155 tm.assert_frame_equal(df3, df1.iloc[:-1]) 1156 1157 with pd.ExcelFile("test1" + read_ext) as excel: 1158 df3 = excel.parse(0, index_col=0, skipfooter=1) 1159 1160 tm.assert_frame_equal(df3, df1.iloc[:-1]) 1161 1162 def test_sheet_name(self, read_ext, df_ref): 1163 # For some reason pd.read_excel has no attribute 'keywords' here. 1164 # Skipping based on read_ext instead. 1165 if read_ext == ".xlsb": 1166 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 1167 1168 filename = "test1" 1169 sheet_name = "Sheet1" 1170 1171 with pd.ExcelFile(filename + read_ext) as excel: 1172 df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc 1173 1174 with pd.ExcelFile(filename + read_ext) as excel: 1175 df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) 1176 1177 tm.assert_frame_equal(df1_parse, df_ref, check_names=False) 1178 tm.assert_frame_equal(df2_parse, df_ref, check_names=False) 1179 1180 @pytest.mark.parametrize( 1181 "sheet_name", 1182 [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], 1183 ) 1184 def test_bad_sheetname_raises(self, read_ext, sheet_name): 1185 # GH 39250 1186 msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" 1187 with pytest.raises(ValueError, match=msg): 1188 with pd.ExcelFile("blank" + read_ext) as excel: 1189 excel.parse(sheet_name=sheet_name) 1190 1191 def test_excel_read_buffer(self, engine, read_ext): 1192 pth = "test1" + read_ext 1193 expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine) 1194 1195 with open(pth, "rb") as f: 1196 with pd.ExcelFile(f) as xls: 1197 actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0) 1198 1199 tm.assert_frame_equal(expected, actual) 1200 1201 def test_reader_closes_file(self, engine, read_ext): 1202 with open("test1" + read_ext, "rb") as f: 1203 with pd.ExcelFile(f) as xlsx: 1204 # parses okay 1205 pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine) 1206 1207 assert f.closed 1208 1209 def test_conflicting_excel_engines(self, read_ext): 1210 # GH 26566 1211 msg = "Engine should not be specified when passing an ExcelFile" 1212 1213 with pd.ExcelFile("test1" + read_ext) as xl: 1214 with pytest.raises(ValueError, match=msg): 1215 pd.read_excel(xl, engine="foo") 1216 1217 def test_excel_read_binary(self, engine, read_ext): 1218 # GH 15914 1219 expected = pd.read_excel("test1" + read_ext, engine=engine) 1220 1221 with open("test1" + read_ext, "rb") as f: 1222 data = f.read() 1223 1224 actual = pd.read_excel(data, engine=engine) 1225 tm.assert_frame_equal(expected, actual) 1226 1227 def test_excel_read_binary_via_read_excel(self, read_ext, engine): 1228 # GH 38424 1229 if read_ext == ".xlsb" and engine == "pyxlsb": 1230 pytest.xfail("GH 38667 - should default to pyxlsb but doesn't") 1231 with open("test1" + read_ext, "rb") as f: 1232 result = pd.read_excel(f) 1233 expected = pd.read_excel("test1" + read_ext, engine=engine) 1234 tm.assert_frame_equal(result, expected) 1235 1236 @pytest.mark.skipif( 1237 xlrd_version is not None and xlrd_version >= "2", 1238 reason="xlrd no longer supports xlsx", 1239 ) 1240 def test_excel_high_surrogate(self, engine): 1241 # GH 23809 1242 expected = DataFrame(["\udc88"], columns=["Column1"]) 1243 1244 # should not produce a segmentation violation 1245 actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd") 1246 tm.assert_frame_equal(expected, actual) 1247 1248 @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) 1249 def test_header_with_index_col(self, engine, filename): 1250 # GH 33476 1251 idx = Index(["Z"], name="I2") 1252 cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) 1253 expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64") 1254 result = pd.read_excel( 1255 filename, sheet_name="Sheet1", index_col=0, header=[0, 1] 1256 ) 1257 tm.assert_frame_equal(expected, result) 1258 1259 def test_read_datetime_multiindex(self, engine, read_ext): 1260 # GH 34748 1261 if engine == "pyxlsb": 1262 pytest.xfail("Sheets containing datetimes not supported by pyxlsb") 1263 1264 f = "test_datetime_mi" + read_ext 1265 with pd.ExcelFile(f) as excel: 1266 actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) 1267 expected_column_index = MultiIndex.from_tuples( 1268 [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))], 1269 names=[ 1270 pd.to_datetime("02/29/2020").to_pydatetime(), 1271 pd.to_datetime("03/01/2020").to_pydatetime(), 1272 ], 1273 ) 1274 expected = DataFrame([], columns=expected_column_index) 1275 1276 tm.assert_frame_equal(expected, actual) 1277 1278 def test_corrupt_files_closed(self, request, engine, read_ext): 1279 # GH41778 1280 errors = (BadZipFile, ValueError) 1281 if engine is None: 1282 pytest.skip() 1283 elif engine == "xlrd": 1284 import xlrd 1285 1286 errors = (BadZipFile, ValueError, xlrd.biffh.XLRDError) 1287 1288 with tm.ensure_clean(f"corrupt{read_ext}") as file: 1289 Path(file).write_text("corrupt") 1290 with tm.assert_produces_warning(False): 1291 try: 1292 pd.ExcelFile(file, engine=engine) 1293 except errors: 1294 pass 1295