1import abc 2import datetime 3from distutils.version import LooseVersion 4import inspect 5from io import BufferedIOBase, BytesIO, RawIOBase 6import os 7from textwrap import fill 8from typing import IO, Any, Dict, Mapping, Optional, Union, cast 9import warnings 10import zipfile 11 12from pandas._config import config 13 14from pandas._libs.parsers import STR_NA_VALUES 15from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions 16from pandas.compat._optional import get_version, import_optional_dependency 17from pandas.errors import EmptyDataError 18from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc 19 20from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like 21 22from pandas.core.frame import DataFrame 23from pandas.core.shared_docs import _shared_docs 24 25from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg 26from pandas.io.excel._util import ( 27 fill_mi_header, 28 get_default_writer, 29 get_writer, 30 maybe_convert_usecols, 31 pop_header_name, 32) 33from pandas.io.parsers import TextParser 34 35_read_excel_doc = ( 36 """ 37Read an Excel file into a pandas DataFrame. 38 39Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions 40read from a local filesystem or URL. Supports an option to read 41a single sheet or a list of sheets. 42 43Parameters 44---------- 45io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object 46 Any valid string path is acceptable. The string could be a URL. Valid 47 URL schemes include http, ftp, s3, and file. For file URLs, a host is 48 expected. A local file could be: ``file://localhost/path/to/table.xlsx``. 49 50 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 51 52 By file-like object, we refer to objects with a ``read()`` method, 53 such as a file handle (e.g. via builtin ``open`` function) 54 or ``StringIO``. 
55sheet_name : str, int, list, or None, default 0 56 Strings are used for sheet names. Integers are used in zero-indexed 57 sheet positions. Lists of strings/integers are used to request 58 multiple sheets. Specify None to get all sheets. 59 60 Available cases: 61 62 * Defaults to ``0``: 1st sheet as a `DataFrame` 63 * ``1``: 2nd sheet as a `DataFrame` 64 * ``"Sheet1"``: Load sheet with name "Sheet1" 65 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" 66 as a dict of `DataFrame` 67 * None: All sheets. 68 69header : int, list of int, default 0 70 Row (0-indexed) to use for the column labels of the parsed 71 DataFrame. If a list of integers is passed those row positions will 72 be combined into a ``MultiIndex``. Use None if there is no header. 73names : array-like, default None 74 List of column names to use. If file contains no header row, 75 then you should explicitly pass header=None. 76index_col : int, list of int, default None 77 Column (0-indexed) to use as the row labels of the DataFrame. 78 Pass None if there is no such column. If a list is passed, 79 those columns will be combined into a ``MultiIndex``. If a 80 subset of data is selected with ``usecols``, index_col 81 is based on the subset. 82usecols : int, str, list-like, or callable default None 83 * If None, then parse all columns. 84 * If str, then indicates comma separated list of Excel column letters 85 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of 86 both sides. 87 * If list of int, then indicates list of column numbers to be parsed. 88 * If list of string, then indicates list of column names to be parsed. 89 90 .. versionadded:: 0.24.0 91 92 * If callable, then evaluate each column name against it and parse the 93 column if the callable returns ``True``. 94 95 Returns a subset of the columns according to behavior above. 96 97 .. versionadded:: 0.24.0 98 99squeeze : bool, default False 100 If the parsed data only contains one column then return a Series. 
101dtype : Type name or dict of column -> type, default None 102 Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} 103 Use `object` to preserve data as stored in Excel and not interpret dtype. 104 If converters are specified, they will be applied INSTEAD 105 of dtype conversion. 106engine : str, default None 107 If io is not a buffer or path, this must be set to identify io. 108 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". 109 Engine compatibility : 110 111 - "xlrd" supports old-style Excel files (.xls). 112 - "openpyxl" supports newer Excel file formats. 113 - "odf" supports OpenDocument file formats (.odf, .ods, .odt). 114 - "pyxlsb" supports Binary Excel files. 115 116 .. versionchanged:: 1.2.0 117 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ 118 now only supports old-style ``.xls`` files. 119 When ``engine=None``, the following logic will be 120 used to determine the engine: 121 122 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 123 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 124 - Otherwise if ``path_or_buffer`` is an xls format, 125 ``xlrd`` will be used. 126 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, 127 then ``openpyxl`` will be used. 128 - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. 129 - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This 130 case will raise a ``ValueError`` in a future version of pandas. 131 132converters : dict, default None 133 Dict of functions for converting values in certain columns. Keys can 134 either be integers or column labels, values are functions that take one 135 input argument, the Excel cell content, and return the transformed 136 content. 137true_values : list, default None 138 Values to consider as True. 139false_values : list, default None 140 Values to consider as False. 
141skiprows : list-like, int, or callable, optional 142 Line numbers to skip (0-indexed) or number of lines to skip (int) at the 143 start of the file. If callable, the callable function will be evaluated 144 against the row indices, returning True if the row should be skipped and 145 False otherwise. An example of a valid callable argument would be ``lambda 146 x: x in [0, 2]``. 147nrows : int, default None 148 Number of rows to parse. 149na_values : scalar, str, list-like, or dict, default None 150 Additional strings to recognize as NA/NaN. If dict passed, specific 151 per-column NA values. By default the following values are interpreted 152 as NaN: '""" 153 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 154 + """'. 155keep_default_na : bool, default True 156 Whether or not to include the default NaN values when parsing the data. 157 Depending on whether `na_values` is passed in, the behavior is as follows: 158 159 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 160 is appended to the default NaN values used for parsing. 161 * If `keep_default_na` is True, and `na_values` are not specified, only 162 the default NaN values are used for parsing. 163 * If `keep_default_na` is False, and `na_values` are specified, only 164 the NaN values specified `na_values` are used for parsing. 165 * If `keep_default_na` is False, and `na_values` are not specified, no 166 strings will be parsed as NaN. 167 168 Note that if `na_filter` is passed in as False, the `keep_default_na` and 169 `na_values` parameters will be ignored. 170na_filter : bool, default True 171 Detect missing value markers (empty strings and the value of na_values). In 172 data without any NAs, passing na_filter=False can improve the performance 173 of reading a large file. 174verbose : bool, default False 175 Indicate number of NA values placed in non-numeric columns. 
176parse_dates : bool, list-like, or dict, default False 177 The behavior is as follows: 178 179 * bool. If True -> try parsing the index. 180 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 181 each as a separate date column. 182 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 183 a single date column. 184 * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call 185 result 'foo' 186 187 If a column or index contains an unparseable date, the entire column or 188 index will be returned unaltered as an object data type. If you don`t want to 189 parse some cells as date just change their type in Excel to "Text". 190 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. 191 192 Note: A fast-path exists for iso8601-formatted dates. 193date_parser : function, optional 194 Function to use for converting a sequence of string columns to an array of 195 datetime instances. The default uses ``dateutil.parser.parser`` to do the 196 conversion. Pandas will try to call `date_parser` in three different ways, 197 advancing to the next if an exception occurs: 1) Pass one or more arrays 198 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 199 string values from the columns defined by `parse_dates` into a single array 200 and pass that; and 3) call `date_parser` once for each row using one or 201 more strings (corresponding to the columns defined by `parse_dates`) as 202 arguments. 203thousands : str, default None 204 Thousands separator for parsing string columns to numeric. Note that 205 this parameter is only necessary for columns stored as TEXT in Excel, 206 any numeric columns will automatically be parsed, regardless of display 207 format. 208comment : str, default None 209 Comments out remainder of line. Pass a character or characters to this 210 argument to indicate comments in the input file. 
Any data between the 211 comment string and the end of the current line is ignored. 212skipfooter : int, default 0 213 Rows at the end to skip (0-indexed). 214convert_float : bool, default True 215 Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric 216 data will be read in as floats: Excel stores all numbers as floats 217 internally. 218mangle_dupe_cols : bool, default True 219 Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 220 'X'...'X'. Passing in False will cause data to be overwritten if there 221 are duplicate names in the columns. 222storage_options : dict, optional 223 Extra options that make sense for a particular storage connection, e.g. 224 host, port, username, password, etc., if using a URL that will 225 be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error 226 will be raised if providing this argument with a local path or 227 a file-like buffer. See the fsspec and backend storage implementation 228 docs for the set of allowed keys and values. 229 230 .. versionadded:: 1.2.0 231 232Returns 233------- 234DataFrame or dict of DataFrames 235 DataFrame from the passed in Excel file. See notes in sheet_name 236 argument for more information on when a dict of DataFrames is returned. 237 238See Also 239-------- 240DataFrame.to_excel : Write DataFrame to an Excel file. 241DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 242read_csv : Read a comma-separated values (csv) file into DataFrame. 243read_fwf : Read a table of fixed-width formatted lines into DataFrame. 244 245Examples 246-------- 247The file can be read using the file name as string or an open file object: 248 249>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP 250 Name Value 2510 string1 1 2521 string2 2 2532 #Comment 3 254 255>>> pd.read_excel(open('tmp.xlsx', 'rb'), 256... 
@deprecate_nonkeyword_arguments(allowed_args=2, version="2.0")
@Appender(_read_excel_doc)
def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    storage_options: StorageOptions = None,
):
    # Normalize ``io`` to an ExcelFile, remembering whether we created it
    # here so we know whether we are responsible for closing it.
    if isinstance(io, ExcelFile):
        if engine and engine != io.engine:
            raise ValueError(
                "Engine should not be specified when passing "
                "an ExcelFile - ExcelFile already has the engine set"
            )
        opened_here = False
    else:
        io = ExcelFile(io, storage_options=storage_options, engine=engine)
        opened_here = True

    try:
        result = io.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            squeeze=squeeze,
            dtype=dtype,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            keep_default_na=keep_default_na,
            na_filter=na_filter,
            verbose=verbose,
            parse_dates=parse_dates,
            date_parser=date_parser,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            convert_float=convert_float,
            mangle_dupe_cols=mangle_dupe_cols,
        )
    finally:
        # Only close handles that this call opened itself; a caller-supplied
        # ExcelFile stays open for reuse.
        if opened_here:
            io.close()
    return result
class BaseExcelReader(metaclass=abc.ABCMeta):
    """
    Abstract base class shared by the engine-specific Excel readers
    (xlrd, openpyxl, odf, pyxlsb).

    This class handles opening/closing the underlying file handle and the
    common conversion of raw sheet rows into DataFrames in :meth:`parse`;
    subclasses only implement workbook loading and raw cell extraction.
    """

    def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
        # Wrap the raw input first so close() always has an IOHandles object
        # to release, even when no new handle is opened below.
        self.handles = IOHandles(
            handle=filepath_or_buffer, compression={"method": None}
        )
        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            self.handles = get_handle(
                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
            )

        if isinstance(self.handles.handle, self._workbook_class):
            # Already an engine workbook object; use it as-is.
            self.book = self.handles.handle
        elif hasattr(self.handles.handle, "read"):
            # N.B. xlrd.Book has a read attribute too
            self.handles.handle.seek(0)
            try:
                self.book = self.load_workbook(self.handles.handle)
            except Exception:
                # Don't leak the file handle if the engine rejects the file.
                self.close()
                raise
        elif isinstance(self.handles.handle, bytes):
            self.book = self.load_workbook(BytesIO(self.handles.handle))
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )

    @property
    @abc.abstractmethod
    def _workbook_class(self):
        # Engine workbook type (e.g. xlrd.Book); used for isinstance checks above.
        pass

    @abc.abstractmethod
    def load_workbook(self, filepath_or_buffer):
        # Open ``filepath_or_buffer`` and return an engine workbook object.
        pass

    def close(self):
        # Close the engine workbook first (when supported), then any handles
        # this reader opened in __init__.
        if hasattr(self, "book") and hasattr(self.book, "close"):
            # pyxlsb: opens a TemporaryFile
            # openpyxl: https://stackoverflow.com/questions/31416842/
            # openpyxl-does-not-close-excel-workbook-in-read-only-mode
            self.book.close()
        self.handles.close()

    @property
    @abc.abstractmethod
    def sheet_names(self):
        # List of worksheet names in workbook order.
        pass

    @abc.abstractmethod
    def get_sheet_by_name(self, name):
        pass

    @abc.abstractmethod
    def get_sheet_by_index(self, index):
        pass

    @abc.abstractmethod
    def get_sheet_data(self, sheet, convert_float):
        # Return the sheet contents as a list of rows (list of cell values).
        pass

    def raise_if_bad_sheet_by_index(self, index: int) -> None:
        """Raise ValueError if ``index`` is out of range for this workbook."""
        n_sheets = len(self.sheet_names)
        if index >= n_sheets:
            raise ValueError(
                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
            )

    def raise_if_bad_sheet_by_name(self, name: str) -> None:
        """Raise ValueError if no worksheet called ``name`` exists."""
        if name not in self.sheet_names:
            raise ValueError(f"Worksheet named '{name}' not found")

    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):
        """
        Parse one or more worksheets into DataFrames.

        Returns a single DataFrame (or Series when ``squeeze=True``) for a
        scalar ``sheet_name``, or a dict of DataFrames keyed by sheet
        name/index when ``sheet_name`` is a list or None. See ``read_excel``
        for parameter documentation; extra ``**kwds`` are forwarded to
        TextParser.
        """
        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            # NOTE(review): usecols is reassigned on every sheet iteration;
            # presumably maybe_convert_usecols is idempotent — confirm.
            usecols = maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            # A one-element header list behaves like a scalar header row.
            # NOTE(review): this rebinds ``header`` and so also affects
            # subsequent sheets in a multi-sheet read.
            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        # header rows sit after the skipped leading rows
                        row += skiprows

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                # ``offset`` is the first data row after the header row(s).
                if header is None:
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                # Attach MultiIndex column names pulled out above (skip when
                # squeeze collapsed the result to a Series).
                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname
                        ].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
class ExcelWriter(metaclass=abc.ABCMeta):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use xlwt for xls, openpyxl for xlsx, odf for ods.
    See DataFrame.to_excel for typical usage.

    The writer should be used as a context manager. Otherwise, call `close()` to save
    and close any opened file handles.

    Parameters
    ----------
    path : str or typing.BinaryIO
        Path to xls or xlsx or ods file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
        argument.

        .. deprecated:: 1.2.0

            As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
            maintained, the ``xlwt`` engine will be removed in a future
            version of pandas.

    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {'w', 'a'}, default 'w'
        File mode to use (write or append). Append does not work with fsspec URLs.

        .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://".

        .. versionadded:: 1.2.0

    Attributes
    ----------
    None

    Methods
    -------
    None

    Notes
    -----
    None of the methods and properties are considered public.

    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df.to_excel(writer)

    To write to separate sheets in a single file:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df1.to_excel(writer, sheet_name='Sheet1')
    ...     df2.to_excel(writer, sheet_name='Sheet2')

    You can set the date format or datetime format:

    >>> with ExcelWriter('path_to_file.xlsx',
    ...                  date_format='YYYY-MM-DD',
    ...                  datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
    ...     df.to_excel(writer)

    You can also append to an existing Excel file:

    >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
    ...     df.to_excel(writer, sheet_name='Sheet3')

    You can store Excel file in RAM:

    >>> import io
    >>> buffer = io.BytesIO()
    >>> with pd.ExcelWriter(buffer) as writer:
    ...     df.to_excel(writer)

    You can pack Excel file into zip archive:

    >>> import zipfile
    >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf:
    ...     with zf.open('filename.xlsx', 'w') as buffer:
    ...         with pd.ExcelWriter(buffer) as writer:
    ...             df.to_excel(writer)
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``supported_extensions`` (tuple of supported extensions), used to
    #     check that engine supports the given extension.
    #   - ``engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.

    def __new__(cls, path, engine=None, **kwargs):
        # only switch class if generic(ExcelWriter)

        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    # buffers default to the xlsx writer
                    ext = "xlsx"

                try:
                    engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                    if engine == "auto":
                        engine = get_default_writer(ext)
                except KeyError as err:
                    raise ValueError(f"No engine for filetype: '{ext}'") from err

            if engine == "xlwt":
                xls_config_engine = config.get_option(
                    "io.excel.xls.writer", silent=True
                )
                # Don't warn a 2nd time if user has changed the default engine for xls
                if xls_config_engine != "xlwt":
                    warnings.warn(
                        "As the xlwt package is no longer maintained, the xlwt "
                        "engine will be removed in a future version of pandas. "
                        "This is the only engine in pandas that supports writing "
                        "in the xls format. Install openpyxl and write to an xlsx "
                        "file instead. You can set the option io.excel.xls.writer "
                        "to 'xlwt' to silence this warning. While this option is "
                        "deprecated and will also raise a warning, it can "
                        "be globally set and the warning suppressed.",
                        FutureWarning,
                        stacklevel=4,
                    )

            # dispatch to the registered engine-specific subclass
            cls = get_writer(engine)

        return object.__new__(cls)

    # declare external properties you can count on
    # NOTE(review): instance code below uses ``cur_sheet`` (set in __init__);
    # this class-level ``curr_sheet`` looks vestigial — confirm before removal.
    curr_sheet = None
    path = None

    @property
    @abc.abstractmethod
    def supported_extensions(self):
        """Extensions that writer engine supports."""
        pass

    @property
    @abc.abstractmethod
    def engine(self):
        """Name of engine."""
        pass

    @abc.abstractmethod
    def write_cells(
        self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
    ):
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            cell of formatted data to save to Excel sheet
        sheet_name : str, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        startrow : upper left cell row to dump data frame
        startcol : upper left cell column to dump data frame
        freeze_panes: int tuple of length 2
            contains the bottom-most row and right-most column to freeze
        """
        pass

    @abc.abstractmethod
    def save(self):
        """
        Save workbook to disk.
        """
        pass

    def __init__(
        self,
        path: Union[FilePathOrBuffer, "ExcelWriter"],
        engine=None,
        date_format=None,
        datetime_format=None,
        mode: str = "w",
        storage_options: StorageOptions = None,
        **engine_kwargs,
    ):
        # validate that this engine can handle the extension
        if isinstance(path, str):
            ext = os.path.splitext(path)[-1]
            self.check_extension(ext)

        # use mode to open the file
        if "b" not in mode:
            mode += "b"
        # use "a" for the user to append data to excel but internally use "r+" to let
        # the excel backend first read the existing file and then write any data to it
        mode = mode.replace("a", "r+")

        # cast ExcelWriter to avoid adding 'if self.handles is not None'
        # BUG FIX: the compression dict key must be "method" (it was misspelled
        # "copression"), matching how BaseExcelReader builds its IOHandles.
        self.handles = IOHandles(cast(Buffer, path), compression={"method": None})
        if not isinstance(path, ExcelWriter):
            self.handles = get_handle(
                path, mode, storage_options=storage_options, is_text=False
            )
        self.sheets: Dict[str, Any] = {}
        self.cur_sheet = None

        if date_format is None:
            self.date_format = "YYYY-MM-DD"
        else:
            self.date_format = date_format
        if datetime_format is None:
            self.datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self.datetime_format = datetime_format

        self.mode = mode

    def __fspath__(self):
        # os.PathLike support; falls back to "" for path-less buffers
        return getattr(self.handles.handle, "name", "")

    def _get_sheet_name(self, sheet_name):
        """Resolve ``sheet_name``, falling back to ``self.cur_sheet``."""
        if sheet_name is None:
            sheet_name = self.cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set cur_sheet property")
        return sheet_name

    def _value_with_fmt(self, val):
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
        being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime.datetime):
            fmt = self.datetime_format
        elif isinstance(val, datetime.date):
            fmt = self.date_format
        elif isinstance(val, datetime.timedelta):
            # Excel represents durations as fractional days
            val = val.total_seconds() / float(86400)
            fmt = "0"
        else:
            # fall back to the string representation for anything else
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext: str):
        """
        checks that path's extension against the Writer's supported
        extensions.  If it isn't supported, raises UnsupportedFiletypeError.
        """
        if ext.startswith("."):
            ext = ext[1:]
        # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__"
        # (not iterable)  [attr-defined]
        if not any(
            ext in extension
            for extension in cls.supported_extensions  # type: ignore[attr-defined]
        ):
            raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
        else:
            return True

    # Allow use as a contextmanager
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """synonym for save, to make it more file-like"""
        content = self.save()
        self.handles.close()
        return content
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
ZIP_SIGNATURE = b"PK\x03\x04"
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    path: Optional[str] = None,
    content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
    storage_options: StorageOptions = None,
) -> str:
    """
    Inspect the path or content of an excel file and get its format.

    At least one of path or content must be not None. If both are not None,
    content will take precedence.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    path : str, optional
        Path to file to inspect. May be a URL.
    content : file-like object, optional
        Content of file to inspect.
    {storage_options}

    Returns
    -------
    str
        Format of file.

    Raises
    ------
    ValueError
        If resulting stream is empty.
    BadZipFile
        If resulting stream does not have an XLS signature and is not a valid zipfile.
    """
    # Content takes precedence; raw bytes are wrapped so get_handle sees a
    # file-like object.
    content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
    if isinstance(content, bytes):
        content_or_path = BytesIO(content)
    else:
        content_or_path = content or path
    assert content_or_path is not None

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        head = stream.read(PEEK_SIZE)
        if head is None:
            raise ValueError("stream is empty")
        assert isinstance(head, bytes)
        # Rewind so the zip reader below sees the whole stream.
        stream.seek(0)

        # Legacy BIFF (.xls) files carry their own magic number.
        if head.startswith(XLS_SIGNATURE):
            return "xls"
        if not head.startswith(ZIP_SIGNATURE):
            raise ValueError("File is not a recognized excel file")

        # Everything else we recognize is a zip container; classify it by
        # its member names.
        # ZipFile typing is overly-strict
        # https://github.com/python/typeshed/issues/4212
        zf = zipfile.ZipFile(stream)  # type: ignore[arg-type]

        # Normalize member names: some third party files use backslashes
        # and/or upper case names.
        members = {name.replace("\\", "/").lower() for name in zf.namelist()}

        if "xl/workbook.xml" in members:
            return "xlsx"
        if "xl/workbook.bin" in members:
            return "xlsb"
        if "content.xml" in members:
            return "ods"
        return "zip"
1031 1032 .. warning:: 1033 1034 Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. 1035 This is not supported, switch to using ``openpyxl`` instead. 1036 """ 1037 1038 from pandas.io.excel._odfreader import ODFReader 1039 from pandas.io.excel._openpyxl import OpenpyxlReader 1040 from pandas.io.excel._pyxlsb import PyxlsbReader 1041 from pandas.io.excel._xlrd import XlrdReader 1042 1043 _engines: Mapping[str, Any] = { 1044 "xlrd": XlrdReader, 1045 "openpyxl": OpenpyxlReader, 1046 "odf": ODFReader, 1047 "pyxlsb": PyxlsbReader, 1048 } 1049 1050 def __init__( 1051 self, path_or_buffer, engine=None, storage_options: StorageOptions = None 1052 ): 1053 if engine is not None and engine not in self._engines: 1054 raise ValueError(f"Unknown engine: {engine}") 1055 1056 # Could be a str, ExcelFile, Book, etc. 1057 self.io = path_or_buffer 1058 # Always a string 1059 self._io = stringify_path(path_or_buffer) 1060 1061 # Determine xlrd version if installed 1062 if ( 1063 import_optional_dependency( 1064 "xlrd", raise_on_missing=False, on_version="ignore" 1065 ) 1066 is None 1067 ): 1068 xlrd_version = None 1069 else: 1070 import xlrd 1071 1072 xlrd_version = LooseVersion(get_version(xlrd)) 1073 1074 ext = None 1075 if engine is None: 1076 # Only determine ext if it is needed 1077 if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): 1078 ext = "xls" 1079 else: 1080 ext = inspect_excel_format( 1081 content=path_or_buffer, storage_options=storage_options 1082 ) 1083 1084 if ext == "ods": 1085 engine = "odf" 1086 elif ext == "xls": 1087 engine = "xlrd" 1088 else: 1089 # GH 35029 - Prefer openpyxl except for xls files 1090 if ( 1091 import_optional_dependency( 1092 "openpyxl", raise_on_missing=False, on_version="ignore" 1093 ) 1094 is not None 1095 ): 1096 engine = "openpyxl" 1097 else: 1098 engine = "xlrd" 1099 1100 if engine == "xlrd" and xlrd_version is not None: 1101 if ext is None: 1102 # Need ext to determine ext in order to raise/warn 
1103 if isinstance(path_or_buffer, xlrd.Book): 1104 ext = "xls" 1105 else: 1106 ext = inspect_excel_format( 1107 content=path_or_buffer, storage_options=storage_options 1108 ) 1109 1110 if ext != "xls" and xlrd_version >= "2": 1111 raise ValueError( 1112 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " 1113 f"only the xls format is supported. Install openpyxl instead." 1114 ) 1115 elif ext != "xls": 1116 caller = inspect.stack()[1] 1117 if ( 1118 caller.filename.endswith( 1119 os.path.join("pandas", "io", "excel", "_base.py") 1120 ) 1121 and caller.function == "read_excel" 1122 ): 1123 stacklevel = 4 1124 else: 1125 stacklevel = 2 1126 warnings.warn( 1127 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " 1128 f"only the xls format is supported. As a result, the " 1129 f"openpyxl engine will be used if it is installed and the " 1130 f"engine argument is not specified. Install " 1131 f"openpyxl instead.", 1132 FutureWarning, 1133 stacklevel=stacklevel, 1134 ) 1135 assert engine in self._engines, f"Engine {engine} not recognized" 1136 1137 self.engine = engine 1138 self.storage_options = storage_options 1139 1140 self._reader = self._engines[engine](self._io, storage_options=storage_options) 1141 1142 def __fspath__(self): 1143 return self._io 1144 1145 def parse( 1146 self, 1147 sheet_name=0, 1148 header=0, 1149 names=None, 1150 index_col=None, 1151 usecols=None, 1152 squeeze=False, 1153 converters=None, 1154 true_values=None, 1155 false_values=None, 1156 skiprows=None, 1157 nrows=None, 1158 na_values=None, 1159 parse_dates=False, 1160 date_parser=None, 1161 thousands=None, 1162 comment=None, 1163 skipfooter=0, 1164 convert_float=True, 1165 mangle_dupe_cols=True, 1166 **kwds, 1167 ): 1168 """ 1169 Parse specified sheet(s) into a DataFrame. 1170 1171 Equivalent to read_excel(ExcelFile, ...) See the read_excel 1172 docstring for more info on accepted parameters. 
1173 1174 Returns 1175 ------- 1176 DataFrame or dict of DataFrames 1177 DataFrame from the passed in Excel file. 1178 """ 1179 return self._reader.parse( 1180 sheet_name=sheet_name, 1181 header=header, 1182 names=names, 1183 index_col=index_col, 1184 usecols=usecols, 1185 squeeze=squeeze, 1186 converters=converters, 1187 true_values=true_values, 1188 false_values=false_values, 1189 skiprows=skiprows, 1190 nrows=nrows, 1191 na_values=na_values, 1192 parse_dates=parse_dates, 1193 date_parser=date_parser, 1194 thousands=thousands, 1195 comment=comment, 1196 skipfooter=skipfooter, 1197 convert_float=convert_float, 1198 mangle_dupe_cols=mangle_dupe_cols, 1199 **kwds, 1200 ) 1201 1202 @property 1203 def book(self): 1204 return self._reader.book 1205 1206 @property 1207 def sheet_names(self): 1208 return self._reader.sheet_names 1209 1210 def close(self): 1211 """close io if necessary""" 1212 self._reader.close() 1213 1214 def __enter__(self): 1215 return self 1216 1217 def __exit__(self, exc_type, exc_value, traceback): 1218 self.close() 1219 1220 def __del__(self): 1221 # Ensure we don't leak file descriptors, but put in try/except in case 1222 # attributes are already deleted 1223 try: 1224 self.close() 1225 except AttributeError: 1226 pass 1227