import abc
import datetime
from distutils.version import LooseVersion
import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import IO, Any, Dict, Mapping, Optional, Union, cast
import warnings
import zipfile

from pandas._config import config

from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
from pandas.compat._optional import get_version, import_optional_dependency
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc

from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like

from pandas.core.frame import DataFrame
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
from pandas.io.excel._util import (
    fill_mi_header,
    get_default_writer,
    get_writer,
    maybe_convert_usecols,
    pop_header_name,
)
from pandas.io.parsers import TextParser

_read_excel_doc = (
    """
Read an Excel file into a pandas DataFrame.

Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
read from a local filesystem or URL. Supports an option to read
a single sheet or a list of sheets.

Parameters
----------
io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: ``file://localhost/path/to/table.xlsx``.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method,
    such as a file handle (e.g. via builtin ``open`` function)
    or ``StringIO``.
sheet_name : str, int, list, or None, default 0
    Strings are used for sheet names. Integers are used in zero-indexed
    sheet positions. Lists of strings/integers are used to request
    multiple sheets. Specify None to get all sheets.

    Available cases:

    * Defaults to ``0``: 1st sheet as a `DataFrame`
    * ``1``: 2nd sheet as a `DataFrame`
    * ``"Sheet1"``: Load sheet with name "Sheet1"
    * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
      as a dict of `DataFrame`
    * None: All sheets.

header : int, list of int, default 0
    Row (0-indexed) to use for the column labels of the parsed
    DataFrame. If a list of integers is passed those row positions will
    be combined into a ``MultiIndex``. Use None if there is no header.
names : array-like, default None
    List of column names to use. If file contains no header row,
    then you should explicitly pass header=None.
index_col : int, list of int, default None
    Column (0-indexed) to use as the row labels of the DataFrame.
    Pass None if there is no such column.  If a list is passed,
    those columns will be combined into a ``MultiIndex``.  If a
    subset of data is selected with ``usecols``, index_col
    is based on the subset.
usecols : int, str, list-like, or callable, default None
    * If None, then parse all columns.
    * If str, then indicates comma separated list of Excel column letters
      and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
      both sides.
    * If list of int, then indicates list of column numbers to be parsed.
    * If list of string, then indicates list of column names to be parsed.

      .. versionadded:: 0.24.0

    * If callable, then evaluate each column name against it and parse the
      column if the callable returns ``True``.

    Returns a subset of the columns according to behavior above.

      .. versionadded:: 0.24.0

squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
dtype : Type name or dict of column -> type, default None
    Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
    Use `object` to preserve data as stored in Excel and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : str, default None
    If io is not a buffer or path, this must be set to identify io.
    Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
    Engine compatibility:

    - "xlrd" supports old-style Excel files (.xls).
    - "openpyxl" supports newer Excel file formats.
    - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
    - "pyxlsb" supports Binary Excel files.

    .. versionchanged:: 1.2.0
        The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
        now only supports old-style ``.xls`` files.
        When ``engine=None``, the following logic will be
        used to determine the engine:

        - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
          then `odf <https://pypi.org/project/odfpy/>`_ will be used.
        - Otherwise if ``path_or_buffer`` is an xls format,
          ``xlrd`` will be used.
        - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
          then ``openpyxl`` will be used.
        - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
        - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
          case will raise a ``ValueError`` in a future version of pandas.

converters : dict, default None
    Dict of functions for converting values in certain columns. Keys can
    either be integers or column labels, values are functions that take one
    input argument, the Excel cell content, and return the transformed
    content.
true_values : list, default None
    Values to consider as True.
false_values : list, default None
    Values to consider as False.
skiprows : list-like, int, or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int) at the
    start of the file. If callable, the callable function will be evaluated
    against the row indices, returning True if the row should be skipped and
    False otherwise. An example of a valid callable argument would be ``lambda
    x: x in [0, 2]``.
nrows : int, default None
    Number of rows to parse.
na_values : scalar, str, list-like, or dict, default None
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted
    as NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
parse_dates : bool, list-like, or dict, default False
    The behavior is as follows:

    * bool. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index contains an unparseable date, the entire column or
    index will be returned unaltered as an object data type. If you don't want to
    parse some cells as date, just change their type in Excel to "Text".
    For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.

    Note: A fast-path exists for iso8601-formatted dates.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
thousands : str, default None
    Thousands separator for parsing string columns to numeric.  Note that
    this parameter is only necessary for columns stored as TEXT in Excel,
    any numeric columns will automatically be parsed, regardless of display
    format.
comment : str, default None
    Comments out remainder of line. Pass a character or characters to this
    argument to indicate comments in the input file. Any data between the
    comment string and the end of the current line is ignored.
skipfooter : int, default 0
    Rows at the end to skip (0-indexed).
convert_float : bool, default True
    Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
    data will be read in as floats: Excel stores all numbers as floats
    internally.
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
storage_options : dict, optional
    Extra options that make sense for a particular storage connection, e.g.
    host, port, username, password, etc., if using a URL that will
    be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
    will be raised if providing this argument with a local path or
    a file-like buffer. See the fsspec and backend storage implementation
    docs for the set of allowed keys and values.

    .. versionadded:: 1.2.0

Returns
-------
DataFrame or dict of DataFrames
    DataFrame from the passed in Excel file. See notes in sheet_name
    argument for more information on when a dict of DataFrames is returned.

See Also
--------
DataFrame.to_excel : Write DataFrame to an Excel file.
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
The file can be read using the file name as string or an open file object:

>>> pd.read_excel('tmp.xlsx', index_col=0)  # doctest: +SKIP
       Name  Value
0   string1      1
1   string2      2
2  #Comment      3

>>> pd.read_excel(open('tmp.xlsx', 'rb'),
...               sheet_name='Sheet3')  # doctest: +SKIP
   Unnamed: 0      Name  Value
0           0   string1      1
1           1   string2      2
2           2  #Comment      3

Index and header can be specified via the `index_col` and `header` arguments

>>> pd.read_excel('tmp.xlsx', index_col=None, header=None)  # doctest: +SKIP
     0         1      2
0  NaN      Name  Value
1  0.0   string1      1
2  1.0   string2      2
3  2.0  #Comment      3

Column types are inferred but can be explicitly specified

>>> pd.read_excel('tmp.xlsx', index_col=0,
...               dtype={'Name': str, 'Value': float})  # doctest: +SKIP
       Name  Value
0   string1    1.0
1   string2    2.0
2  #Comment    3.0

True, False, and NA values, and thousands separators have defaults,
but can be explicitly specified, too. Supply the values you would like
as strings or lists of strings!

>>> pd.read_excel('tmp.xlsx', index_col=0,
...               na_values=['string1', 'string2'])  # doctest: +SKIP
       Name  Value
0       NaN      1
1       NaN      2
2  #Comment      3

Comment lines in the excel input file can be skipped using the `comment` kwarg

>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#')  # doctest: +SKIP
      Name  Value
0  string1    1.0
1  string2    2.0
2     None    NaN
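
The parsing engine can also be selected explicitly via the `engine` keyword
(the file name here is a placeholder):

>>> pd.read_excel('tmp.xlsx', index_col=0, engine='openpyxl')  # doctest: +SKIP
       Name  Value
0   string1      1
1   string2      2
2  #Comment      3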
298"""
299)
300
301
302@deprecate_nonkeyword_arguments(allowed_args=2, version="2.0")
303@Appender(_read_excel_doc)
304def read_excel(
305    io,
306    sheet_name=0,
307    header=0,
308    names=None,
309    index_col=None,
310    usecols=None,
311    squeeze=False,
312    dtype=None,
313    engine=None,
314    converters=None,
315    true_values=None,
316    false_values=None,
317    skiprows=None,
318    nrows=None,
319    na_values=None,
320    keep_default_na=True,
321    na_filter=True,
322    verbose=False,
323    parse_dates=False,
324    date_parser=None,
325    thousands=None,
326    comment=None,
327    skipfooter=0,
328    convert_float=True,
329    mangle_dupe_cols=True,
330    storage_options: StorageOptions = None,
331):
332
333    should_close = False
334    if not isinstance(io, ExcelFile):
335        should_close = True
336        io = ExcelFile(io, storage_options=storage_options, engine=engine)
337    elif engine and engine != io.engine:
338        raise ValueError(
339            "Engine should not be specified when passing "
340            "an ExcelFile - ExcelFile already has the engine set"
341        )
342
343    try:
344        data = io.parse(
345            sheet_name=sheet_name,
346            header=header,
347            names=names,
348            index_col=index_col,
349            usecols=usecols,
350            squeeze=squeeze,
351            dtype=dtype,
352            converters=converters,
353            true_values=true_values,
354            false_values=false_values,
355            skiprows=skiprows,
356            nrows=nrows,
357            na_values=na_values,
358            keep_default_na=keep_default_na,
359            na_filter=na_filter,
360            verbose=verbose,
361            parse_dates=parse_dates,
362            date_parser=date_parser,
363            thousands=thousands,
364            comment=comment,
365            skipfooter=skipfooter,
366            convert_float=convert_float,
367            mangle_dupe_cols=mangle_dupe_cols,
368        )
369    finally:
370        # make sure to close opened file handles
371        if should_close:
372            io.close()
373    return data
374
375
376class BaseExcelReader(metaclass=abc.ABCMeta):
377    def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
378        self.handles = IOHandles(
379            handle=filepath_or_buffer, compression={"method": None}
380        )
381        if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
382            self.handles = get_handle(
383                filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
384            )
385
386        if isinstance(self.handles.handle, self._workbook_class):
387            self.book = self.handles.handle
388        elif hasattr(self.handles.handle, "read"):
389            # N.B. xlrd.Book has a read attribute too
390            self.handles.handle.seek(0)
391            try:
392                self.book = self.load_workbook(self.handles.handle)
393            except Exception:
394                self.close()
395                raise
396        elif isinstance(self.handles.handle, bytes):
397            self.book = self.load_workbook(BytesIO(self.handles.handle))
398        else:
399            raise ValueError(
400                "Must explicitly set engine if not passing in buffer or path for io."
401            )
402
403    @property
404    @abc.abstractmethod
405    def _workbook_class(self):
406        pass
407
408    @abc.abstractmethod
409    def load_workbook(self, filepath_or_buffer):
410        pass
411
412    def close(self):
413        if hasattr(self, "book") and hasattr(self.book, "close"):
414            # pyxlsb: opens a TemporaryFile
415            # openpyxl: https://stackoverflow.com/questions/31416842/
416            #     openpyxl-does-not-close-excel-workbook-in-read-only-mode
417            self.book.close()
418        self.handles.close()
419
420    @property
421    @abc.abstractmethod
422    def sheet_names(self):
423        pass
424
425    @abc.abstractmethod
426    def get_sheet_by_name(self, name):
427        pass
428
429    @abc.abstractmethod
430    def get_sheet_by_index(self, index):
431        pass
432
433    @abc.abstractmethod
434    def get_sheet_data(self, sheet, convert_float):
435        pass
436
437    def raise_if_bad_sheet_by_index(self, index: int) -> None:
438        n_sheets = len(self.sheet_names)
439        if index >= n_sheets:
440            raise ValueError(
441                f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
442            )
443
444    def raise_if_bad_sheet_by_name(self, name: str) -> None:
445        if name not in self.sheet_names:
446            raise ValueError(f"Worksheet named '{name}' not found")
447
448    def parse(
449        self,
450        sheet_name=0,
451        header=0,
452        names=None,
453        index_col=None,
454        usecols=None,
455        squeeze=False,
456        dtype=None,
457        true_values=None,
458        false_values=None,
459        skiprows=None,
460        nrows=None,
461        na_values=None,
462        verbose=False,
463        parse_dates=False,
464        date_parser=None,
465        thousands=None,
466        comment=None,
467        skipfooter=0,
468        convert_float=True,
469        mangle_dupe_cols=True,
470        **kwds,
471    ):
472
473        validate_header_arg(header)
474
475        ret_dict = False
476
477        # Keep sheetname to maintain backwards compatibility.
478        if isinstance(sheet_name, list):
479            sheets = sheet_name
480            ret_dict = True
481        elif sheet_name is None:
482            sheets = self.sheet_names
483            ret_dict = True
484        else:
485            sheets = [sheet_name]
486
487        # handle same-type duplicates.
488        sheets = list(dict.fromkeys(sheets).keys())
489
490        output = {}
491
492        for asheetname in sheets:
493            if verbose:
494                print(f"Reading sheet {asheetname}")
495
496            if isinstance(asheetname, str):
497                sheet = self.get_sheet_by_name(asheetname)
498            else:  # assume an integer if not a string
499                sheet = self.get_sheet_by_index(asheetname)
500
501            data = self.get_sheet_data(sheet, convert_float)
502            usecols = maybe_convert_usecols(usecols)
503
504            if not data:
505                output[asheetname] = DataFrame()
506                continue
507
508            if is_list_like(header) and len(header) == 1:
509                header = header[0]
510
511            # forward fill and pull out names for MultiIndex column
512            header_names = None
513            if header is not None and is_list_like(header):
514                header_names = []
515                control_row = [True] * len(data[0])
516
517                for row in header:
518                    if is_integer(skiprows):
519                        row += skiprows
520
521                    data[row], control_row = fill_mi_header(data[row], control_row)
522
523                    if index_col is not None:
524                        header_name, _ = pop_header_name(data[row], index_col)
525                        header_names.append(header_name)
526
527            if is_list_like(index_col):
528                # Forward fill values for MultiIndex index.
529                if header is None:
530                    offset = 0
531                elif not is_list_like(header):
532                    offset = 1 + header
533                else:
534                    offset = 1 + max(header)
535
536                # Check if we have an empty dataset
537                # before trying to collect data.
538                if offset < len(data):
539                    for col in index_col:
540                        last = data[offset][col]
541
542                        for row in range(offset + 1, len(data)):
543                            if data[row][col] == "" or data[row][col] is None:
544                                data[row][col] = last
545                            else:
546                                last = data[row][col]
547
548            has_index_names = is_list_like(header) and len(header) > 1
549
            # GH 12292 : error when reading one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname
                        ].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]


class ExcelWriter(metaclass=abc.ABCMeta):
    """
    Class for writing DataFrame objects into excel sheets.

    Default is to use xlwt for xls, openpyxl for xlsx, odf for ods.
    See DataFrame.to_excel for typical usage.

    The writer should be used as a context manager. Otherwise, call `close()` to save
    and close any opened file handles.

    Parameters
    ----------
    path : str or typing.BinaryIO
        Path to xls or xlsx or ods file.
    engine : str (optional)
        Engine to use for writing. If None, defaults to
        ``io.excel.<extension>.writer``.  NOTE: can only be passed as a keyword
        argument.

        .. deprecated:: 1.2.0

            As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
            maintained, the ``xlwt`` engine will be removed in a future
            version of pandas.

    date_format : str, default None
        Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
    datetime_format : str, default None
        Format string for datetime objects written into Excel files.
        (e.g. 'YYYY-MM-DD HH:MM:SS').
    mode : {'w', 'a'}, default 'w'
        File mode to use (write or append). Append does not work with fsspec URLs.

        .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://".

        .. versionadded:: 1.2.0

    Attributes
    ----------
    None

    Methods
    -------
    None

    Notes
    -----
    None of the methods and properties are considered public.

    For compatibility with CSV writers, ExcelWriter serializes lists
    and dicts to strings before writing.

    Examples
    --------
    Default usage:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df.to_excel(writer)

    To write to separate sheets in a single file:

    >>> with ExcelWriter('path_to_file.xlsx') as writer:
    ...     df1.to_excel(writer, sheet_name='Sheet1')
    ...     df2.to_excel(writer, sheet_name='Sheet2')

    You can set the date format or datetime format:

    >>> with ExcelWriter('path_to_file.xlsx',
    ...                   date_format='YYYY-MM-DD',
    ...                   datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
    ...     df.to_excel(writer)

    You can also append to an existing Excel file:

    >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
    ...     df.to_excel(writer, sheet_name='Sheet3')

    You can store the Excel file in RAM:

    >>> import io
    >>> buffer = io.BytesIO()
    >>> with pd.ExcelWriter(buffer) as writer:
    ...     df.to_excel(writer)

    You can pack the Excel file into a zip archive:

    >>> import zipfile
    >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf:
    ...     with zf.open('filename.xlsx', 'w') as buffer:
    ...         with pd.ExcelWriter(buffer) as writer:
    ...             df.to_excel(writer)
    """

    # Defining an ExcelWriter implementation (see abstract methods for more...)

    # - Mandatory
    #   - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
    #     --> called to write additional DataFrames to disk
    #   - ``supported_extensions`` (tuple of supported extensions), used to
    #      check that engine supports the given extension.
    #   - ``engine`` - string that gives the engine name. Necessary to
    #     instantiate class directly and bypass ``ExcelWriterMeta`` engine
    #     lookup.
    #   - ``save(self)`` --> called to save file to disk
    # - Mostly mandatory (i.e. should at least exist)
    #   - book, cur_sheet, path

    # - Optional:
    #   - ``__init__(self, path, engine=None, **kwargs)`` --> always called
    #     with path as first argument.

    # You also need to register the class with ``register_writer()``.
    # Technically, ExcelWriter implementations don't need to subclass
    # ExcelWriter.
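
    # For illustration only, a hypothetical minimal engine could look roughly
    # like the following (the class and engine names here are made up; the
    # real engines live in pandas.io.excel._openpyxl and friends):
    #
    #     from pandas.io.excel._util import register_writer
    #
    #     class MyWriter(ExcelWriter):
    #         engine = "mywriter"
    #         supported_extensions = (".xlsx",)
    #
    #         def save(self):
    #             ...  # persist self.book to self.handles.handle
    #
    #         def write_cells(
    #             self, cells, sheet_name=None, startrow=0, startcol=0,
    #             freeze_panes=None,
    #         ):
    #             ...  # write each ExcelCell yielded by ``cells`` to the sheet
    #
    #     register_writer(MyWriter)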
    def __new__(cls, path, engine=None, **kwargs):
        # only switch class if generic(ExcelWriter)

        if cls is ExcelWriter:
            if engine is None or (isinstance(engine, str) and engine == "auto"):
                if isinstance(path, str):
                    ext = os.path.splitext(path)[-1][1:]
                else:
                    ext = "xlsx"

                try:
                    engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                    if engine == "auto":
                        engine = get_default_writer(ext)
                except KeyError as err:
                    raise ValueError(f"No engine for filetype: '{ext}'") from err

            if engine == "xlwt":
                xls_config_engine = config.get_option(
                    "io.excel.xls.writer", silent=True
                )
                # Don't warn a 2nd time if user has changed the default engine for xls
                if xls_config_engine != "xlwt":
                    warnings.warn(
                        "As the xlwt package is no longer maintained, the xlwt "
                        "engine will be removed in a future version of pandas. "
                        "This is the only engine in pandas that supports writing "
                        "in the xls format. Install openpyxl and write to an xlsx "
                        "file instead. You can set the option io.excel.xls.writer "
                        "to 'xlwt' to silence this warning. While this option is "
                        "deprecated and will also raise a warning, it can "
                        "be globally set and the warning suppressed.",
                        FutureWarning,
                        stacklevel=4,
                    )

            cls = get_writer(engine)

        return object.__new__(cls)

    # declare external properties you can count on
    cur_sheet = None
    path = None

    @property
    @abc.abstractmethod
    def supported_extensions(self):
        """Extensions that writer engine supports."""
        pass

    @property
    @abc.abstractmethod
    def engine(self):
        """Name of engine."""
        pass

    @abc.abstractmethod
    def write_cells(
        self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
    ):
        """
        Write given formatted cells into an Excel sheet.

        Parameters
        ----------
        cells : generator
            Cells of formatted data to save to the Excel sheet.
        sheet_name : str, default None
            Name of Excel sheet; if None, use self.cur_sheet.
        startrow : int, default 0
            Upper left cell row to dump data frame.
        startcol : int, default 0
            Upper left cell column to dump data frame.
        freeze_panes : int tuple of length 2
            Contains the bottom-most row and right-most column to freeze.
784        """
785        pass
786
787    @abc.abstractmethod
788    def save(self):
789        """
790        Save workbook to disk.
791        """
792        pass
793
794    def __init__(
795        self,
796        path: Union[FilePathOrBuffer, "ExcelWriter"],
797        engine=None,
798        date_format=None,
799        datetime_format=None,
800        mode: str = "w",
801        storage_options: StorageOptions = None,
802        **engine_kwargs,
803    ):
804        # validate that this engine can handle the extension
805        if isinstance(path, str):
806            ext = os.path.splitext(path)[-1]
807            self.check_extension(ext)
808
809        # use mode to open the file
810        if "b" not in mode:
811            mode += "b"
812        # use "a" for the user to append data to excel but internally use "r+" to let
813        # the excel backend first read the existing file and then write any data to it
814        mode = mode.replace("a", "r+")
815
816        # cast ExcelWriter to avoid adding 'if self.handles is not None'
        self.handles = IOHandles(cast(Buffer, path), compression={"method": None})
        if not isinstance(path, ExcelWriter):
            self.handles = get_handle(
                path, mode, storage_options=storage_options, is_text=False
            )
        self.sheets: Dict[str, Any] = {}
        self.cur_sheet = None

        if date_format is None:
            self.date_format = "YYYY-MM-DD"
        else:
            self.date_format = date_format
        if datetime_format is None:
            self.datetime_format = "YYYY-MM-DD HH:MM:SS"
        else:
            self.datetime_format = datetime_format

        self.mode = mode

    def __fspath__(self):
        return getattr(self.handles.handle, "name", "")

    def _get_sheet_name(self, sheet_name):
        if sheet_name is None:
            sheet_name = self.cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise ValueError("Must pass explicit sheet_name or set cur_sheet property")
        return sheet_name

    def _value_with_fmt(self, val):
        """
        Convert numpy types to Python types for the Excel writers.

        Parameters
        ----------
        val : object
            Value to be written into cells

        Returns
        -------
        Tuple with the first element being the converted value and the second
            being an optional format
        """
        fmt = None

        if is_integer(val):
            val = int(val)
        elif is_float(val):
            val = float(val)
        elif is_bool(val):
            val = bool(val)
        elif isinstance(val, datetime.datetime):
            fmt = self.datetime_format
        elif isinstance(val, datetime.date):
            fmt = self.date_format
        elif isinstance(val, datetime.timedelta):
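            # Excel has no native timedelta type, so write the duration as a
            # fraction of a day (86400 seconds), matching Excel's numeric
            # representation of times.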
            val = val.total_seconds() / float(86400)
            fmt = "0"
        else:
            val = str(val)

        return val, fmt

    @classmethod
    def check_extension(cls, ext: str):
        """
        checks that the path's extension is among the writer's supported
        extensions. If it isn't, a ValueError is raised.
885        """
886        if ext.startswith("."):
887            ext = ext[1:]
888        # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__"
889        #  (not iterable)  [attr-defined]
890        if not any(
891            ext in extension
892            for extension in cls.supported_extensions  # type: ignore[attr-defined]
893        ):
894            raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
895        else:
896            return True
897
898    # Allow use as a contextmanager
899    def __enter__(self):
900        return self
901
902    def __exit__(self, exc_type, exc_value, traceback):
903        self.close()
904
905    def close(self):
906        """synonym for save, to make it more file-like"""
907        content = self.save()
908        self.handles.close()
909        return content
910
911
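# Magic bytes used by inspect_excel_format below: old-style .xls files are
# OLE2 compound documents, while xlsx/xlsm/xlsb and ods files are ZIP archives.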
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
ZIP_SIGNATURE = b"PK\x03\x04"
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    path: Optional[str] = None,
    content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
    storage_options: StorageOptions = None,
) -> str:
    """
    Inspect the path or content of an excel file and get its format.

    At least one of path or content must be not None. If both are not None,
    content will take precedence.

    Adapted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    path : str, optional
        Path to file to inspect. May be a URL.
    content : file-like object, optional
        Content of file to inspect.
    {storage_options}

    Returns
    -------
    str
        Format of file.

    Raises
    ------
    ValueError
        If resulting stream is empty or the file is not a recognized excel file.
    BadZipFile
        If resulting stream has a ZIP signature but is not a valid zipfile.
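
    Examples
    --------
    With a local xlsx file (the path shown is a placeholder):

    >>> inspect_excel_format("spreadsheet.xlsx")  # doctest: +SKIP
    'xlsx'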
950    """
951    content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
952    if isinstance(content, bytes):
953        content_or_path = BytesIO(content)
954    else:
955        content_or_path = content or path
956    assert content_or_path is not None
957
958    with get_handle(
959        content_or_path, "rb", storage_options=storage_options, is_text=False
960    ) as handle:
961        stream = handle.handle
962        stream.seek(0)
963        buf = stream.read(PEEK_SIZE)
964        if buf is None:
965            raise ValueError("stream is empty")
966        else:
967            assert isinstance(buf, bytes)
968            peek = buf
969        stream.seek(0)
970
971        if peek.startswith(XLS_SIGNATURE):
972            return "xls"
973        elif not peek.startswith(ZIP_SIGNATURE):
974            raise ValueError("File is not a recognized excel file")
975
976        # ZipFile typing is overly-strict
977        # https://github.com/python/typeshed/issues/4212
978        zf = zipfile.ZipFile(stream)  # type: ignore[arg-type]
979
980        # Workaround for some third party files that use forward slashes and
981        # lower case names.
982        component_names = [name.replace("\\", "/").lower() for name in zf.namelist()]
983
984        if "xl/workbook.xml" in component_names:
985            return "xlsx"
986        if "xl/workbook.bin" in component_names:
987            return "xlsb"
988        if "content.xml" in component_names:
989            return "ods"
990        return "zip"
991
992
993class ExcelFile:
994    """
995    Class for parsing tabular excel sheets into DataFrame objects.
996
997    See read_excel for more documentation.
998
999    Parameters
1000    ----------
1001    path_or_buffer : str, path object (pathlib.Path or py._path.local.LocalPath),
        a file-like object, xlrd workbook or openpyxl workbook.
        If a string or path object, expected to be a path to a
        .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
        Engine compatibility:

        - ``xlrd`` supports old-style Excel files (.xls).
        - ``openpyxl`` supports newer Excel file formats.
        - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
        - ``pyxlsb`` supports Binary Excel files.

        .. versionchanged:: 1.2.0

           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
           now only supports old-style ``.xls`` files.
           When ``engine=None``, the following logic will be
           used to determine the engine:

           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
           - Otherwise if ``path_or_buffer`` is an xls format,
             ``xlrd`` will be used.
           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
             then ``openpyxl`` will be used.
           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
           - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
             This case will raise a ``ValueError`` in a future version of pandas.

           .. warning::

            Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
            This is not supported; switch to using ``openpyxl`` instead.
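
    Examples
    --------
    Basic usage, with a placeholder file name:

    >>> with pd.ExcelFile("myfile.xlsx") as xls:  # doctest: +SKIP
    ...     df = pd.read_excel(xls, sheet_name="Sheet1")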
1036    """
1037
1038    from pandas.io.excel._odfreader import ODFReader
1039    from pandas.io.excel._openpyxl import OpenpyxlReader
1040    from pandas.io.excel._pyxlsb import PyxlsbReader
1041    from pandas.io.excel._xlrd import XlrdReader
1042
1043    _engines: Mapping[str, Any] = {
1044        "xlrd": XlrdReader,
1045        "openpyxl": OpenpyxlReader,
1046        "odf": ODFReader,
1047        "pyxlsb": PyxlsbReader,
1048    }
1049
1050    def __init__(
1051        self, path_or_buffer, engine=None, storage_options: StorageOptions = None
1052    ):
1053        if engine is not None and engine not in self._engines:
1054            raise ValueError(f"Unknown engine: {engine}")
1055
        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # A string if path_or_buffer was path-like, otherwise the original object
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if (
            import_optional_dependency(
                "xlrd", raise_on_missing=False, on_version="ignore"
            )
            is None
        ):
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = LooseVersion(get_version(xlrd))

        ext = None
        if engine is None:
            # Only determine ext if it is needed
            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(
                    content=path_or_buffer, storage_options=storage_options
                )

            if ext == "ods":
                engine = "odf"
            elif ext == "xls":
                engine = "xlrd"
            else:
                # GH 35029 - Prefer openpyxl except for xls files
                if (
                    import_optional_dependency(
                        "openpyxl", raise_on_missing=False, on_version="ignore"
                    )
                    is not None
                ):
                    engine = "openpyxl"
                else:
                    engine = "xlrd"

        if engine == "xlrd" and xlrd_version is not None:
            if ext is None:
                # Need ext to determine whether to raise/warn
                if isinstance(path_or_buffer, xlrd.Book):
                    ext = "xls"
                else:
                    ext = inspect_excel_format(
                        content=path_or_buffer, storage_options=storage_options
                    )

            if ext != "xls" and xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            elif ext != "xls":
                caller = inspect.stack()[1]
                if (
                    caller.filename.endswith(
                        os.path.join("pandas", "io", "excel", "_base.py")
                    )
                    and caller.function == "read_excel"
                ):
                    stacklevel = 4
                else:
                    stacklevel = 2
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. As a result, the "
                    f"openpyxl engine will be used if it is installed and the "
                    f"engine argument is not specified. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
        assert engine in self._engines, f"Engine {engine} not recognized"

        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io, storage_options=storage_options)

    def __fspath__(self):
        return self._io

    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        converters=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):
        """
        Parse specified sheet(s) into a DataFrame.

        Equivalent to read_excel(ExcelFile, ...).  See the read_excel
        docstring for more info on accepted parameters.

        Returns
        -------
        DataFrame or dict of DataFrames
            DataFrame from the passed in Excel file.
        """
        return self._reader.parse(
            sheet_name=sheet_name,
            header=header,
            names=names,
            index_col=index_col,
            usecols=usecols,
            squeeze=squeeze,
            converters=converters,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            nrows=nrows,
            na_values=na_values,
            parse_dates=parse_dates,
            date_parser=date_parser,
            thousands=thousands,
            comment=comment,
            skipfooter=skipfooter,
            convert_float=convert_float,
            mangle_dupe_cols=mangle_dupe_cols,
            **kwds,
        )

    @property
    def book(self):
        return self._reader.book

    @property
    def sheet_names(self):
        return self._reader.sheet_names

    def close(self):
        """close io if necessary"""
        self._reader.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        # Ensure we don't leak file descriptors, but put in try/except in case
        # attributes are already deleted
        try:
            self.close()
        except AttributeError:
            pass
